## Loading the dataset

In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, log_loss, confusion_matrix, make_scorer
from sklearn.svm import SVC
import joblib

In [3]:
# Load dataset
df = pd.read_csv("Symptom2DiseaseNew.csv", encoding="latin1")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Preview first few rows
print(df.head())

                                                text                  label
0  Suddenly, in the middle of a lecture, I suffer...           Hypertension
1  Because of the vomiting and diarrhea, I've had...                Typhoid
2  I suffer from constipation, and it's difficult...  Dimorphic Hemorrhoids
3  My legs cause me a lot of discomfort when I ex...         Varicose Veins
4  The stomach pains have been intense and freque...                Typhoid


## Check for Missing Data & Data Types

In [4]:
print(df.info())       # Check data types
print(df.isnull().sum())  # Check missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7200 entries, 0 to 7199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7200 non-null   object
 1   label   7200 non-null   object
dtypes: object(2)
memory usage: 112.6+ KB
None
text     0
label    0
dtype: int64


## Encode the Labels
Machine learning models work with numeric labels.

In [5]:
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Mapping for later use
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

{'Acne': np.int64(0), 'Arthritis': np.int64(1), 'Bronchial Asthma': np.int64(2), 'Cervical spondylosis': np.int64(3), 'Chicken pox': np.int64(4), 'Common Cold': np.int64(5), 'Dengue': np.int64(6), 'Dimorphic Hemorrhoids': np.int64(7), 'Fungal infection': np.int64(8), 'Hypertension': np.int64(9), 'Impetigo': np.int64(10), 'Jaundice': np.int64(11), 'Malaria': np.int64(12), 'Migraine': np.int64(13), 'Pneumonia': np.int64(14), 'Psoriasis': np.int64(15), 'Typhoid': np.int64(16), 'Varicose Veins': np.int64(17), 'allergy': np.int64(18), 'diabetes': np.int64(19), 'drug reaction': np.int64(20), 'gastroesophageal reflux disease': np.int64(21), 'peptic ulcer disease': np.int64(22), 'urinary tract infection': np.int64(23)}


## Split Data into Train/Test Sets

In [6]:
X = df['text']  # Input (complaints)
y = df['label_encoded']  # Output (encoded labels)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Convert Text to Numerical Vectors (TF-IDF)

We’ll use TF-IDF for converting text into features.

In [7]:
vectorizer = TfidfVectorizer(
    stop_words = 'english',  # remove common words
    max_features = 5000      # limit features for efficiency
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

## Train an SVC Classification Model

In [8]:
svc_model = SVC(C=10, kernel = 'rbf', gamma = 'scale', probability=True, random_state=42)
svc_model.fit(X_train_tfidf, y_train)

y_pred_svm = svc_model.predict(X_test_tfidf)
y_pred_proba_svm = svc_model.predict_proba(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Test set log loss:", log_loss(y_test, y_pred_proba_svm))

Accuracy: 0.9986111111111111
Test set log loss: 0.043543808695563374


In [9]:
print(confusion_matrix(y_test, y_pred_svm))

[[60  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 60  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 60  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 60  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 60  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 60  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 60  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 60  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 60  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 59  0  0  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 60  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 60  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 60  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0

In [10]:
print("Classification Report:\n", classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))

Classification Report:
                                  precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00        60
                      Arthritis       1.00      1.00      1.00        60
               Bronchial Asthma       1.00      1.00      1.00        60
           Cervical spondylosis       1.00      1.00      1.00        60
                    Chicken pox       1.00      1.00      1.00        60
                    Common Cold       1.00      1.00      1.00        60
                         Dengue       0.98      1.00      0.99        60
          Dimorphic Hemorrhoids       1.00      1.00      1.00        60
               Fungal infection       1.00      1.00      1.00        60
                   Hypertension       1.00      0.98      0.99        60
                       Impetigo       1.00      1.00      1.00        60
                       Jaundice       1.00      1.00      1.00        60
                        Ma

## SAVE OBJECTS FOR DEPLOYMENT

In [18]:
joblib.dump(svc_model, "svc_model.pkl")            # Model
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")    # Vectorizer
joblib.dump(label_encoder, "label_encoder.pkl")    # Label Encoder
joblib.dump(label_mapping, "label_mapping.pkl")    # Label mapping dictionary

['label_mapping.pkl']