**Loading the dataset using Kaggle API**

In [1]:
!pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d itachi9604/disease-symptom-description-dataset

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading disease-symptom-description-dataset.zip to /content
  0% 0.00/30.1k [00:00<?, ?B/s]
100% 30.1k/30.1k [00:00<00:00, 7.37MB/s]


**Unzipping the dataset**

In [2]:
!unzip disease-symptom-description-dataset.zip

Archive:  disease-symptom-description-dataset.zip
  inflating: Symptom-severity.csv    
  inflating: dataset.csv             
  inflating: symptom_Description.csv  
  inflating: symptom_precaution.csv  


**Loading the dataset**

In [51]:
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import  metrics   #Additional scklearn functions
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection 
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve
import lightgbm as lgb
from imblearn.under_sampling import RandomUnderSampler
import plotly.express as px
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline

**Reading the dataset**

In [4]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


**REMOVING '_' FROM DATASET**

In [5]:
for col in df.columns:
    df[col] = df[col].str.replace('_', ' ')
    
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin rash,dischromic patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,


**Stripping the values**

In [6]:
for col in df.columns:
  df[col] = df[col].str.strip()

**Checking for null values**

In [7]:
df.isnull().sum()

Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64

**filling null values with 0**

In [8]:
df = df.fillna(0)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,itching,skin rash,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,itching,skin rash,nodal skin eruptions,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**Exploring the weights associated with symptons severity**

In [9]:
df_severity = pd.read_csv('Symptom-severity.csv')
df_severity['Symptom'] = df_severity['Symptom'].str.replace('_',' ')
df_severity.head(10)

Unnamed: 0,Symptom,weight
0,itching,1
1,skin rash,3
2,nodal skin eruptions,4
3,continuous sneezing,4
4,shivering,5
5,chills,3
6,joint pain,3
7,stomach pain,5
8,acidity,3
9,ulcers on tongue,4


In [10]:
df_severity['Symptom'].unique()

array(['itching', 'skin rash', 'nodal skin eruptions',
       'continuous sneezing', 'shivering', 'chills', 'joint pain',
       'stomach pain', 'acidity', 'ulcers on tongue', 'muscle wasting',
       'vomiting', 'burning micturition', 'spotting urination', 'fatigue',
       'weight gain', 'anxiety', 'cold hands and feets', 'mood swings',
       'weight loss', 'restlessness', 'lethargy', 'patches in throat',
       'irregular sugar level', 'cough', 'high fever', 'sunken eyes',
       'breathlessness', 'sweating', 'dehydration', 'indigestion',
       'headache', 'yellowish skin', 'dark urine', 'nausea',
       'loss of appetite', 'pain behind the eyes', 'back pain',
       'constipation', 'abdominal pain', 'diarrhoea', 'mild fever',
       'yellow urine', 'yellowing of eyes', 'acute liver failure',
       'fluid overload', 'swelling of stomach', 'swelled lymph nodes',
       'malaise', 'blurred and distorted vision', 'phlegm',
       'throat irritation', 'redness of eyes', 'sinus pressu

**Preprocessing the data and putting the weights associated with each datset in the dataset frame**

In [11]:
vals = df.values
symptoms = df_severity['Symptom'].unique()

for i in range(len(symptoms)):
    vals[vals == symptoms[i]] = df_severity[df_severity['Symptom'] == symptoms[i]]['weight'].values[0]

In [12]:
vals

array([['Fungal infection', 1, 3, ..., 0, 0, 0],
       ['Fungal infection', 3, 4, ..., 0, 0, 0],
       ['Fungal infection', 1, 4, ..., 0, 0, 0],
       ...,
       ['Urinary tract infection', 6, 4, ..., 0, 0, 0],
       ['Psoriasis', 3, 3, ..., 0, 0, 0],
       ['Impetigo', 3, 7, ..., 0, 0, 0]], dtype=object)

In [13]:
df_processed = pd.DataFrame(vals,columns=df.columns)
df_processed.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,1,3,4,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,3,4,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,4,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,3,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**Checking for text which didnt conver to numbers**

In [14]:
print(df['Symptom_3'].unique())
print(df['Symptom_4'].unique())
print(df['Symptom_5'].unique())


[4 'dischromic  patches' 3 5 6 7 2 'foul smell of urine']
['dischromic  patches' 0 4 5 6 'spotting  urination' 2 3 7]
[0 4 7 'spotting  urination' 5 3 6 2]


**Assigning them 0 weights**

In [15]:
df_processed = df_processed.replace('dischromic  patches', 0)
df_processed = df_processed.replace('spotting  urination', 0)
df_processed = df_processed.replace('foul smell of urine', 0)

**Print all the unique weights associated with each Symptom**

In [25]:
for i in df_processed.columns[1:]:
  print(i,sep="\t",end=" ")
  print(df_processed[i].unique())

Symptom_1 [1 3 4 5 6 7 2]
Symptom_2 [3 4 5 6 7 2 0]
Symptom_3 [4 0 3 5 6 7 2]
Symptom_4 [0 4 5 6 2 3 7]
Symptom_5 [0 4 7 5 3 6 2]
Symptom_6 [0 7 4 5 3 6 2]
Symptom_7 [0 4 3 2 6 5]
Symptom_8 [0 4 5 2 3 6 7]
Symptom_9 [0 5 4 3 6 7 2]
Symptom_10 [0 4 6 3 5 2]
Symptom_11 [0 3 6 4 2 5 7]
Symptom_12 [0 2 6 7 4 5]
Symptom_13 [0 3 2 6 5]
Symptom_14 [0 3 5 7]
Symptom_15 [0 7 5 3]
Symptom_16 [0 5 3 2]
Symptom_17 [0 2]


**Total number of samples in dataframe**

In [26]:
print("Total number of samples : ",df_processed.shape[0])

Total number of samples :  4920


**Number of null values**

In [27]:
print("Total number of null samples : ",df_processed.isna().sum().sum())

Total number of null samples :  0


**Show coorelation**

In [28]:
fig = px.imshow(df_processed.corr(),text_auto=True)
fig.show()

**Using stratified cross validation for model training to avoid chances of overfitting**

In [29]:
class config:
    NUM_FOLDS = 10
    SEED = 541
    TARGET = 'Disease'
    JUNK = -1

def create_folds(data):    
    data["kfold"] = -1

    kf = model_selection.StratifiedKFold(n_splits=config.NUM_FOLDS, shuffle=True, random_state=config.SEED)
    for f, (t_, v_) in enumerate(kf.split(X=data, y=data[config.TARGET].values)):
        data.loc[v_, 'kfold'] = f    
    
    return data

In [30]:
def retrieve_data(train_folds,fold):


  train_data = train_folds[train_folds['kfold'] != fold].reset_index(drop=True)
  valid_data = train_folds[train_folds['kfold'] == fold].reset_index(drop=True)

  x_train = train_data.drop(['kfold', config.TARGET], axis=1)
  cols = x_train.columns
  x_train = x_train.values
  y_train = train_data[config.TARGET].values
      
  x_valid = valid_data.drop(['kfold', config.TARGET], axis=1).values
  y_valid = valid_data[config.TARGET].values


  return x_train,x_valid,y_train,y_valid

In [31]:
train1 = df_processed
train_folds = create_folds(train1)

**XGBOOST CLASSIFIER**

In [32]:
max = 0
xgb_dummy = None
for fold in range(config.NUM_FOLDS):
  
  x_train,x_valid,y_train,y_valid = retrieve_data(train_folds,fold)
  xgb = XGBClassifier(booster =  'gbtree',
                        n_jobs = 4,
                        objective = 'binary:logistic',
                        silent = 0,
                      subsample = 1,
                      learning_rate = 0.01,
                      max_depth = 4,
                      min_child_weight = 2,
                      colsample_bytree = 1,
                      n_estimators= 200,
                       reg_lambda= 1)

  xgb.fit(x_train, y_train)


  y_pred_train = xgb.predict(x_train)  #np.where(yhat_train[:,1] >= best_thresh,1,0)
  y_pred_test = xgb.predict(x_valid) #np.where(yhat_test[:,1] >= best_thresh,1,0)
  
  if(accuracy_score(y_valid,y_pred_test) > max):
    max = accuracy_score(y_valid,y_pred_test)
    xgb_dummy = xgb


  print("Training accuracy : ",accuracy_score(y_train,y_pred_train))
  print("Testing accuracy : ",accuracy_score(y_valid,y_pred_test))
  # #print("F1_Score : ",f1_score(y_valid, y_pred_test))
  # print("Precision score : ",precision_score(y_valid, y_pred_test))
  # print("Recall score : ",recall_score(y_valid, y_pred_test))
  print(classification_report(y_valid, y_pred_test))
  print(confusion_matrix(y_valid, y_pred_test))

  print("\n\n-----------------------------------------------------------------------",end="\n\n")

Training accuracy :  0.9916440831074977
Testing accuracy :  0.9898373983739838
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        12
                                   AIDS       1.00      1.00      1.00        12
                                   Acne       1.00      1.00      1.00        12
                    Alcoholic hepatitis       1.00      1.00      1.00        12
                                Allergy       1.00      0.75      0.86        12
                              Arthritis       1.00      1.00      1.00        12
                       Bronchial Asthma       1.00      1.00      1.00        12
                   Cervical spondylosis       1.00      1.00      1.00        12
                            Chicken pox       1.00      1.00      1.00        12
                    Chronic cholestasis       1.00      1.00      1.00        12
                            C

In [34]:
xgb_dummy.save_model("symptoms_xgb.h5")

**Logistic Regression**

In [52]:
max = 0
lr_dummy = None
for fold in range(config.NUM_FOLDS):
  
  x_train,x_valid,y_train,y_valid = retrieve_data(train_folds,fold)
  lr = LogisticRegression(multi_class='ovr',max_iter=400)

  lr.fit(x_train, y_train)


  y_pred_train = lr.predict(x_train)  #np.where(yhat_train[:,1] >= best_thresh,1,0)
  y_pred_test = lr.predict(x_valid) #np.where(yhat_test[:,1] >= best_thresh,1,0)
  
  if(accuracy_score(y_valid,y_pred_test) > max):
    max = accuracy_score(y_valid,y_pred_test)
    lr_dummy = lr


  print("Training accuracy : ",accuracy_score(y_train,y_pred_train))
  print("Testing accuracy : ",accuracy_score(y_valid,y_pred_test))
  # #print("F1_Score : ",f1_score(y_valid, y_pred_test))
  # print("Precision score : ",precision_score(y_valid, y_pred_test))
  # print("Recall score : ",recall_score(y_valid, y_pred_test))
  print(classification_report(y_valid, y_pred_test))
  print(confusion_matrix(y_valid, y_pred_test))

  print("\n\n-----------------------------------------------------------------------",end="\n\n")

Training accuracy :  0.9024390243902439
Testing accuracy :  0.9024390243902439
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       0.75      1.00      0.86        12
                                   AIDS       0.79      0.92      0.85        12
                                   Acne       1.00      1.00      1.00        12
                    Alcoholic hepatitis       1.00      0.92      0.96        12
                                Allergy       0.56      0.75      0.64        12
                              Arthritis       1.00      0.83      0.91        12
                       Bronchial Asthma       0.92      1.00      0.96        12
                   Cervical spondylosis       1.00      0.67      0.80        12
                            Chicken pox       1.00      1.00      1.00        12
                    Chronic cholestasis       0.75      1.00      0.86        12
                            C

In [53]:
# save the model to disk
import pickle
filename = 'lr_model.pkl'
pickle.dump(lr_dummy, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)

**Decision Tree**

In [49]:
max = 0
dt_dummy = None
for fold in range(config.NUM_FOLDS):
  
  x_train,x_valid,y_train,y_valid = retrieve_data(train_folds,fold)
  dt = DecisionTreeClassifier(max_depth=14)

  dt.fit(x_train, y_train)


  y_pred_train = dt.predict(x_train)  #np.where(yhat_train[:,1] >= best_thresh,1,0)
  y_pred_test = dt.predict(x_valid) #np.where(yhat_test[:,1] >= best_thresh,1,0)
  
  if(accuracy_score(y_valid,y_pred_test) > max):
    max = accuracy_score(y_valid,y_pred_test)
    dt_dummy = dt


  print("Training accuracy : ",accuracy_score(y_train,y_pred_train))
  print("Testing accuracy : ",accuracy_score(y_valid,y_pred_test))
  print(classification_report(y_valid, y_pred_test))
  print(confusion_matrix(y_valid, y_pred_test))

  print("\n\n-----------------------------------------------------------------------",end="\n\n")

Training accuracy :  0.974706413730804
Testing accuracy :  0.9715447154471545
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        12
                                   AIDS       1.00      1.00      1.00        12
                                   Acne       1.00      1.00      1.00        12
                    Alcoholic hepatitis       1.00      1.00      1.00        12
                                Allergy       1.00      0.75      0.86        12
                              Arthritis       1.00      1.00      1.00        12
                       Bronchial Asthma       1.00      1.00      1.00        12
                   Cervical spondylosis       1.00      1.00      1.00        12
                            Chicken pox       1.00      1.00      1.00        12
                    Chronic cholestasis       1.00      1.00      1.00        12
                            Co

In [50]:
filename = 'dt_model.pkl'
pickle.dump(dt_dummy, open(filename, 'wb'))

**Random Forest**

In [60]:
max = 0
rf_dummy = None
for fold in range(config.NUM_FOLDS):
  
  x_train,x_valid,y_train,y_valid = retrieve_data(train_folds,fold)
  rf = RandomForestClassifier(n_estimators=400,max_depth=9)

  rf.fit(x_train, y_train)


  y_pred_train = rf.predict(x_train)  #np.where(yhat_train[:,1] >= best_thresh,1,0)
  y_pred_test = rf.predict(x_valid) #np.where(yhat_test[:,1] >= best_thresh,1,0)
  
  if(accuracy_score(y_valid,y_pred_test) > max):
    max = accuracy_score(y_valid,y_pred_test)
    rf_dummy = rf


  print("Training accuracy : ",accuracy_score(y_train,y_pred_train))
  print("Testing accuracy : ",accuracy_score(y_valid,y_pred_test))
  print(classification_report(y_valid, y_pred_test))
  print(confusion_matrix(y_valid, y_pred_test))

  print("\n\n-----------------------------------------------------------------------",end="\n\n")

Training accuracy :  0.9853206865401988
Testing accuracy :  0.9857723577235772
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        12
                                   AIDS       1.00      1.00      1.00        12
                                   Acne       1.00      1.00      1.00        12
                    Alcoholic hepatitis       1.00      1.00      1.00        12
                                Allergy       1.00      0.75      0.86        12
                              Arthritis       1.00      1.00      1.00        12
                       Bronchial Asthma       0.92      1.00      0.96        12
                   Cervical spondylosis       1.00      0.92      0.96        12
                            Chicken pox       1.00      1.00      1.00        12
                    Chronic cholestasis       1.00      1.00      1.00        12
                            C

In [61]:
filename = 'rf_model.pkl'
pickle.dump(rf_dummy, open(filename, 'wb'))