In [37]:
#importing required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [38]:
#reading the daataset
df=pd.read_csv('dataset.csv')
df = shuffle(df,random_state=42)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
373,Acne,skin_rash,blackheads,scurring,,,,,,,,,,,,,,
4916,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,
1550,Hyperthyroidism,fatigue,mood_swings,weight_loss,restlessness,sweating,diarrhoea,fast_heart_rate,excessive_hunger,muscle_weakness,irritability,abnormal_menstruation,,,,,,
3081,AIDS,muscle_wasting,patches_in_throat,high_fever,extra_marital_contacts,,,,,,,,,,,,,
3857,Chronic cholestasis,itching,vomiting,yellowish_skin,nausea,loss_of_appetite,abdominal_pain,yellowing_of_eyes,,,,,,,,,,


In [39]:
#replacing '-' with blank space
for col in df.columns:
    df[col] = df[col].str.replace('_',' ')
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
373,Acne,skin rash,blackheads,scurring,,,,,,,,,,,,,,
4916,Acne,skin rash,pus filled pimples,blackheads,scurring,,,,,,,,,,,,,
1550,Hyperthyroidism,fatigue,mood swings,weight loss,restlessness,sweating,diarrhoea,fast heart rate,excessive hunger,muscle weakness,irritability,abnormal menstruation,,,,,,
3081,AIDS,muscle wasting,patches in throat,high fever,extra marital contacts,,,,,,,,,,,,,
3857,Chronic cholestasis,itching,vomiting,yellowish skin,nausea,loss of appetite,abdominal pain,yellowing of eyes,,,,,,,,,,


In [40]:
#getting data information
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4920 entries, 373 to 860
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Disease     4920 non-null   object
 1   Symptom_1   4920 non-null   object
 2   Symptom_2   4920 non-null   object
 3   Symptom_3   4920 non-null   object
 4   Symptom_4   4572 non-null   object
 5   Symptom_5   3714 non-null   object
 6   Symptom_6   2934 non-null   object
 7   Symptom_7   2268 non-null   object
 8   Symptom_8   1944 non-null   object
 9   Symptom_9   1692 non-null   object
 10  Symptom_10  1512 non-null   object
 11  Symptom_11  1194 non-null   object
 12  Symptom_12  744 non-null    object
 13  Symptom_13  504 non-null    object
 14  Symptom_14  306 non-null    object
 15  Symptom_15  240 non-null    object
 16  Symptom_16  192 non-null    object
 17  Symptom_17  72 non-null     object
dtypes: object(18)
memory usage: 730.3+ KB


In [41]:
#checking for null values
df.isnull().sum()

Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64

In [42]:
#removing starting and trailing blank spaces
cols = df.columns
data = df[cols].values.flatten()

s = pd.Series(data)
s = s.str.strip()
s = s.values.reshape(df.shape)

df = pd.DataFrame(s, columns=df.columns)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Acne,skin rash,blackheads,scurring,,,,,,,,,,,,,,
1,Acne,skin rash,pus filled pimples,blackheads,scurring,,,,,,,,,,,,,
2,Hyperthyroidism,fatigue,mood swings,weight loss,restlessness,sweating,diarrhoea,fast heart rate,excessive hunger,muscle weakness,irritability,abnormal menstruation,,,,,,
3,AIDS,muscle wasting,patches in throat,high fever,extra marital contacts,,,,,,,,,,,,,
4,Chronic cholestasis,itching,vomiting,yellowish skin,nausea,loss of appetite,abdominal pain,yellowing of eyes,,,,,,,,,,


In [43]:
#filling empty cells with zero
df = df.fillna(0)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Acne,skin rash,blackheads,scurring,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Acne,skin rash,pus filled pimples,blackheads,scurring,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Hyperthyroidism,fatigue,mood swings,weight loss,restlessness,sweating,diarrhoea,fast heart rate,excessive hunger,muscle weakness,irritability,abnormal menstruation,0,0,0,0,0,0
3,AIDS,muscle wasting,patches in throat,high fever,extra marital contacts,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Chronic cholestasis,itching,vomiting,yellowish skin,nausea,loss of appetite,abdominal pain,yellowing of eyes,0,0,0,0,0,0,0,0,0,0


In [44]:
#reading dataset
df1 = pd.read_csv('Symptom-severity.csv')
df1['Symptom'] = df1['Symptom'].str.replace('_',' ')
df1.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin rash,3
2,nodal skin eruptions,4
3,continuous sneezing,4
4,shivering,5


In [45]:
#getting all the unique symptoms
df1['Symptom'].unique()

array(['itching', 'skin rash', 'nodal skin eruptions',
       'continuous sneezing', 'shivering', 'chills', 'joint pain',
       'stomach pain', 'acidity', 'ulcers on tongue', 'muscle wasting',
       'vomiting', 'burning micturition', 'spotting urination', 'fatigue',
       'weight gain', 'anxiety', 'cold hands and feets', 'mood swings',
       'weight loss', 'restlessness', 'lethargy', 'patches in throat',
       'irregular sugar level', 'cough', 'high fever', 'sunken eyes',
       'breathlessness', 'sweating', 'dehydration', 'indigestion',
       'headache', 'yellowish skin', 'dark urine', 'nausea',
       'loss of appetite', 'pain behind the eyes', 'back pain',
       'constipation', 'abdominal pain', 'diarrhoea', 'mild fever',
       'yellow urine', 'yellowing of eyes', 'acute liver failure',
       'fluid overload', 'swelling of stomach', 'swelled lymph nodes',
       'malaise', 'blurred and distorted vision', 'phlegm',
       'throat irritation', 'redness of eyes', 'sinus pressu

In [46]:
#replacing each symptom with repective weights
vals = df.values
symptoms = df1['Symptom'].unique()

for i in range(len(symptoms)):
    vals[vals == symptoms[i]] = df1[df1['Symptom'] == symptoms[i]]['weight'].values[0]

d = pd.DataFrame(vals, columns=cols)
d.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Acne,3,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Acne,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Hyperthyroidism,4,3,3,5,3,6,5,4,2,2,6,0,0,0,0,0,0
3,AIDS,3,6,7,5,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Chronic cholestasis,1,5,3,5,4,4,4,0,0,0,0,0,0,0,0,0,0


In [47]:
#getting the shape of new dataframe
d.shape

(4920, 18)

In [48]:
#making a copy of dataframe d to variable x and dropping the disease column
x=d.copy()
x.pop('Disease')
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Symptom_1   4920 non-null   object
 1   Symptom_2   4920 non-null   object
 2   Symptom_3   4920 non-null   object
 3   Symptom_4   4920 non-null   object
 4   Symptom_5   4920 non-null   object
 5   Symptom_6   4920 non-null   object
 6   Symptom_7   4920 non-null   object
 7   Symptom_8   4920 non-null   object
 8   Symptom_9   4920 non-null   object
 9   Symptom_10  4920 non-null   object
 10  Symptom_11  4920 non-null   object
 11  Symptom_12  4920 non-null   object
 12  Symptom_13  4920 non-null   object
 13  Symptom_14  4920 non-null   object
 14  Symptom_15  4920 non-null   object
 15  Symptom_16  4920 non-null   object
 16  Symptom_17  4920 non-null   object
dtypes: object(17)
memory usage: 653.6+ KB


In [49]:
#finding all the symptoms which are not replaced with weights
for i in x:
  w=x[i].unique()
  print(w)

[3 4 1 5 2 6 7]
[2 3 6 5 7 4 'foul smell of urine']
[2 3 7 4 5 'foul smell of urine' 6 'dischromic  patches']
[0 2 5 4 3 6 7 'dischromic  patches' 'spotting  urination']
[0 3 4 5 2 6 7 'spotting  urination']
[0 6 4 5 3 2 7]
[0 5 4 3 6 2]
[0 4 2 5 3 6 7]
[0 2 4 3 5 7 6]
[0 2 4 6 5 3]
[0 6 2 5 4 7 3]
[0 4 2 7 6 5]
[0 6 5 3 2]
[0 7 5 3]
[0 5 7 3]
[0 3 2 5]
[0 2]


In [50]:
#replacing the symptons with 0
d = d.replace('dischromic  patches', 0)
d = d.replace('spotting  urination',0)
df = d.replace('foul smell of urine',0)
df.head(10)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Acne,3,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Acne,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Hyperthyroidism,4,3,3,5,3,6,5,4,2,2,6,0,0,0,0,0,0
3,AIDS,3,6,7,5,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Chronic cholestasis,1,5,3,5,4,4,4,0,0,0,0,0,0,0,0,0,0
5,Hypertension,3,7,4,4,3,0,0,0,0,0,0,0,0,0,0,0,0
6,Hypoglycemia,5,4,4,3,3,5,5,4,4,4,2,4,0,0,0,0,0
7,Arthritis,2,4,5,2,0,0,0,0,0,0,0,0,0,0,0,0,0
8,Hepatitis B,1,4,2,3,4,4,4,4,4,6,5,2,0,0,0,0,0
9,Migraine,3,5,3,5,4,4,3,2,3,0,0,0,0,0,0,0,0


In [51]:
#checking for null values
df.isna().sum()

Disease       0
Symptom_1     0
Symptom_2     0
Symptom_3     0
Symptom_4     0
Symptom_5     0
Symptom_6     0
Symptom_7     0
Symptom_8     0
Symptom_9     0
Symptom_10    0
Symptom_11    0
Symptom_12    0
Symptom_13    0
Symptom_14    0
Symptom_15    0
Symptom_16    0
Symptom_17    0
dtype: int64

In [52]:
#splitting of dataframe into 'x' and 'y'
x=df.drop('Disease',axis=1)
y=df['Disease']
x.shape,y.shape

((4920, 17), (4920,))

In [53]:
#splitting both 'x' and 'y' into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8,random_state=42)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(3936, 17) (984, 17) (3936,) (984,)


In [54]:
#implementing logistic regression
model_lr = LogisticRegression(random_state=42)
model_lr.fit(x_train, y_train)
preds_lr = model_lr.predict(x_test)
accuracy = accuracy_score(y_test, preds_lr)
precision = precision_score(y_test, preds_lr, average='macro')
recall = recall_score(y_test, preds_lr, average='macro')
f1 = f1_score(y_test, preds_lr, average='macro')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.907520325203252
Precision: 0.9075757976337528
Recall: 0.9052132896064465
F1-Score: 0.9027035185909865


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
#implementing Decision Tree
model_dt=DecisionTreeClassifier(criterion='gini',random_state=42,max_depth=13)
model_dt.fit(x_train, y_train)
preds_dt=model_dt.predict(x_test)
accuracy = accuracy_score(y_test, preds_dt)
precision = precision_score(y_test, preds_dt, average='macro')
recall = recall_score(y_test, preds_dt, average='macro')
f1 = f1_score(y_test, preds_dt, average='macro')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.9583333333333334
Precision: 0.9779234551238792
Recall: 0.9544777143352154
F1-Score: 0.957812664665753


In [57]:
#implementing Support Vector Machine
model_svm = SVC(kernel='linear', random_state=42)
model_svm.fit(x_train, y_train)
preds_svm = model_svm.predict(x_test)
accuracy = accuracy_score(y_test, preds_svm)
precision = precision_score(y_test, preds_svm, average='macro')
recall = recall_score(y_test, preds_svm, average='macro')
f1 = f1_score(y_test, preds_svm, average='macro')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.983739837398374
Precision: 0.9842733295614424
Recall: 0.9804485456703155
F1-Score: 0.9806967923774239


In [58]:
#implementing Random Forest
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(x_train, y_train)
preds_rf = model_rf.predict(x_test)
accuracy = accuracy_score(y_test, preds_rf)
precision = precision_score(y_test, preds_rf, average='macro')
recall = recall_score(y_test, preds_rf, average='macro')
f1 = f1_score(y_test, preds_rf, average='macro')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.9959349593495935
Precision: 0.9961527293844367
Recall: 0.9958147049710061
F1-Score: 0.9958380389536958


The Random Forest Classification gives the best accuracy till now. Thus for disease prediction we should go with the Random Forest Classification.