In [137]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from google.colab import drive
# Mount Google Drive (Colab-only helper) so the CSV files read below from
# /content/drive/MyDrive/... are reachable.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [138]:
# Load the disease/symptom table and normalise the symptom strings


def _strip_spaces(column):
    """Remove every space from a text column; return other columns unchanged."""
    if column.dtype == "object":
        return column.str.replace(" ", "")
    return column


# Empty symptom slots are stored as the literal string 'nan'
df = pd.read_csv('/content/drive/MyDrive/assignment1c_data/dataset.csv').fillna('nan')

# Keep the untouched disease names: the blanket space removal below would
# otherwise alter them as well
labels = df['Disease']

df = df.apply(_strip_spaces)
df['Disease'] = labels
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic_patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic_patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic_patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic_patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [139]:
# Getting the list of all possible symptoms
symptoms = (
    pd.read_csv('/content/drive/MyDrive/assignment1c_data/Symptom-severity.csv')
    .drop(columns=['weight'])  # the severity weight is not used as a feature here
    # Strip spaces so the names match the cleaned symptom strings in `df`
    .apply(lambda col: col.str.replace(" ", "") if col.dtype == "object" else col)
)
print(symptoms.head())

# Persist the symptom table. `pickle` comes from the import cell at the top of
# the notebook, so this cell also runs on a fresh kernel; the context manager
# guarantees the file is flushed and closed.
with open("./symptoms.pkl", "wb") as symptoms_file:
    pickle.dump(symptoms, symptoms_file)

# Using label binarizer to convert a list of symptoms into indicator rows with
# one column per distinct symptom name. It is fitted on the 1-D 'Symptom'
# column, which is the input shape LabelBinarizer documents, rather than on the
# whole single-column DataFrame.
lb = preprocessing.LabelBinarizer()
lb.fit(symptoms['Symptom'])

                Symptom
0               itching
1             skin_rash
2  nodal_skin_eruptions
3   continuous_sneezing
4             shivering


LabelBinarizer()

In [140]:
# Build the model inputs: one label per row and one symptom-indicator vector per row.

# The label is read by column name instead of "first non-'nan' value in the row",
# so a missing Disease can never silently turn the first symptom into the label.
y_vec = df['Disease'].tolist()

symptom_columns = [column for column in df.columns if column != 'Disease']

x_vec = []
for symptom_row in df[symptom_columns].to_numpy():
    # 'nan' is the placeholder written by fillna for unused symptom slots
    present = [symptom for symptom in symptom_row if symptom != 'nan']
    # lb.transform gives one indicator row per symptom; adding the rows along
    # axis 0 collapses them into a single vector with one slot per symptom
    # known to `lb`.
    # NOTE(review): names that `lb` was not fitted on appear to be encoded as
    # all-zero rows and would be dropped silently — confirm that the dataset
    # and the severity file use the same symptom spellings.
    x_vec.append(lb.transform(present).sum(axis=0))


In [141]:
# Splitting the dataset into training (85%) and testing (15%) data.
# random_state pins the shuffle so the split, and every metric computed from it,
# is the same on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x_vec,
    y_vec,
    shuffle=True,
    train_size=0.85,
    random_state=42,
)

In [142]:
# Fit a decision tree on the training split and predict the held-out rows.
# criterion="gini" is the library default, spelled out because the variable name
# and the printed report both refer to it. random_state makes the fitted tree
# (and therefore the pickled model) reproducible between runs.
dt_clf_gini = DecisionTreeClassifier(criterion="gini", random_state=42)
dt_clf_gini.fit(x_train, y_train)
y_pred_gini = dt_clf_gini.predict(x_test)

In [143]:
# Report the accuracy of the tree on the held-out split as a percentage.
# NOTE(review): the heading string is reproduced verbatim (including its
# spelling) so the printed output stays unchanged.
# NOTE(review): the recorded run shows 100% — worth confirming that identical
# rows are not shared between the training and test splits.
accuracy_pct = accuracy_score(y_test, y_pred_gini) * 100
print("Desicion Tree using Gini Index\nAccuracy is", accuracy_pct, "%")


Desicion Tree using Gini Index
Accuracy is 100.0 %


In [144]:
# Per-class precision / recall / F1 and the confusion matrix on the held-out split.
# classification_report, confusion_matrix and pickle are provided by the import
# cell at the top of the notebook.
predictions = dt_clf_gini.predict(x_test)

print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

# Persist the fitted tree and the symptom binarizer to disk. Context managers
# make sure each file is flushed and closed instead of leaving the handle to the
# garbage collector.
with open("./model.pkl", "wb") as model_file:
    pickle.dump(dt_clf_gini, model_file)
with open("./lb.pkl", "wb") as binarizer_file:
    pickle.dump(lb, binarizer_file)


                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        19
                                   AIDS       1.00      1.00      1.00        10
                                   Acne       1.00      1.00      1.00        13
                    Alcoholic hepatitis       1.00      1.00      1.00        21
                                Allergy       1.00      1.00      1.00        10
                              Arthritis       1.00      1.00      1.00        17
                       Bronchial Asthma       1.00      1.00      1.00        11
                   Cervical spondylosis       1.00      1.00      1.00        23
                            Chicken pox       1.00      1.00      1.00        16
                    Chronic cholestasis       1.00      1.00      1.00        15
                            Common Cold       1.00      1.00      1.00        23
                           

In [145]:
# Testing some symptoms: encode a hand-picked symptom set the same way the
# training rows were encoded (one indicator row per symptom, rows added together)
sample_symptoms = ['dischromic_patches','itching','skin_rash','nodal_skin_eruptions']
y = sum(lb.transform(sample_symptoms))

In [146]:
# predict expects a batch of samples, so the single encoded vector is wrapped in a list
single_sample = [y]
dt_clf_gini.predict(single_sample)

array(['Fungal infection'], dtype='<U39')