In [75]:
import numpy as np
import pandas as pd
from tensorflow import keras 
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.models import save_model 
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns


In [76]:
# Read the 0/1-encoded disease/symptom table; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer="binary-data-diseasefinder_output.csv")

In [77]:
# Show the loaded frame (the notebook renders the last expression of a cell).
data

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,...,Symptom_122,Symptom_123,Symptom_124,Symptom_125,Symptom_126,Symptom_127,Symptom_128,Symptom_129,Symptom_130,Symptom_131
0,Fungal infection,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4916,Acne,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4917,Urinary tract infection,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
4918,Psoriasis,0,1,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,0,0


In [78]:
# Work on an explicit copy so the label encoding below never writes through to
# the raw `data` frame (pd.DataFrame(data) does not copy its input by default).
df = data.copy()

# Build the label encoding: each distinct disease name gets a float ID, numbered
# in order of first appearance in the 'Disease' column.
unique_diseases = df['Disease'].unique()
disease_to_id = {disease: float(i) for i, disease in enumerate(unique_diseases)}

# Replace the disease names in the copy with their corresponding float IDs.
# NOTE(review): no visible cell saves disease_to_id, so it has to be kept (or rebuilt
# from the same CSV) to turn predicted IDs back into disease names.
df['Disease'] = df['Disease'].map(disease_to_id)

In [79]:
# Show the frame after encoding; 'Disease' now holds float IDs instead of names.
df

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,...,Symptom_122,Symptom_123,Symptom_124,Symptom_125,Symptom_126,Symptom_127,Symptom_128,Symptom_129,Symptom_130,Symptom_131
0,0.0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,36.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4916,37.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4917,38.0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
4918,39.0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,0,0


In [80]:
# Target vector: the encoded disease ID of every row.
y = df["Disease"]
# Feature frame starts out as the full table; the label column is removed in the next cell.
X = df

In [81]:
# Derive the features straight from df rather than from X itself, so this cell
# can be re-run without raising KeyError once 'Disease' has already been dropped.
X = df.drop(columns='Disease')

In [82]:
# Show the target Series (encoded disease IDs).
y

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
4915    36.0
4916    37.0
4917    38.0
4918    39.0
4919    40.0
Name: Disease, Length: 4920, dtype: float64

In [83]:
# Show the feature frame; it should contain only the Symptom_* columns at this point.
X

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,...,Symptom_122,Symptom_123,Symptom_124,Symptom_125,Symptom_126,Symptom_127,Symptom_128,Symptom_129,Symptom_130,Symptom_131
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4916,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4917,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
4918,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,0,0


In [84]:
# Split on the feature matrix X, not on df: df still carries the 'Disease' target
# column, which would hand the label to the model as an input feature.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.32, random_state=15)

In [85]:
# Hold out 32% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.32,
    random_state=15,
)

In [86]:
# Sanity check: (rows, columns) of the training features.
X_train.shape

(3345, 131)

In [87]:
# Decision-tree classifier with a fixed random_state so repeated fits are reproducible.
tree = DecisionTreeClassifier(random_state=24)

In [88]:
# Train the tree on the training split.
tree.fit(X_train, y_train)

In [89]:
# Predict a class ID for every row of the held-out split, then display the array.
y_pred = tree.predict(X_test)
y_pred

array([22., 23., 13., ..., 15., 32., 40.])

In [90]:
# Sanity check: (rows, columns) of the held-out features.
X_test.shape


(1575, 131)

In [91]:
# Confirm the container type of the held-out split.
type(X_test)

pandas.core.frame.DataFrame

In [92]:
# Pull the first held-out row out as a one-row DataFrame (the list indexer keeps
# it two-dimensional) and renumber its index from 0.
new_df = (
    X_test
    .iloc[[0]]
    .reset_index(drop=True)
)
new_df

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,...,Symptom_122,Symptom_123,Symptom_124,Symptom_125,Symptom_126,Symptom_127,Symptom_128,Symptom_129,Symptom_130,Symptom_131
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
# Second name for the same one-row frame (plain assignment, no copy is made).
X1_test = new_df

In [94]:
# Inspect the first row of df (label column included) as a Series.
df.iloc[0]

Disease        0.0
Symptom_1      1.0
Symptom_2      1.0
Symptom_3      1.0
Symptom_4      1.0
              ... 
Symptom_127    0.0
Symptom_128    0.0
Symptom_129    0.0
Symptom_130    0.0
Symptom_131    0.0
Name: 0, Length: 132, dtype: float64

In [95]:
# Show the single-row input that is fed to the model in the next cell.
X1_test

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,...,Symptom_122,Symptom_123,Symptom_124,Symptom_125,Symptom_126,Symptom_127,Symptom_128,Symptom_129,Symptom_130,Symptom_131
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
# Predict the class ID for the single-row frame and display it.
# Note: Y_pred (capital Y) is a different variable from y_pred, which holds the
# predictions for the whole held-out split.
Y_pred = tree.predict(X1_test)
Y_pred

array([22.])

In [97]:
# Confusion matrix over the held-out split (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)


In [98]:
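# Disabled confusion-matrix heatmap. Two things to fix before re-enabling it:
#   * xticklabels/yticklabels are given df.Disease, which has one entry per data row
#     rather than one per class; the matrix needs exactly one label per class.
#   * scikit-learn's confusion_matrix puts actual labels on rows and predicted labels
#     on columns, so the y-axis is 'Actual' and the x-axis is 'Prediction' (swapped below).
# sns.heatmap(cm, annot=True,fmt='d', cmap='YlGnBu', xticklabels=df.Disease, yticklabels=df.Disease)
# plt.ylabel('Prediction',fontsize=12)
# plt.xlabel('Actual',fontsize=12)
# plt.title('Confusion Matrix',fontsize=16)
# plt.show()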

In [99]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score


In [100]:
# Fraction of held-out rows whose predicted class matches the true class.
# NOTE(review): a perfect score can mean identical rows sit in both the train and
# test splits; check the data for duplicates before trusting it.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

1.0


In [101]:
# Per-class precision: average=None returns one score per label instead of an aggregate.
precision = precision_score(y_true=y_test, y_pred=y_pred, average=None)
print(precision)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [102]:
# Per-class recall: average=None returns one score per label instead of an aggregate.
recall = recall_score(y_true=y_test, y_pred=y_pred, average=None)
print(recall)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [103]:
# Micro-averaged precision: computed globally from the pooled counts of all classes.
micro_precision = precision_score(y_true=y_test, y_pred=y_pred, average='micro')
print(micro_precision)

1.0


In [104]:
# import pickle
# filename = 'diseaseFinder_dt_july_2024.pkl'

In [105]:
# pickle.dump(tree, open(filename, 'wb'))


In [106]:
import pickle

# Serialise the fitted tree to disk. The with-block guarantees the file handle is
# closed even if pickle.dump raises, which the manual open()/close() pair did not.
with open("diseaseFinder_dt_aug_2_2024.pkl", "wb") as pickle_out:
    pickle.dump(tree, pickle_out)