In [1]:
import pandas as pd
import numpy as np

import warnings 
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("../data/DiseaseAndSymptoms.csv")
df.sample(5)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
375,Acne,skin_rash,pus_filled_pimples,blackheads,,,,,,,,,,,,,,
3581,Jaundice,itching,vomiting,fatigue,weight_loss,high_fever,yellowish_skin,dark_urine,abdominal_pain,,,,,,,,,
221,Hepatitis D,vomiting,fatigue,yellowish_skin,dark_urine,nausea,loss_of_appetite,abdominal_pain,yellowing_of_eyes,,,,,,,,,
97,Bronchial Asthma,fatigue,cough,high_fever,breathlessness,family_history,mucoid_sputum,,,,,,,,,,,
2772,Tuberculosis,chills,vomiting,fatigue,weight_loss,cough,high_fever,breathlessness,sweating,loss_of_appetite,mild_fever,yellowing_of_eyes,swelled_lymph_nodes,malaise,phlegm,chest_pain,blood_in_sputum,


In [3]:
# Create Combined Symptoms Column
df['symptoms'] = df[['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Symptom_5']].apply(lambda x: ','.join(x.dropna().astype(str).str.strip()), axis=1)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,symptoms
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic_patches,,,,,,,,,,,,,,"itching,skin_rash,nodal_skin_eruptions,dischro..."
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic_patches,,,,,,,,,,,,,,,"skin_rash,nodal_skin_eruptions,dischromic_patches"
2,Fungal infection,itching,nodal_skin_eruptions,dischromic_patches,,,,,,,,,,,,,,,"itching,nodal_skin_eruptions,dischromic_patches"
3,Fungal infection,itching,skin_rash,dischromic_patches,,,,,,,,,,,,,,,"itching,skin_rash,dischromic_patches"
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,,"itching,skin_rash,nodal_skin_eruptions"


In [4]:
df = df[['Disease', 'symptoms']]
df.head()

Unnamed: 0,Disease,symptoms
0,Fungal infection,"itching,skin_rash,nodal_skin_eruptions,dischro..."
1,Fungal infection,"skin_rash,nodal_skin_eruptions,dischromic_patches"
2,Fungal infection,"itching,nodal_skin_eruptions,dischromic_patches"
3,Fungal infection,"itching,skin_rash,dischromic_patches"
4,Fungal infection,"itching,skin_rash,nodal_skin_eruptions"


In [5]:
# Group the data by 'Disease' and aggregate the 'symptoms' column
df = df.groupby('Disease').agg({'symptoms': lambda x: ','.join(x)}).reset_index()
df.head()

Unnamed: 0,Disease,symptoms
0,(vertigo) Paroymsal Positional Vertigo,"vomiting,headache,nausea,spinning_movements,lo..."
1,AIDS,"muscle_wasting,patches_in_throat,high_fever,ex..."
2,Acne,"skin_rash,pus_filled_pimples,blackheads,scurri..."
3,Alcoholic hepatitis,"vomiting,yellowish_skin,abdominal_pain,swellin..."
4,Allergy,"continuous_sneezing,shivering,chills,watering_..."


In [6]:
# Clean the 'symptoms' by removing all the duplicate symptoms and sorting them in alphabetical order.
df['symptoms'] = df['symptoms'].apply(lambda x: ','.join(sorted(set(x.split(',')))))
df.head()

Unnamed: 0,Disease,symptoms
0,(vertigo) Paroymsal Positional Vertigo,"headache,loss_of_balance,nausea,spinning_movem..."
1,AIDS,"extra_marital_contacts,high_fever,muscle_wasti..."
2,Acne,"blackheads,pus_filled_pimples,scurring,skin_rash"
3,Alcoholic hepatitis,"abdominal_pain,distention_of_abdomen,history_o..."
4,Allergy,"chills,continuous_sneezing,shivering,watering_..."


In [7]:
df[df['Disease'] == 'AIDS']['symptoms'].values[0]

'extra_marital_contacts,high_fever,muscle_wasting,patches_in_throat'

In [8]:
df.to_csv("../data/DiseaseAndSymptoms_Cleaned.csv", index=False)

In [9]:
# Number of unique symptoms
unique_symptoms = set()
df['symptoms'].str.split(',').apply(unique_symptoms.update)
len(unique_symptoms)

92

In [10]:
# create a binary matrix
symptom_list = list(unique_symptoms)
symptom_list.sort()
symptom_list[:5]

# Create a binary matrix
binary_matrix = np.zeros((len(df), len(symptom_list)))
for i, symptoms in enumerate(df['symptoms'].str.split(',')):
    for symptom in symptoms:
        j = symptom_list.index(symptom)
        binary_matrix[i, j] = 1

# binary_matrix[:5, :5]

In [11]:
# cReate a DataFrame from the binary matrix
binary_df = pd.DataFrame(binary_matrix, columns=symptom_list)
binary_df.insert(0, 'Disease', df['Disease'])
binary_df.sample(5)

Unnamed: 0,Disease,abdominal_pain,acidity,altered_sensorium,anxiety,back_pain,blackheads,bladder_discomfort,blister,bloody_stool,...,unsteadiness,vomiting,watering_from_eyes,weakness_in_limbs,weakness_of_one_body_side,weight_gain,weight_loss,yellow_crust_ooze,yellowing_of_eyes,yellowish_skin
24,Hyperthyroidism,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25,Hypoglycemia,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,Urinary tract infection,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,Pneumonia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,Diabetes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [12]:
# Save the DataFrame to a CSV file
binary_df.to_csv("../data/SymptomsBinaryMatrix.csv", index=False)

In [13]:
# Train a machine learning model to predict diseases based on symptoms
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Split the data into training and testing sets
X = binary_df.drop('Disease', axis=1)
y = binary_df['Disease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
clf_predictions = clf.predict(X_test)

# Evaluate the classifier
print(classification_report(y_test, clf_predictions))


                              precision    recall  f1-score   support

                     Allergy       0.00      0.00      0.00       1.0
                 Chicken pox       0.00      0.00      0.00       1.0
                 Common Cold       0.00      0.00      0.00       0.0
                   Diabetes        0.00      0.00      0.00       0.0
Dimorphic hemmorhoids(piles)       0.00      0.00      0.00       1.0
                 Hepatitis B       0.00      0.00      0.00       1.0
                 Hepatitis C       0.00      0.00      0.00       0.0
                 Hepatitis D       0.00      0.00      0.00       0.0
             Hyperthyroidism       0.00      0.00      0.00       1.0
                Hypoglycemia       0.00      0.00      0.00       1.0
                     Malaria       0.00      0.00      0.00       1.0
                     Typhoid       0.00      0.00      0.00       0.0
     Urinary tract infection       0.00      0.00      0.00       0.0
              Varic

In [14]:
# Save the trained model to a file
import joblib
joblib.dump(clf, "../models/DiseasePredictionModel.pkl")


['../models/DiseasePredictionModel.pkl']

In [15]:
# Load the trained model from a file
clf = joblib.load("../models/DiseasePredictionModel.pkl")


In [16]:
len(symptom_list)

92

In [17]:
def predict_disease(symptoms):
    # Create a binary vector for the input symptoms
    input_vector = np.zeros(X_train.shape[1])
    for symptom in symptoms:
        if symptom in symptom_list:
            input_vector[symptom_list.index(symptom)] = 1

    # Make a prediction using the trained model
    disease_probabilities = clf.predict_proba([input_vector])[0]
    disease_predictions = clf.predict([input_vector])

    # Display the predicted diseases and their probabilities
    disease_results = pd.DataFrame({'Disease': clf.classes_, 'Probability': disease_probabilities})
    disease_results.sort_values('Probability', ascending=False, inplace=True)
    disease_results.reset_index(drop=True, inplace=True)
    return disease_results

In [18]:
# Test the prediction function
symptoms = ['chills','fatigue','high_fever','joint_pain','skin_rash','vomiting']
predictions = predict_disease(symptoms)
print("Top 3 Predictions:")
for i in range(3):
    print(f"{i + 1}. {predictions['Disease'][i]} ({predictions['Probability'][i]})")

Top 3 Predictions:
1. Dengue (0.59)
2. Hepatitis E (0.08)
3. Typhoid (0.06)


In [19]:
# Test the prediction function
symptoms = ['extra_marital_contacts','high_fever','muscle_wasting','patches_in_throat']
predictions = predict_disease(symptoms)
print("Top 3 Predictions:")
for i in range(3):
    print(f"{i + 1}. {predictions['Disease'][i]} ({predictions['Probability'][i]})")

Top 3 Predictions:
1. AIDS (0.6)
2. Bronchial Asthma (0.05)
3. Impetigo (0.05)
