In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle
import os

In [29]:
# Load both source tables. A missing file is reported in a short, readable
# line and then re-raised so the notebook stops here instead of failing later
# on an undefined variable.
try:
    symptom_data = pd.read_csv('../data/disease_symptom.csv')
    drug_data = pd.read_csv('../data/disease_drug.csv')
except FileNotFoundError as missing_file:
    print(f"Error: {missing_file}")
    raise

In [30]:
# Sanity checks on the two tables before any feature engineering.

# 1) Both tables must expose a "Disease" column.
both_have_disease = all(
    "Disease" in frame.columns for frame in (symptom_data, drug_data)
)
if not both_have_disease:
    print("Error: 'Disease' column missing from one or both CSV files.")
    raise Exception("Disease column missing")

# 2) Every disease that appears in the symptom table must also appear in the
#    drug table (the drug table may contain extra diseases).
symptom_diseases = set(symptom_data["Disease"].unique())
drug_diseases = set(drug_data["Disease"].unique())

if not (symptom_diseases <= drug_diseases):
    print("Error: there are diseases in the symptom file that are not in the drug file.")
    raise Exception("disease mismatch")

In [31]:
# Build the symptom indicator matrix: one column per distinct symptom string
# found in Symptom1..Symptom4, one row per row of symptom_data.
#
# stack() can drop missing cells, so a row whose four symptom cells are all
# empty would disappear from the groupby result. Reindexing onto the original
# index restores such rows as all-zero vectors and keeps the matrix aligned
# row-for-row with `diseases` (train_test_split needs equal lengths).
#
# Values are per-row sums of the dummies, so a symptom listed twice in the
# same row yields 2 rather than 1.
symptom_fields = ['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']
symptoms = (
    pd.get_dummies(symptom_data[symptom_fields].stack())
    .groupby(level=0)
    .sum()
    .reindex(symptom_data.index, fill_value=0)
)
diseases = symptom_data['Disease']

# Only the drug-name column is kept here; the Disease column of drug_data is
# not carried along in this Series.
drugs = drug_data['Drug Name']

# Encode disease names as integer class ids for the classifier.
label_encoder = LabelEncoder()
encoded_diseases = label_encoder.fit_transform(diseases)

In [32]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(symptoms, encoded_diseases, test_size=0.2, random_state=42)

# Column order of the training matrix; inference inputs must use the same order.
symptom_columns = symptoms.columns

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Pass the full list of encoded labels explicitly. Without `labels`,
# classification_report infers them from y_test/y_pred and raises if that set
# is smaller than `target_names` (e.g. a rare disease that landed entirely in
# the training split). LabelEncoder assigns ids 0..n_classes-1 in the order of
# `classes_`, so the positions line up with the names.
all_class_ids = list(range(len(label_encoder.classes_)))
report = classification_report(
    y_test,
    y_pred,
    labels=all_class_ids,
    target_names=label_encoder.classes_,
)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

Accuracy: 0.973
Classification Report:
                 precision    recall  f1-score   support

   Acid Reflux       1.00      1.00      1.00        97
   Alzheimer's       1.00      1.00      1.00        87
        Anemia       0.91      0.96      0.93        90
     Arthritis       1.00      1.00      1.00       105
        Asthma       0.94      0.82      0.88       108
    Bronchitis       0.82      0.94      0.88        94
      COVID-19       0.89      1.00      0.94        77
    Depression       1.00      1.00      1.00        96
      Diabetes       1.00      1.00      1.00       104
      Epilepsy       1.00      1.00      1.00        92
           Flu       1.00      0.90      0.95        98
 Heart Disease       0.94      0.91      0.92        98
  Hypertension       0.96      0.94      0.95       101
Hypothyroidism       1.00      1.00      1.00        98
 Kidney Stones       1.00      1.00      1.00       110
 Liver Disease       1.00      1.00      1.00       106
      M

In [33]:
# Persist everything needed at inference time.
# The directory that is created must be the one the files are written to
# ('../models'); exist_ok=True makes the cell safe to re-run.
model_dir = '../models'
os.makedirs(model_dir, exist_ok=True)

artifacts = {
    'symptom_disease_model.pkl': model,        # fitted RandomForestClassifier
    'label_encoder.pkl': label_encoder,        # disease name <-> class id mapping
    'drugs.pkl': drugs,                        # 'Drug Name' column of drug_data
    'symptom_columns.pkl': symptom_columns,    # feature column order used in training
}
for filename, artifact in artifacts.items():
    with open(os.path.join(model_dir, filename), 'wb') as f:
        pickle.dump(artifact, f)

In [34]:
# Sample Input and Confidence Score
sample_input_symptoms = ['persistent_cough', 'weight_loss', 'night_sweats', 'fatigue'] #Example symptoms
sample_input = pd.DataFrame([dict(zip(symptom_columns, [1 if symptom in sample_input_symptoms else 0 for symptom in symptom_columns]))])

# Load the model and label encoder
with open('../models/symptom_disease_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
with open('../models/label_encoder.pkl', 'rb') as f:
    loaded_label_encoder = pickle.load(f)

In [35]:
# Predict and get probabilities
predicted_encoded = loaded_model.predict(sample_input)
predicted_disease = loaded_label_encoder.inverse_transform(predicted_encoded)[0]
probabilities = loaded_model.predict_proba(sample_input)[0]

# Get the confidence score for the predicted disease
confidence_score = probabilities[predicted_encoded[0]] * 100

print(f"\nSample Input Symptoms: {sample_input_symptoms}")
print(f"Predicted Disease: {predicted_disease}")
print(f"Confidence Score: {confidence_score:.2f}%")


Sample Input Symptoms: ['persistent_cough', 'weight_loss', 'night_sweats', 'fatigue']
Predicted Disease: Arthritis
Confidence Score: 19.00%
