In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import warnings
# Suppress every warning for the rest of the session. No category or module
# filter is given, so this also hides deprecation notices from pandas and
# scikit-learn.
warnings.filterwarnings("ignore")

In [None]:
# Read the raw symptom/disease table; the path is relative to the notebook's
# working directory.
df = pd.read_csv(
    filepath_or_buffer="../data/symptom_disease_10k_realistic.csv",
)

In [None]:
# drop_duplicates() returns a new DataFrame and leaves the original untouched,
# so the result has to be assigned back for the de-duplication to reach the
# cells below.
df = df.drop_duplicates()

In [None]:
# Normalise each symptom column: lower-case the text and remove every space so
# that e.g. "Loss of Taste" and "lossoftaste" collapse to the same category.
for symptom_column in ("Symptom1", "Symptom2", "Symptom3", "Symptom4"):
    lowered = df[symptom_column].str.lower()
    df[symptom_column] = lowered.str.replace(" ", "", regex=True)

In [None]:
# Separate the target label from the feature columns.
y = df["Disease"]
X = df.drop(columns="Disease")

In [None]:
def impute_missing_symptoms(X_train, X_test):
    """Replace missing symptom values with the placeholder category "unknown".

    A single constant-fill imputer is fitted on the training split and then
    applied to the test split. Only the four symptom columns are modified;
    any other columns, the column order and the row index of each input are
    preserved, so the returned frames stay aligned with their targets.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. Must contain Symptom1 through Symptom4.
    X_test : pandas.DataFrame
        Test features. Must contain Symptom1 through Symptom4.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Copies of ``X_train`` and ``X_test`` with missing symptom values
        filled. The inputs themselves are not modified.

    Raises
    ------
    ValueError
        If any of the four symptom columns is absent from either frame.
    """
    symptom_columns = ["Symptom1", "Symptom2", "Symptom3", "Symptom4"]

    for col in symptom_columns:
        if col not in X_train.columns:
            raise ValueError(f"Column '{col}' not found in X_train.")
        if col not in X_test.columns:
            raise ValueError(f"Column '{col}' not found in X_test.")

    # Every symptom column uses the same strategy, so one imputer is enough.
    imputer = SimpleImputer(strategy="constant", fill_value="unknown")

    # Write the imputed values back into copies by column name. Labelling the
    # raw output array with X_train.columns would mislabel columns whenever
    # their order differs from symptom_columns and would fail outright if the
    # frame carried extra columns.
    filled_X_train = X_train.copy()
    filled_X_test = X_test.copy()
    filled_X_train[symptom_columns] = imputer.fit_transform(X_train[symptom_columns])
    filled_X_test[symptom_columns] = imputer.transform(X_test[symptom_columns])

    return filled_X_train, filled_X_test

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fill missing symptom values in both splits.
filled_X_train, filled_X_test = impute_missing_symptoms(
    X_train=X_train,
    X_test=X_test,
)

In [None]:
# One-hot encode the four symptom columns. handle_unknown='ignore' makes a
# category that was not seen during fit encode as all zeros instead of raising.
categorical_features = ["Symptom1", "Symptom2", "Symptom3", "Symptom4"]
one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

# Encode the imputed frames. Fitting on the raw X_train / X_test would bypass
# the imputation step entirely and leave missing values to be encoded as-is.
transformed_X_train = transformer.fit_transform(filled_X_train)
transformed_X_test = transformer.transform(filled_X_test)

# Wrap the encoded arrays in DataFrames labelled with the generated feature names.
feature_names = transformer.get_feature_names_out()
transformed_X_train = pd.DataFrame(transformed_X_train, columns=feature_names)
transformed_X_test = pd.DataFrame(transformed_X_test, columns=feature_names)

In [None]:
# Seed the forest so that bootstrap sampling and feature selection, and
# therefore the reported metrics, are reproducible on Restart & Run All.
# The seed matches the one used for the train/test split.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(transformed_X_train, y_train)

In [None]:
# Score the held-out split: per-class precision/recall/F1 plus overall accuracy.
y_pred = clf.predict(transformed_X_test)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Classification Report:")
print(report)
print("\nScore Report:")
print(f"Accuracy: {accuracy:.4f}")

Classification Report:
                precision    recall  f1-score   support

   Acid Reflux       1.00      1.00      1.00        97
   Alzheimer's       1.00      1.00      1.00        87
        Anemia       0.90      0.94      0.92        90
     Arthritis       1.00      1.00      1.00       105
        Asthma       0.85      0.91      0.88       108
    Bronchitis       0.89      0.82      0.85        94
      COVID-19       0.90      0.95      0.92        77
    Depression       1.00      1.00      1.00        96
      Diabetes       1.00      1.00      1.00       104
      Epilepsy       1.00      1.00      1.00        92
           Flu       0.96      0.92      0.94        98
 Heart Disease       0.94      0.91      0.92        98
  Hypertension       0.95      0.94      0.95       101
Hypothyroidism       1.00      1.00      1.00        98
 Kidney Stones       1.00      1.00      1.00       110
 Liver Disease       1.00      1.00      1.00       106
      Migraine       1.0

In [None]:
# Persist the classifier together with the fitted encoder so that inference
# can apply the exact same feature transformation.
model_data = dict(model=clf, transformer=transformer)
with open('../models/train_model.pkl', 'wb') as file:
    pickle.dump(model_data, file)

In [None]:
# Example patient: symptom strings are already lower-case with spaces removed.
symptom1, symptom2, symptom3, symptom4 = 'fever', 'cough', 'headache', 'lossoftaste'

# Single-row frame with one column per symptom.
input_data = pd.DataFrame(
    [[symptom1, symptom2, symptom3, symptom4]],
    columns=['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4'],
)

In [None]:
# Reload the persisted bundle.
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# artefacts that come from a trusted source.
with open('../models/train_model.pkl', 'rb') as file:
    model_data = pickle.load(file)

clf, transformer = model_data['model'], model_data['transformer']

In [None]:
# Encode the single-row input with the loaded transformer.
try:
    transformed_input = transformer.transform(input_data)
    transformed_input_df = pd.DataFrame(transformed_input, columns=transformer.get_feature_names_out())
except ValueError as e:
    print(f"Error transforming input: {e}")
    print("Please ensure input symptom values match those used during training.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down and discards all state, whereas re-raising stops execution at this
    # cell and keeps the traceback.
    raise

In [None]:
# Predict the disease and look up the probability the forest assigns to it.
try:
    prediction = clf.predict(transformed_input_df)
    predicted_disease = prediction[0]

    # predict_proba columns follow the order of clf.classes_, so the confidence
    # is the entry at the predicted label's position.
    probabilities = clf.predict_proba(transformed_input_df)
    confidence = probabilities[0][clf.classes_.tolist().index(predicted_disease)]

except Exception as e:
    print(f"Error making prediction: {e}")
    print("Please check the model and input data.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, and swallowing the error would leave predicted_disease and
    # confidence undefined for the next cell.
    raise

In [None]:
# Report the input symptoms, the predicted label and its probability.
summary_lines = [
    "Symptom Summary:",
    f"- Symptom 1: {symptom1}",
    f"- Symptom 2: {symptom2}",
    f"- Symptom 3: {symptom3}",
    f"- Symptom 4: {symptom4}",
    f"\nPredicted Disease: {predicted_disease}",
    f"Prediction Confidence: {confidence:.4f}",
]
print(*summary_lines, sep="\n")

# Always close with the medical disclaimer.
disclaimer_lines = [
    "\nDisclaimer:",
    "This prediction is based on limited symptom data and should not be used for medical diagnosis.",
    "Consult a healthcare professional for accurate diagnosis and treatment.",
]
print(*disclaimer_lines, sep="\n")

Symptom Summary:
- Symptom 1: fever
- Symptom 2: cough
- Symptom 3: headache
- Symptom 4: lossoftaste

Predicted Disease: COVID-19
Prediction Confidence: 0.8218

Disclaimer:
This prediction is based on limited symptom data and should not be used for medical diagnosis.
Consult a healthcare professional for accurate diagnosis and treatment.
