In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import pickle
import os
dataset = pd.read_csv('datasets/Training.csv')
print(f"Dataset shape: {dataset.shape}")

np.random.seed(42)
dataset['Age'] = np.random.randint(1, 80, size=len(dataset))
dataset['Gender'] = np.random.choice(['Male', 'Female'], size=len(dataset))
dataset['Region'] = np.random.choice(['Urban', 'Rural'], size=len(dataset))

plt.figure(figsize=(15, 8))
disease_counts = dataset['prognosis'].value_counts().sort_values(ascending=True)
disease_counts.plot(kind='barh', color='skyblue')
plt.title('Disease Frequency Distribution')
plt.xlabel('Count')
plt.ylabel('Disease')
plt.tight_layout()
plt.savefig('visualizations/disease_distribution.png')
plt.close()

plt.figure(figsize=(20, 15))
symptoms_to_plot = dataset.iloc[:, :30].corr()
sns.heatmap(symptoms_to_plot, cmap='coolwarm', center=0)
plt.title('Symptom Correlation Heatmap (Top 30 Symptoms)')
plt.tight_layout()
plt.savefig('visualizations/symptom_correlation.png')
plt.close()

symptom_columns = [col for col in dataset.columns if col not in ['prognosis', 'Age', 'Gender', 'Region']]
dataset[symptom_columns] = dataset[symptom_columns].apply(pd.to_numeric, errors='coerce')

plt.figure(figsize=(12, 8))
avg_symptoms = dataset.groupby('prognosis')[symptom_columns].sum().mean(axis=1).sort_values()
avg_symptoms.plot(kind='barh', color='lightgreen')
plt.title('Average Number of Symptoms per Disease')
plt.xlabel('Average Symptom Count')
plt.ylabel('Disease')
plt.tight_layout()
plt.savefig(os.path.join('visualizations', 'avg_symptoms.png'))
plt.close()

fig = px.box(dataset, x='prognosis', y='Age', 
             title='Age Distribution by Disease',
             color='Gender')
fig.write_html('visualizations/age_distribution.html')

X = dataset.drop(['prognosis', 'Age', 'Gender', 'Region'], axis=1)
y = dataset['prognosis']

le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

models = {
    'SVC': SVC(kernel='linear', probability=True),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'K-Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Multinomial NB': MultinomialNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'XGBoost': XGBClassifier(random_state=42)
}

results = []
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    pickle.dump(model, open(f'models/{name.lower().replace(" ", "_")}.pkl', 'wb'))
    
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=le.classes_, output_dict=True)
    
    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-Score': report['weighted avg']['f1-score']
    })
    
    print(f"{name} Accuracy: {accuracy:.2f}")
    print(classification_report(y_test, y_pred, target_names=le.classes_))

results_df = pd.DataFrame(results)
results_df.to_csv('model_results.csv', index=False)

plt.figure(figsize=(12, 6))
sns.barplot(x='Model', y='Accuracy', data=results_df, palette='viridis')
plt.xticks(rotation=45)
plt.title('Model Accuracy Comparison')
plt.ylim(0.9, 1.05)
plt.tight_layout()
plt.savefig('visualizations/model_comparison.png')
plt.close()

sym_des = pd.read_csv("datasets/symtoms_df.csv")
precautions = pd.read_csv("datasets/precautions_df.csv")
workout = pd.read_csv("datasets/workout_df.csv")
description = pd.read_csv("datasets/description.csv")
medications = pd.read_csv('datasets/medications.csv')
diets = pd.read_csv("datasets/diets.csv")
symptom_severity = pd.read_csv("datasets/Symptom-severity.csv")
severity_dict = dict(zip(symptom_severity['Symptom'], symptom_severity['weight']))

severity_map = {
    'Fungal infection': 'Mild',
    'Allergy': 'Mild',
    'GERD': 'Moderate',
    'Chronic cholestasis': 'Severe',
}

def enhanced_helper(dis):
    desc = description[description['Disease'] == dis]['Description'].values[0]
    severity = severity_map.get(dis, 'Moderate')
    
    pre = precautions[precautions['Disease'] == dis][['Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4']].values[0]
    
    meds = medications[medications['Disease'] == dis]['Medication'].values
    med_list = [f"{med} (consult doctor for dosage)" for med in meds]
    
    diet_rec = diets[diets['Disease'] == dis]['Diet'].values
    
    workouts = workout[workout['disease'] == dis]['workout'].values
    
    if severity == 'Severe':
        doc_rec = "Immediate consultation with a specialist required"
    else:
        doc_rec = "Consult physician if symptoms persist for more than 3 days"
    
    return {
        'description': desc,
        'severity': severity,
        'precautions': pre,
        'medications': med_list,
        'diets': diet_rec,
        'workouts': workouts,
        'doctor_recommendation': doc_rec,
        'follow_up': '7 days' if severity == 'Mild' else '3 days'
    }

symptoms_dict = {symptom: idx for idx, symptom in enumerate(X.columns)}
diseases_list = {idx: disease for idx, disease in enumerate(le.classes_)}

def predict_with_context(patient_symptoms, patient_info=None):
    input_vector = np.zeros(len(symptoms_dict))
    severity_score = 0
    severe_symptoms = []

    for symptom in patient_symptoms:
        if symptom in symptoms_dict:
            input_vector[symptoms_dict[symptom]] = 1
            if symptom in severity_dict:
                severity_score += severity_dict[symptom]
                if severity_dict[symptom] >= 5: 
                    severe_symptoms.append(symptom)
    
    predictions = {}
    for name, model in models.items():
        model = pickle.load(open(f'models/{name.lower().replace(" ", "_")}.pkl', 'rb'))
        pred_idx = model.predict([input_vector])[0]
        predictions[name] = diseases_list[pred_idx]
    
    consensus = max(set(predictions.values()), key=list(predictions.values()).count)
    recommendations = enhanced_helper(consensus)
    
    recommendations.update({
        'severity_score': severity_score,
        'severe_symptoms': severe_symptoms,
        'urgency_level': 'High' if severity_score >= 15 else 'Medium' if severity_score >= 8 else 'Low'
    })
    
    if patient_info:
        recommendations.update({
            'patient_name': patient_info.get('name', ''),
            'patient_age': patient_info.get('age', ''),
            'patient_gender': patient_info.get('gender', '')
        })
    
    return {
        'consensus_prediction': consensus,
        'all_predictions': predictions,
        'recommendations': recommendations,
        'symptoms_entered': patient_symptoms
    }
test_symptoms = ['itching', 'skin_rash', 'nodal_skin_eruptions']
patient_info = {'name': 'John Doe', 'age': 35, 'gender': 'Male'}
result = predict_with_context(test_symptoms, patient_info)
print("\n Prediction Result:")
print(result)

Dataset shape: (4920, 133)

Training SVC...
SVC Accuracy: 1.00
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        32
                                   AIDS       1.00      1.00      1.00        39
                                   Acne       1.00      1.00      1.00        41
                    Alcoholic hepatitis       1.00      1.00      1.00        36
                                Allergy       1.00      1.00      1.00        35
                              Arthritis       1.00      1.00      1.00        36
                       Bronchial Asthma       1.00      1.00      1.00        44
                   Cervical spondylosis       1.00      1.00      1.00        32
                            Chicken pox       1.00      1.00      1.00        35
                    Chronic cholestasis       1.00      1.00      1.00        30
                            Common Cold      




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.



X does not have valid feature names, but SVC was fitted with feature names


X does not have valid feature names, but RandomForestClassifier was fitted with feature names


X does not have valid feature names, but GradientBoostingClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


X does not have valid feature names, but MultinomialNB was fitted with feature names


X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names


X does not have valid feature names, but LogisticRegression was fitted with feature names




 Prediction Result:
{'consensus_prediction': 'Fungal infection', 'all_predictions': {'SVC': 'Fungal infection', 'Random Forest': 'Fungal infection', 'Gradient Boosting': 'Fungal infection', 'K-Neighbors': 'Fungal infection', 'Multinomial NB': 'Fungal infection', 'Decision Tree': 'Fungal infection', 'Logistic Regression': 'Fungal infection', 'XGBoost': 'Fungal infection'}, 'recommendations': {'description': 'Fungal infection is a common skin condition caused by fungi.', 'severity': 'Mild', 'precautions': array(['bath twice', 'use detol or neem in bathing water',
       'keep infected area dry', 'use clean cloths'], dtype=object), 'medications': ["['Antifungal Cream', 'Fluconazole', 'Terbinafine', 'Clotrimazole', 'Ketoconazole'] (consult doctor for dosage)"], 'diets': array(["['Antifungal Diet', 'Probiotics', 'Garlic', 'Coconut oil', 'Turmeric']"],
      dtype=object), 'workouts': array(['Avoid sugary foods', 'Consume probiotics',
       'Increase intake of garlic', 'Include yogurt in d