<a href="https://colab.research.google.com/github/poovarasansivakumar2003/Intellimanthan_Project_2/blob/main/Disease_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import joblib
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('dataset.csv')

In [None]:
X = df.drop(['prognosis', 'risk_level'], axis=1)
y_disease = df['prognosis']
y_risk = df['risk_level']

In [None]:
def augment_data(X, y_disease, y_risk, noise_level=0.05, n_samples=500):

    indices = np.random.choice(len(X), size=n_samples, replace=True)
    X_subset = X.iloc[indices].copy()
    y_disease_subset = y_disease.iloc[indices].copy()
    y_risk_subset = y_risk.iloc[indices].copy()

    for col in X_subset.columns:
        mask = np.random.random(len(X_subset)) < noise_level
        X_subset.loc[mask, col] = 1 - X_subset.loc[mask, col]

    X_augmented = pd.concat([X, X_subset])
    y_disease_augmented = pd.concat([y_disease, y_disease_subset])
    y_risk_augmented = pd.concat([y_risk, y_risk_subset])

    return X_augmented, y_disease_augmented, y_risk_augmented

X_aug, y_disease_aug, y_risk_aug = augment_data(X, y_disease, y_risk, noise_level=0.1, n_samples=int(len(X)*0.3))
print(f"Original data size: {len(X)}, Augmented data size: {len(X_aug)}")

Original data size: 4920, Augmented data size: 6396


In [None]:
X_train, X_test, y_disease_train, y_disease_test, y_risk_train, y_risk_test = train_test_split(
    X_aug, y_disease_aug, y_risk_aug,
    test_size=0.3,
    random_state=42,
    stratify=y_disease_aug
)

print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

Training set size: 4477, Test set size: 1919


In [None]:
rf_base = RandomForestClassifier(n_estimators=50, random_state=42)
feature_selector_disease = RFECV(
    estimator=rf_base,
    step=5,
    cv=StratifiedKFold(5, shuffle=True, random_state=42),
    scoring='accuracy',
    min_features_to_select=20
)

feature_selector_disease.fit(X_train, y_disease_train)
X_train_disease_selected = feature_selector_disease.transform(X_train)
X_test_disease_selected = feature_selector_disease.transform(X_test)

feature_selector_risk = RFECV(
    estimator=rf_base,
    step=5,
    cv=StratifiedKFold(5, shuffle=True, random_state=42),
    scoring='accuracy',
    min_features_to_select=20
)

feature_selector_risk.fit(X_train, y_risk_train)
X_train_risk_selected = feature_selector_risk.transform(X_train)
X_test_risk_selected = feature_selector_risk.transform(X_test)

selected_features_disease = X.columns[feature_selector_disease.support_]
selected_features_risk = X.columns[feature_selector_risk.support_]

print(f"Selected {len(selected_features_disease)} features for disease prediction")
print(f"Selected {len(selected_features_risk)} features for risk prediction")

Selected 122 features for disease prediction
Selected 112 features for risk prediction


In [None]:
# Disease prediction models
models_disease = {
    'logistic_regression': LogisticRegression(max_iter=1000, C=0.1, solver='saga', random_state=42),
    'decision_tree': DecisionTreeClassifier(max_depth=10, min_samples_split=5, random_state=42),
    'random_forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        bootstrap=True,
        oob_score=True,
        random_state=42
    ),
    'svm': SVC(kernel='rbf', C=1.0, probability=True, random_state=42)
}

# Risk prediction models
models_risk = {
    'logistic_regression': LogisticRegression(max_iter=1000, C=0.1, solver='saga', random_state=42),
    'decision_tree': DecisionTreeClassifier(max_depth=8, min_samples_split=5, random_state=42),
    'random_forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        bootstrap=True,
        oob_score=True,
        class_weight='balanced',
        random_state=42
    ),
    'svm': SVC(kernel='rbf', C=1.0, probability=True, random_state=42)
}

disease_trained_models = {}
disease_cv_scores = {}

for name, model in models_disease.items():
    print(f"Training {name} for disease prediction...")
    cv_scores = cross_val_score(model, X_train_disease_selected, y_disease_train, cv=5, scoring='accuracy')
    disease_cv_scores[name] = cv_scores

    model.fit(X_train_disease_selected, y_disease_train)
    disease_trained_models[name] = model

    y_pred = model.predict(X_test_disease_selected)
    accuracy = accuracy_score(y_disease_test, y_pred)
    print(f"{name} - CV Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f}), Test Accuracy: {accuracy:.4f}")

risk_trained_models = {}
risk_cv_scores = {}

for name, model in models_risk.items():
    print(f"Training {name} for risk prediction...")
    cv_scores = cross_val_score(model, X_train_risk_selected, y_risk_train, cv=5, scoring='accuracy')
    risk_cv_scores[name] = cv_scores

    model.fit(X_train_risk_selected, y_risk_train)
    risk_trained_models[name] = model

    y_pred = model.predict(X_test_risk_selected)
    accuracy = accuracy_score(y_risk_test, y_pred)
    print(f"{name} - CV Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f}), Test Accuracy: {accuracy:.4f}")


Training logistic_regression for disease prediction...
logistic_regression - CV Accuracy: 0.9895 (±0.0015), Test Accuracy: 0.9896
Training decision_tree for disease prediction...
decision_tree - CV Accuracy: 0.7087 (±0.0300), Test Accuracy: 0.7217
Training random_forest for disease prediction...
random_forest - CV Accuracy: 0.9835 (±0.0013), Test Accuracy: 0.9786
Training svm for disease prediction...
svm - CV Accuracy: 0.9777 (±0.0030), Test Accuracy: 0.9760
Training logistic_regression for risk prediction...
logistic_regression - CV Accuracy: 0.9623 (±0.0037), Test Accuracy: 0.9594
Training decision_tree for risk prediction...
decision_tree - CV Accuracy: 0.8863 (±0.0163), Test Accuracy: 0.8801
Training random_forest for risk prediction...
random_forest - CV Accuracy: 0.9681 (±0.0037), Test Accuracy: 0.9583
Training svm for risk prediction...
svm - CV Accuracy: 0.9750 (±0.0038), Test Accuracy: 0.9745


In [None]:
def predict_ensemble(models, feature_selector, X_input):

    X_selected = feature_selector.transform(X_input)
    predictions = {}
    probabilities = {}

    for name, model in models.items():
        if hasattr(model, 'predict_proba'):
            probs = model.predict_proba(X_selected)
            pred = model.predict(X_selected)
        else:
            pred = model.predict(X_selected)
            probs = None

        predictions[name] = pred
        probabilities[name] = probs

    predictions_df = pd.DataFrame(predictions)

    final_predictions = predictions_df.mode(axis=1)[0]

    return final_predictions, probabilities

In [None]:
# EVALUATE ENSEMBLE MODELS

disease_ensemble_preds, _ = predict_ensemble(disease_trained_models, feature_selector_disease, X_test)
disease_ensemble_accuracy = accuracy_score(y_disease_test, disease_ensemble_preds)
print(f"Disease Ensemble Accuracy: {disease_ensemble_accuracy:.4f}")
print("Disease Ensemble Classification Report:")
print(classification_report(y_disease_test, disease_ensemble_preds))

risk_ensemble_preds, _ = predict_ensemble(risk_trained_models, feature_selector_risk, X_test)
risk_ensemble_accuracy = accuracy_score(y_risk_test, risk_ensemble_preds)
print(f"Risk Ensemble Accuracy: {risk_ensemble_accuracy:.4f}")
print("Risk Ensemble Classification Report:")
print(classification_report(y_risk_test, risk_ensemble_preds))
print("Risk Ensemble Confusion Matrix:")
print(confusion_matrix(y_risk_test, risk_ensemble_preds))

Disease Ensemble Accuracy: 0.9859
Disease Ensemble Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       0.94      1.00      0.97        46
                                   AIDS       0.96      0.98      0.97        49
                                   Acne       0.96      0.98      0.97        48
                    Alcoholic hepatitis       0.98      1.00      0.99        47
                                Allergy       0.98      0.98      0.98        47
                              Arthritis       1.00      0.96      0.98        46
                       Bronchial Asthma       0.98      0.98      0.98        45
                   Cervical spondylosis       0.96      1.00      0.98        49
                            Chicken pox       1.00      1.00      1.00        48
                    Chronic cholestasis       0.96      0.98      0.97        48
                            Common

In [None]:
def predict_and_recommend(symptoms, X, disease_models, risk_models, feature_selector_disease, feature_selector_risk, med_test_df):
    input_df = pd.DataFrame(0, index=[0], columns=X.columns)
    for symptom in symptoms:
        if symptom in input_df.columns:
            input_df[symptom] = 1

    predicted_disease, _ = predict_ensemble(disease_models, feature_selector_disease, input_df)
    predicted_risk, _ = predict_ensemble(risk_models, feature_selector_risk, input_df)

    recommendations = get_recommendations(predicted_disease.iloc[0], predicted_risk.iloc[0])

    result = {
        'predicted_disease': predicted_disease.iloc[0],
        'predicted_risk_level': int(predicted_risk.iloc[0]),
        'recommendations': recommendations
    }

    return result

joblib.dump(disease_trained_models, 'disease_models.pkl')
joblib.dump(risk_trained_models, 'risk_models.pkl')
joblib.dump(feature_selector_disease, 'disease_feature_selector.pkl')
joblib.dump(feature_selector_risk, 'risk_feature_selector.pkl')


['risk_feature_selector.pkl']

In [None]:
# FINAL PREDICTION AND RECOMMENDATION SYSTEM
med_test_df = pd.read_csv('medications and test recommended.csv')

def get_recommendations(disease, risk_level):
    try:
        disease = disease.strip().lower()
        med_test_df['prognosis'] = med_test_df['prognosis'].str.strip().str.lower()

        match = med_test_df[med_test_df['prognosis'] == disease]
        if match.empty:
            return {"error": f"No data found for disease '{disease}'"}

        disease_info = match.iloc[0]

        medications = eval(disease_info['Medication']) if isinstance(disease_info['Medication'], str) else disease_info['Medication']
        tests = eval(disease_info['Recommended Tests']) if isinstance(disease_info['Recommended Tests'], str) else disease_info['Recommended Tests']

        recommendations = {}

        if risk_level == 0:
            recommendations['medications'] = medications
            recommendations['tests'] = []
            recommendations['message'] = "Negligible severity. Only medication required."

        elif risk_level == 1:
            recommendations['medications'] = medications
            recommendations['tests'] = tests
            recommendations['message'] = "Medium severity. Both medication and tests recommended."

        elif risk_level == 2:
            recommendations['medications'] = []
            recommendations['tests'] = tests
            recommendations['message'] = "HIGH SEVERITY! No medication provided. Tests recommended and DOCTOR CONSULTATION IS REQUIRED."

        return recommendations

    except Exception as e:
        return {"error": f"Could not get recommendations: {str(e)}"}


In [None]:
example_symptoms = ['itching', 'skin_rash', 'nodal_skin_eruptions']
prediction = predict_and_recommend(
    example_symptoms, X,
    disease_trained_models, risk_trained_models,
    feature_selector_disease, feature_selector_risk,
    med_test_df
)

print(f"Symptoms: {example_symptoms}")
print(f"Predicted Disease: {prediction['predicted_disease']}")
print(f"Predicted Risk Level: {prediction['predicted_risk_level']}")
recommendations = prediction.get('recommendations', {})

if 'message' in recommendations:
    print(f"Message: {recommendations['message']}")

if 'error' in recommendations:
    print(f"Error: {recommendations['error']}")

medications = recommendations.get('medications', [])
if medications:
    print(f"Recommended Medications: {medications}")

tests = recommendations.get('tests', [])
if tests:
    print(f"Recommended Tests: {tests}")


Symptoms: ['itching', 'skin_rash', 'nodal_skin_eruptions']
Predicted Disease: Fungal infection
Predicted Risk Level: 0
Message: Negligible severity. Only medication required.
Recommended Medications: ['Antifungal Cream', 'Fluconazole', 'Terbinafine', 'Clotrimazole', 'Ketoconazole']
