In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The dataset path used by the __main__ cell points below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
from IPython import get_ipython
from IPython.display import display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, learning_curve, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, auc, precision_recall_curve
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import SelectKBest, f_classif, RFE, mutual_info_classif
from sklearn.decomposition import PCA
import time
import warnings
warnings.filterwarnings('ignore')

# Apply the 'seaborn-v0_8-whitegrid' matplotlib style to every figure created below.
plt.style.use('seaborn-v0_8-whitegrid')
# Shared hex colour list; the learning-curve plots index into it directly.
# NOTE(review): '#8c564b' and '#e377c2' each appear twice, so two pairs of
# palette slots render identically - confirm whether the repeats are intended.
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf', '#ff9896', '#98df8a']
# Register the list above as seaborn's active palette.
sns.set_palette(sns.color_palette(colors))

In [4]:

def load_and_preprocess_data(file_path, target_col='prognosis'):
    """
    Load the disease dataset and split it into features and target.

    Besides loading, this prints the total number of missing values and the
    class distribution, saves a count plot of the target to
    'class_distribution.png', and prints the value counts of the first five
    feature columns.

    Args:
        file_path: Location of the CSV file (anything ``pd.read_csv`` accepts).
        target_col: Name of the label column. Defaults to 'prognosis'.

    Returns:
        Tuple ``(X, y, df)``: the feature frame (all columns except the
        target), the target series, and the full frame as loaded.

    Raises:
        KeyError: If ``target_col`` is not a column of the loaded file.
    """
    df = pd.read_csv(file_path)  # Load the actual dataset

    # Fail early with an explicit message instead of a bare KeyError later on.
    if target_col not in df.columns:
        raise KeyError(f"Target column '{target_col}' not found in {file_path}")

    # Check for missing values
    missing_values = df.isnull().sum()
    print(f"Missing values: {missing_values.sum()}")

    # Check for class imbalance
    class_distribution = df[target_col].value_counts()
    print("\nClass distribution:")
    print(class_distribution)

    # Plot class distribution; the figure is written to disk and closed so it
    # is not rendered inline.
    fig = plt.figure(figsize=(14, 6))
    sns.countplot(x=target_col, data=df)
    plt.title('Disease Class Distribution', fontsize=16)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('class_distribution.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

    # Separate features and target
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    # Print feature characteristics for understanding binary nature
    print("\nFeature value distribution (top 5 features):")
    for col in X.columns[:5]:
        print(f"{col}: {X[col].value_counts().to_dict()}")

    return X, y, df


In [5]:

def feature_selection_with_mutual_info(X, y, top_n=30, random_state=42, discrete_features='auto'):
    """
    Select the top N features ranked by mutual information with the target.

    Prints the ranked features with their scores and saves a horizontal bar
    chart of the scores to 'feature_importance.png'.

    Args:
        X: Feature DataFrame; its column names label the scores.
        y: Target labels aligned with ``X``.
        top_n: Maximum number of features to keep.
        random_state: Seed forwarded to ``mutual_info_classif`` so that the
            ranking is reproducible between runs.
        discrete_features: Forwarded to ``mutual_info_classif``; the default
            'auto' keeps the library's default handling.

    Returns:
        Tuple ``(X_selected, top_features)``: ``X`` restricted to the selected
        columns, and the list of selected column names in descending score
        order.
    """
    mutual_info = mutual_info_classif(
        X, y,
        discrete_features=discrete_features,
        random_state=random_state,
    )
    feature_scores = pd.Series(mutual_info, index=X.columns)

    # Rank once and reuse the result for the listing, the plot and the return value.
    top_scores = feature_scores.nlargest(top_n)
    top_features = top_scores.index.tolist()
    # nlargest returns fewer rows when X has fewer than top_n columns, so
    # report the number actually selected.
    n_selected = len(top_features)

    print(f"Top {n_selected} selected features based on Mutual Information:")
    for i, feature in enumerate(top_features, 1):
        print(f"{i}. {feature} (Score: {top_scores[feature]:.4f})")

    # Plot feature importance (ascending order so the best feature is on top).
    plt.figure(figsize=(12, 8))
    top_scores.sort_values().plot(kind='barh')
    plt.title(f'Top {n_selected} Features by Mutual Information', fontsize=16)
    plt.xlabel('Mutual Information Score', fontsize=12)
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    plt.close()

    return X[top_features], top_features

In [6]:
def handle_class_imbalance(X, y, method='combined'):
    """
    Rebalance the class distribution with the chosen resampling strategy.

    Supported methods:
        'none'        - return the inputs untouched.
        'smote'       - SMOTE oversampling (k_neighbors=5).
        'adasyn'      - ADASYN oversampling (n_neighbors=5).
        'undersample' - random undersampling.
        'combined'    - when the largest class is more than 10x the smallest,
                        first undersample the classes larger than half the
                        maximum, then apply SMOTE.

    For every method except 'none' the before/after shapes are printed and a
    count plot of the resampled target is saved to
    'resampled_class_distribution.png'.

    Args:
        X: Feature matrix (must expose ``.shape`` unless method is 'none').
        y: Target labels aligned with ``X``.
        method: One of the strategies listed above.

    Returns:
        Tuple ``(X_resampled, y_resampled)``.

    Raises:
        ValueError: If ``method`` is unknown, or if 'combined' is requested
            while some class has fewer than two samples (SMOTE needs at least
            one neighbour per sample).
    """
    valid_methods = ('none', 'smote', 'adasyn', 'undersample', 'combined')
    # Validate first: previously an unknown method fell through every branch
    # and crashed on the undefined sampler.
    if method not in valid_methods:
        raise ValueError(
            f"Unknown resampling method '{method}'; expected one of {valid_methods}"
        )

    if method == 'none':
        return X, y

    # Remember the incoming shape; 'combined' may rebind X when undersampling.
    original_shape = X.shape

    if method == 'smote':
        print("Applying SMOTE to handle class imbalance...")
        sampler = SMOTE(random_state=42, k_neighbors=5)
    elif method == 'adasyn':
        print("Applying ADASYN to handle class imbalance...")
        sampler = ADASYN(random_state=42, n_neighbors=5)
    elif method == 'undersample':
        print("Applying Random Undersampling to handle class imbalance...")
        sampler = RandomUnderSampler(random_state=42)
    else:  # method == 'combined'
        print("Applying combined sampling approach...")

        class_counts = pd.Series(y).value_counts()
        max_count = class_counts.max()
        min_count = class_counts.min()

        if min_count < 2:
            raise ValueError(
                "Combined resampling needs at least 2 samples in every class "
                f"for SMOTE, but the smallest class has {min_count}"
            )

        # First apply undersampling to extreme majority classes
        if max_count / min_count > 10:
            print("Detected extreme imbalance. First applying undersampling...")
            under_sampler = RandomUnderSampler(
                sampling_strategy={
                    cls: min(count, max_count // 5)
                    for cls, count in class_counts.items()
                    if count > max_count // 2
                },
                random_state=42
            )
            X, y = under_sampler.fit_resample(X, y)

        print("Applying SMOTE for final balancing...")
        # Never ask for more neighbours than the smallest class can provide.
        sampler = SMOTE(random_state=42, k_neighbors=int(min(5, min_count - 1)))

    X_resampled, y_resampled = sampler.fit_resample(X, y)
    print(f"Shape before resampling: {original_shape}")
    print(f"Shape after resampling: {X_resampled.shape}")

    # Plot the resampled class distribution
    plt.figure(figsize=(14, 6))
    sns.countplot(x=y_resampled)
    plt.title('Resampled Class Distribution', fontsize=16)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('resampled_class_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()

    return X_resampled, y_resampled

In [7]:
def create_model_list():
    """
    Assemble the candidate classifiers as (display name, unfitted estimator) pairs.

    Every estimator that takes a seed is seeded with 42. The final entry is a
    soft-voting ensemble built from a random forest, an XGBoost model and a
    LightGBM model.

    Returns:
        list of ``(name, estimator)`` tuples, in the order they are evaluated.
    """
    seed = 42

    # Members of the soft-voting ensemble (lighter settings than the standalone models).
    voting_members = [
        ('rf', RandomForestClassifier(n_estimators=100, random_state=seed)),
        ('xgb', XGBClassifier(
            n_estimators=100,
            use_label_encoder=False,
            eval_metric='mlogloss',
            random_state=seed,
        )),
        ('lgbm', LGBMClassifier(n_estimators=100, random_state=seed, verbose=-1)),
    ]

    candidates = [
        ('Logistic Regression', LogisticRegression(
            C=1.0,
            solver='saga',
            penalty='l2',
            max_iter=2000,
            class_weight='balanced',
            random_state=seed,
        )),
        ('Random Forest', RandomForestClassifier(
            n_estimators=200,
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=2,
            max_features='sqrt',
            bootstrap=True,
            class_weight='balanced',
            oob_score=True,
            random_state=seed,
        )),
        ('XGBoost', XGBClassifier(
            learning_rate=0.05,
            n_estimators=200,
            max_depth=6,
            objective='multi:softprob',
            subsample=0.8,
            colsample_bytree=0.8,
            gamma=0.1,
            min_child_weight=1,
            use_label_encoder=False,
            eval_metric='mlogloss',
            random_state=seed,
        )),
        ('Decision Tree', DecisionTreeClassifier(
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2,
            criterion='entropy',
            class_weight='balanced',
            random_state=seed,
        )),
        ('SVM', SVC(
            C=10.0,
            kernel='rbf',
            gamma='scale',
            decision_function_shape='ovr',
            probability=True,
            class_weight='balanced',
            random_state=seed,
        )),
        ('Gradient Boosting', GradientBoostingClassifier(
            learning_rate=0.05,
            n_estimators=200,
            max_depth=5,
            min_samples_split=10,
            subsample=0.8,
            random_state=seed,
        )),
        # Candidates added with 0/1 indicator features in mind.
        # Naive Bayes variant that models binary-valued features.
        ('Bernoulli NB', BernoulliNB(alpha=1.0)),
        # Neighbours are compared with the Hamming metric.
        ('KNN', KNeighborsClassifier(
            n_neighbors=5,
            weights='distance',
            metric='hamming',
        )),
        ('AdaBoost', AdaBoostClassifier(
            n_estimators=100,
            learning_rate=0.1,
            random_state=seed,
        )),
        ('Extra Trees', ExtraTreesClassifier(
            n_estimators=200,
            max_depth=15,
            min_samples_split=5,
            class_weight='balanced',
            random_state=seed,
        )),
        ('Ridge Classifier', RidgeClassifier(
            alpha=1.0,
            class_weight='balanced',
            random_state=seed,
        )),
        ('MLP Neural Network', MLPClassifier(
            hidden_layer_sizes=(100, 50),
            activation='relu',
            solver='adam',
            alpha=0.0001,
            max_iter=500,
            random_state=seed,
        )),
        ('LightGBM', LGBMClassifier(
            n_estimators=200,
            learning_rate=0.05,
            num_leaves=31,
            max_depth=-1,
            random_state=seed,
            verbose=-1,
        )),
        ('CatBoost', CatBoostClassifier(
            iterations=200,
            learning_rate=0.1,
            depth=6,
            loss_function='MultiClass',
            random_state=seed,
            verbose=0,
        )),
        ('Voting Classifier', VotingClassifier(
            estimators=voting_members,
            voting='soft',
            n_jobs=-1,
        )),
    ]

    return candidates


In [8]:
def train_and_evaluate_top_models(X_train, X_val, y_train, y_val, top_features, top_n=5):
    """
    Fit every candidate model, score it on the validation split and return the best ones.

    The candidates are the models from ``create_model_list()`` plus the
    stacking ensemble from ``create_stacking_model()``. After scoring, learning
    curves are plotted for the selected models.

    Args:
        X_train, y_train: Training features and integer-encoded labels.
        X_val, y_val: Validation features and integer-encoded labels.
        top_features: Names of the selected features. Not used inside this
            function; kept so existing callers keep working.
        top_n: Number of best models (by validation accuracy) to return.

    Returns:
        List of ``(name, result)`` tuples sorted by descending accuracy, at
        most ``top_n`` long. Each ``result`` dict holds 'accuracy',
        'precision', 'recall', 'f1' (weighted averages), 'auc' (macro
        one-vs-rest; 0.0 if it could not be computed), 'train_time',
        'inference_time', 'y_pred', 'y_prob' and the fitted 'model'.
    """
    model_list = create_model_list()
    model_list.append(('Stacking Ensemble', create_stacking_model()))

    results = {}

    # Train and evaluate models
    for name, model in model_list:
        # Training time
        start_time = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_time

        # Inference time (covers both the label and the probability prediction)
        start_time = time.time()
        # Flatten so an estimator returning an (n, 1) column (reportedly
        # CatBoost in multiclass mode - TODO confirm) yields the same 1-D
        # shape as every other model.
        y_pred = np.asarray(model.predict(X_val)).ravel()
        if hasattr(model, 'predict_proba'):
            y_prob = model.predict_proba(X_val)
        else:
            # No probabilities available: fall back to a one-hot encoding of
            # the predicted labels. Columns follow the classes the model was
            # trained on, so a class absent from y_val cannot push an index
            # out of range.
            classes = getattr(model, 'classes_', None)
            if classes is None:
                classes = np.unique(np.concatenate([np.asarray(y_train), np.asarray(y_val)]))
            y_prob = np.zeros((len(y_pred), len(classes)))
            y_prob[np.arange(len(y_pred)), np.searchsorted(classes, y_pred)] = 1

        inference_time = time.time() - start_time

        # Evaluate metrics
        accuracy = accuracy_score(y_val, y_pred)
        precision = precision_score(y_val, y_pred, average='weighted')
        recall = recall_score(y_val, y_pred, average='weighted')
        f1 = f1_score(y_val, y_pred, average='weighted')

        try:
            auc_score = roc_auc_score(y_val, y_prob, multi_class='ovr', average='macro')
        except Exception as e:
            # Best effort: keep evaluating the remaining models, but report the
            # failure so a 0.0 is not mistaken for a genuinely measured AUC.
            print(f"Warning: could not compute AUC for {name}: {e}")
            auc_score = 0.0

        results[name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc': auc_score,
            'train_time': train_time,
            'inference_time': inference_time,
            'y_pred': y_pred,
            'y_prob': y_prob,
            'model': model
        }

    # Sort results by accuracy and select the top N models
    sorted_results = sorted(results.items(), key=lambda x: x[1]['accuracy'], reverse=True)
    top_results = sorted_results[:top_n]

    # Plot the learning curves (Training vs Validation)
    plot_learning_curves_for_top_models(top_results, X_train, y_train)

    return top_results


In [9]:
def plot_learning_curves_for_top_models(top_results, X_train, y_train):
    """
    Save one learning-curve figure per selected model.

    For every ``(name, result)`` pair the model stored under ``result['model']``
    is evaluated with 5-fold cross-validated accuracy at five training-set
    sizes (10% to 100%). Mean scores are drawn as lines with a one-standard-
    deviation band, and the figure is written to 'learning_curve_<name>.png'.

    Args:
        top_results: Iterable of ``(model_name, result_dict)`` pairs.
        X_train: Training features passed to ``learning_curve``.
        y_train: Training labels passed to ``learning_curve``.
    """
    for model_name, model_results in top_results:
        # Cross-validated scores at increasing training-set sizes.
        sizes, fit_scores, cv_scores = learning_curve(
            model_results['model'], X_train, y_train,
            train_sizes=np.linspace(0.1, 1.0, 5), cv=5,
            scoring='accuracy', n_jobs=-1
        )

        plt.figure(figsize=(10, 6))

        # Draw the training curve first, then the validation curve.
        curves = (
            ('Training Score', fit_scores, colors[0]),
            ('Validation Score', cv_scores, colors[1]),
        )
        for curve_label, fold_scores, curve_color in curves:
            center = np.mean(fold_scores, axis=1)
            spread = np.std(fold_scores, axis=1)
            plt.plot(sizes, center, 'o-', label=curve_label, color=curve_color)
            plt.fill_between(sizes, center - spread, center + spread, alpha=0.1, color=curve_color)

        plt.title(f'Learning Curves - {model_name}', fontsize=16)
        plt.xlabel('Training Set Size', fontsize=14)
        plt.ylabel('Accuracy', fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.legend(loc='best', fontsize=12)
        plt.tight_layout()
        plt.savefig(f'learning_curve_{model_name}.png', dpi=300, bbox_inches='tight')
        plt.close()


In [10]:
def run_disease_classification_pipeline(file_path):
    """
    Main pipeline function to run the entire disease classification workflow.

    Steps: load the data, hold out a stratified 15% validation split, select
    the top 30 features by mutual information, rebalance the classes, encode
    the labels, then train and rank the models.

    The validation split is taken BEFORE feature selection and resampling, and
    both of those steps see the training portion only. Selecting features or
    generating synthetic samples on the full dataset would leak validation
    information into training and inflate the reported scores.

    Args:
        file_path: Location of the dataset CSV.

    Returns:
        The list of ``(name, result)`` tuples produced by
        ``train_and_evaluate_top_models`` for the 5 best models.
    """
    print("=== Starting Disease Classification Pipeline ===")

    # Step 1: Load and preprocess data
    print("\n=== Loading and Preprocessing Data ===")
    X, y, _ = load_and_preprocess_data(file_path)

    # Step 2: Hold out the validation set first so it stays untouched by the
    # data-dependent steps below.
    print("\n=== Splitting Data into Train and Validation Sets ===")
    X_train_raw, X_val_raw, y_train_raw, y_val = train_test_split(
        X, y, test_size=0.15, random_state=42, stratify=y
    )

    # Step 3: Feature selection, fitted on the training portion only
    print("\n=== Performing Feature Selection ===")
    X_train_selected, top_features = feature_selection_with_mutual_info(
        X_train_raw, y_train_raw, top_n=30
    )
    # Apply the same column selection (and order) to the validation features.
    X_val = X_val_raw[top_features]

    # Step 4: Handle class imbalance on the training portion only; the
    # validation set keeps its real, un-resampled samples.
    print("\n=== Handling Class Imbalance ===")
    X_train, y_train = handle_class_imbalance(X_train_selected, y_train_raw, method='combined')

    # Encode target labels (fitted on the training labels, reused for validation)
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_val_encoded = label_encoder.transform(y_val)

    # Step 5: Train and evaluate top models
    print("\n=== Training and Evaluating Models ===")
    top_models = train_and_evaluate_top_models(X_train, X_val, y_train_encoded, y_val_encoded, top_features, top_n=5)

    return top_models


In [12]:
def create_stacking_model():
    """
    Build the unfitted stacking ensemble used as an extra candidate model.

    Five base learners (logistic regression, random forest, XGBoost,
    Bernoulli naive Bayes, extra trees) feed their ``predict_proba`` outputs,
    produced with 5-fold cross-validation, into a logistic-regression
    meta-learner. ``passthrough=True`` hands the original features to the
    meta-learner as well.

    Returns:
        A configured ``StackingClassifier``.
    """
    seed = 42

    return StackingClassifier(
        estimators=[
            ('lr', LogisticRegression(
                C=1.0,
                solver='saga',
                penalty='l2',
                max_iter=1000,
                random_state=seed,
            )),
            ('rf', RandomForestClassifier(
                n_estimators=200,
                max_depth=15,
                min_samples_split=5,
                min_samples_leaf=2,
                max_features='sqrt',
                class_weight='balanced',
                random_state=seed,
            )),
            ('xgb', XGBClassifier(
                learning_rate=0.05,
                n_estimators=200,
                max_depth=6,
                subsample=0.8,
                colsample_bytree=0.8,
                gamma=0.1,
                min_child_weight=1,
                use_label_encoder=False,
                eval_metric='mlogloss',
                random_state=seed,
            )),
            # Naive Bayes variant that models binary-valued features.
            ('bnb', BernoulliNB(alpha=1.0)),
            ('et', ExtraTreesClassifier(
                n_estimators=200,
                max_depth=10,
                random_state=seed,
            )),
        ],
        # Meta-learner combining the base learners' outputs.
        final_estimator=LogisticRegression(
            C=1.0,
            solver='saga',
            max_iter=1000,
            random_state=seed,
        ),
        cv=5,
        stack_method='predict_proba',
        n_jobs=-1,
        passthrough=True,
    )

In [13]:

if __name__ == "__main__":
    file_path = '/content/drive/MyDrive/Dataset/symbipredict_2022.csv'  # Adjust path if needed
    top_models = run_disease_classification_pipeline(file_path)

    # Print a compact leaderboard rather than the raw result dicts: every entry
    # also carries the fitted model and the full prediction/probability arrays,
    # which would flood the cell output.
    print("Top Models:")
    for rank, (name, metrics) in enumerate(top_models, start=1):
        print(
            f"{rank}. {name}: "
            f"accuracy={metrics['accuracy']:.4f}, "
            f"f1={metrics['f1']:.4f}, "
            f"auc={metrics['auc']:.4f}, "
            f"train_time={metrics['train_time']:.2f}s"
        )


=== Starting Disease Classification Pipeline ===

=== Loading and Preprocessing Data ===
Missing values: 0

Class distribution:
prognosis
Fungal Infection                 121
Allergy                          121
GERD                             121
Chronic Cholestasis              121
Drug Reaction                    121
Peptic Ulcer Disease             121
AIDS                             121
Diabetes                         121
Gastroenteritis                  121
Bronchial Asthma                 121
Hypertension                     121
Migraine                         121
Cervical Spondylosis             121
Paralysis (brain hemorrhage)     121
Jaundice                         121
Malaria                          121
Chickenpox                       121
Dengue                           121
Typhoid                          121
Hepatitis A                      121
Hepatitis B                      121
Hepatitis C                      121
Hepatitis D                      121
Hepatitis E