In [67]:
# Install required libraries
#%pip install numpy pandas scikit-learn statsmodels lightgbm seaborn matplotlib imbalanced-learn
#%pip install xgboost
#%pip install scipy
%pip install numpy pandas seaborn matplotlib scikit-learn imbalanced-learn xgboost scipy




Note: you may need to restart the kernel to use updated packages.


Import Libraries

In [68]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
import warnings
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint, uniform
from sklearn.neural_network import MLPClassifier

# Ignore warnings
warnings.simplefilter(action="ignore")


Import Dataset

In [69]:
# Load the dataset
df = pd.read_csv("data/diabetes.csv")

# Display dataset information
print("Dataset Info:")
print(df.info())
print("\nFirst 5 rows of the dataset:")
print(df.head())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

First 5 rows of the dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66         

Data Preprocessing

In [70]:
# Replace zeros with median for specific columns
columns_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column in columns_with_zeros:
    df[column] = df[column].replace(0, df[column].median())

# Separate features and target variable
X = df.drop(["Outcome"], axis=1)
y = df["Outcome"]

# Apply MinMaxScaler for normalization
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

In [71]:
# Feature Selection using Recursive Feature Elimination (RFE)
log_reg = LogisticRegression(max_iter=1000, random_state=42)
rfe = RFE(log_reg, n_features_to_select=5)
rfe.fit(X_train, y_train)

selected_features = X_train.columns[rfe.support_]
print("\nSelected Features after RFE:", selected_features)

# Select only the top features from X_train and X_test
X_train = X_train[selected_features]
X_test = X_test[selected_features]


Selected Features after RFE: Index(['Pregnancies', 'Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age'], dtype='object')


In [72]:
# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=12345)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [73]:


# Hyperparameter tuning for XGBoost
# Expanded hyperparameter grid
param_dist_xgb = {
    'n_estimators': randint(100, 500),  # Randomly sample between 100 and 500
    'max_depth': randint(3, 10),  # Randomly sample between 3 and 10
    'learning_rate': uniform(0.01, 0.3),  # Randomly sample between 0.01 and 0.3
    'subsample': uniform(0.6, 0.4),  # Randomly sample between 0.6 and 1.0
    'colsample_bytree': uniform(0.6, 0.4),  # Randomly sample between 0.6 and 1.0
    'gamma': uniform(0, 0.5),  # Randomly sample between 0 and 0.5
    'reg_alpha': uniform(0, 1),  # L1 regularization (randomly sample between 0 and 1)
    'reg_lambda': uniform(0, 1)  # L2 regularization (randomly sample between 0 and 1)
}

# Initialize XGBClassifier with early_stopping_rounds
xgb = XGBClassifier(random_state=12345, scale_pos_weight=1, early_stopping_rounds=10)

# Use RandomizedSearchCV for efficient hyperparameter tuning
random_search_xgb = RandomizedSearchCV(
    estimator=xgb, 
    param_distributions=param_dist_xgb, 
    n_iter=50,  # Number of parameter settings to sample
    cv=10,  # 10-fold cross-validation
    scoring='accuracy', 
    n_jobs=-1,  # Use all available cores
    random_state=12345
)

# Fit the model with early stopping
random_search_xgb.fit(
    X_train_balanced, 
    y_train_balanced, 
    eval_set=[(X_train_balanced, y_train_balanced)],  # Evaluation set for early stopping
    verbose=False
)

print("Best Parameters for XGBoost:", random_search_xgb.best_params_)
# Hyperparameter tuning for Random Forest
param_dist_rf = {
    'n_estimators': randint(100, 500),  # Randomly sample between 100 and 500
    'max_depth': [None, 10, 20, 30, 40, 50],  # Maximum depth of the tree
    'min_samples_split': randint(2, 20),  # Randomly sample between 2 and 20
    'min_samples_leaf': randint(1, 10),  # Randomly sample between 1 and 10
    'max_features': ['auto', 'sqrt', 'log2'],  # Different strategies for feature selection
    'bootstrap': [True, False]  # Whether to use bootstrap samples
}

rf = RandomForestClassifier(random_state=12345, class_weight='balanced')

# Use RandomizedSearchCV for more efficient hyperparameter tuning
random_search_rf = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=param_dist_rf, 
    n_iter=50,  # Number of parameter settings to sample
    cv=10,  # 10-fold cross-validation
    scoring='accuracy', 
    n_jobs=-1,  # Use all available cores
    random_state=12345
)

random_search_rf.fit(X_train_balanced, y_train_balanced)
print("Best Parameters for Random Forest:", random_search_rf.best_params_)

# Hyperparameter tuning for SVM
param_grid_svm = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'gamma': ['scale', 'auto', 0.1],  # Kernel coefficient
    'kernel': ['linear', 'rbf']  # Kernel type
}

svm = SVC(random_state=12345, probability=True)
grid_search_svm = GridSearchCV(
    estimator=svm, 
    param_grid=param_grid_svm, 
    cv=5,  # 5-fold cross-validation
    scoring='accuracy', 
    n_jobs=-1  # Use all available cores
)

grid_search_svm.fit(X_train_balanced, y_train_balanced)
print("Best Parameters for SVM:", grid_search_svm.best_params_)

# Hyperparameter tuning for Decision Tree
param_grid_dt = {
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at a leaf node
    'criterion': ['gini', 'entropy']  # Splitting criterion
}

dt = DecisionTreeClassifier(random_state=12345)
grid_search_dt = GridSearchCV(
    estimator=dt, 
    param_grid=param_grid_dt, 
    cv=5,  # 5-fold cross-validation
    scoring='accuracy', 
    n_jobs=-1  # Use all available cores
)

grid_search_dt.fit(X_train_balanced, y_train_balanced)
print("Best Parameters for Decision Tree:", grid_search_dt.best_params_)





# Hyperparameter tuning for Neural Network (MLPClassifier)
param_grid_nn = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],  # Number of neurons in each hidden layer
    'activation': ['relu', 'tanh'],  # Activation function
    'alpha': [0.0001, 0.001, 0.01],  # L2 regularization term
    'learning_rate': ['constant', 'adaptive'],  # Learning rate schedule
    'max_iter': [200, 300]  # Maximum number of iterations
}

# Initialize MLPClassifier
nn = MLPClassifier(random_state=12345)

# Perform GridSearchCV for MLPClassifier
grid_search_nn = GridSearchCV(
    estimator=nn, 
    param_grid=param_grid_nn, 
    cv=5,  # 5-fold cross-validation
    scoring='accuracy', 
    n_jobs=-1  # Use all available cores
)

# Fit the model
grid_search_nn.fit(X_train_balanced, y_train_balanced)

# Print best parameters
print("Best Parameters for Neural Network:", grid_search_nn.best_params_)

# Define and train individual models with best hyperparameters
models = {
    'XGBoost': XGBClassifier(random_state=12345, **random_search_xgb.best_params_),
    'Random Forest': RandomForestClassifier(random_state=12345, **random_search_rf.best_params_),
    'Support Vector Machine': SVC(random_state=12345, probability=True, **grid_search_svm.best_params_),
    'Neural Network': MLPClassifier(random_state=12345, **grid_search_nn.best_params_),
    'Decision Tree': DecisionTreeClassifier(random_state=12345, **grid_search_dt.best_params_)
}

Best Parameters for XGBoost: {'colsample_bytree': np.float64(0.9113943388538406), 'gamma': np.float64(0.1992471753294794), 'learning_rate': np.float64(0.19734464063457888), 'max_depth': 8, 'n_estimators': 361, 'reg_alpha': np.float64(0.4945498003146045), 'reg_lambda': np.float64(0.03738037598980304), 'subsample': np.float64(0.8876825332624142)}
Best Parameters for Random Forest: {'bootstrap': True, 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 11, 'n_estimators': 367}
Best Parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Best Parameters for Neural Network: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'max_iter': 300}


Base Models

In [None]:
# Define and train individual models
models = {
    'XGBoost': XGBClassifier(random_state=12345, **random_search_xgb.best_params_),
    'Random Forest': RandomForestClassifier(random_state=12345, **random_search_rf.best_params_),
    'Support Vector Machine': SVC(random_state=12345, probability=True, **grid_search_svm.best_params_),
    'Neural Network': MLPClassifier(random_state=12345, **grid_search_nn.best_params_),
    'Decision Tree': DecisionTreeClassifier(random_state=12345, **grid_search_dt.best_params_)
}
# Function to calculate additional metrics
def calculate_metrics(y_true, y_pred, y_pred_proba=None):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    gini_index = 2 * roc_auc_score(y_true, y_pred_proba) - 1 if y_pred_proba is not None else None
    auc_score = roc_auc_score(y_true, y_pred_proba) if y_pred_proba is not None else None
    auch = auc_score  # Approximate AUCH as AUC
    mer = 1 - accuracy_score(y_true, y_pred)
    mwc = (sensitivity + specificity) / 2
    precision = precision_score(y_true, y_pred, average='weighted')
    
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'Gini Index': gini_index,
        'AUC': auc_score,
        'AUCH': auch,
        'MER': mer,
        'MWC': mwc,
        'Precision': precision
    }

# Train and evaluate each model
performance_metrics = {}
for name, model in models.items():
    # Train the model
    model.fit(X_train_balanced, y_train_balanced)
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Calculate performance metrics
    metrics = calculate_metrics(y_test, y_pred, y_pred_proba)
    
    # Store metrics
    performance_metrics[name] = metrics
    
    # Print metrics
    print(f"\n{name} Performance Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")


XGBoost Performance Metrics:
Accuracy: 0.7078
Sensitivity: 0.6296
Specificity: 0.7500
Gini Index: 0.5856
AUC: 0.7928
AUCH: 0.7928
MER: 0.2922
MWC: 0.6898
Precision: 0.7147

Random Forest Performance Metrics:
Accuracy: 0.7208
Sensitivity: 0.6852
Specificity: 0.7400
Gini Index: 0.6378
AUC: 0.8189
AUCH: 0.8189
MER: 0.2792
MWC: 0.7126
Precision: 0.7340

Support Vector Machine Performance Metrics:
Accuracy: 0.7208
Sensitivity: 0.7037
Specificity: 0.7300
Gini Index: 0.6000
AUC: 0.8000
AUCH: 0.8000
MER: 0.2792
MWC: 0.7169
Precision: 0.7376

Neural Network Performance Metrics:
Accuracy: 0.7532
Sensitivity: 0.8148
Specificity: 0.7200
Gini Index: 0.6252
AUC: 0.8126
AUCH: 0.8126
MER: 0.2468
MWC: 0.7674
Precision: 0.7844

Decision Tree Performance Metrics:
Accuracy: 0.7597
Sensitivity: 0.6852
Specificity: 0.8000
Gini Index: 0.4852
AUC: 0.7426
AUCH: 0.7426
MER: 0.2403
MWC: 0.7426
Precision: 0.7632


In [58]:
# Define individual models for ensemble
models = [
    ('XGBoost', XGBClassifier(random_state=12345, **grid_search_xgb.best_params_)),
    ('Random Forest', RandomForestClassifier(random_state=12345)),
    ('Support Vector Machine', SVC(gamma='auto', random_state=12345, probability=True)),
    ('Neural Network', MLPClassifier(random_state=12345)),
    ('Decision Tree', DecisionTreeClassifier(random_state=12345))
]

# Create a Voting Classifier with majority voting
voting_clf = VotingClassifier(estimators=models, voting='soft')  # 'soft' for probability-based voting
voting_clf.fit(X_train_balanced, y_train_balanced)

# Evaluate performance metrics
y_pred = voting_clf.predict(X_test)
y_pred_proba = voting_clf.predict_proba(X_test)[:, 1]

# Calculate metrics
metrics = calculate_metrics(y_test, y_pred, y_pred_proba)
print("\nEnsemble Model (Voting Classifier) Performance Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")


Ensemble Model (Voting Classifier) Performance Metrics:
Accuracy: 0.7727
Sensitivity: 0.7593
Specificity: 0.7800
Gini Index: 0.6496
AUC: 0.8248
AUCH: 0.8248
MER: 0.2273
MWC: 0.7696
Precision: 0.7848


In [59]:
# Print results in a table format
print("\nModel Performance Summary:")
print("{:<20} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
    "Model", "Accuracy", "Sensitivity", "Specificity", "GI", "AUC", "AUCH", "MER", "MWC", "Precision"
))
for name, metrics in performance_metrics.items():
    print("{:<20} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
        name, metrics['Accuracy'], metrics['Sensitivity'], metrics['Specificity'], 
        metrics['Gini Index'], metrics['AUC'], metrics['AUCH'], metrics['MER'], metrics['MWC'], metrics['Precision']
    ))


Model Performance Summary:
Model                Accuracy   Sensitivity Specificity GI         AUC        AUCH       MER        MWC        Precision 
XGBoost              0.7078     0.6296     0.7500     0.5856     0.7928     0.7928     0.2922     0.6898     0.7147    
Random Forest        0.7208     0.6852     0.7400     0.6378     0.8189     0.8189     0.2792     0.7126     0.7340    
Support Vector Machine 0.7208     0.7037     0.7300     0.6000     0.8000     0.8000     0.2792     0.7169     0.7376    
Neural Network       0.7532     0.8148     0.7200     0.6252     0.8126     0.8126     0.2468     0.7674     0.7844    
Decision Tree        0.7597     0.6852     0.8000     0.4852     0.7426     0.7426     0.2403     0.7426     0.7632    
