## EDA

In [1]:
!pip3 install imblearn
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, roc_auc_score, roc_curve, classification_report
)
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold
%matplotlib inline

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.14.0 imblearn-0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
#load data set
df = pd.read_csv("/Users/macbook/Desktop/internship/data/alzheimers.csv")

In [3]:
#display dataset info
print("Dataset Shape:",df.shape)
print("\nFirst 5 Rows:")
df.head()

Dataset Shape: (2149, 34)

First 5 Rows:


Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,6.518877,0,0,1.725883,0,0,0,1,0,0
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,7.118696,0,0,2.592424,0,0,0,0,1,0
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,5.895077,0,0,7.119548,0,1,0,1,0,0
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,8.965106,0,1,6.481226,0,0,0,0,0,0
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,6.045039,0,0,0.014691,0,0,1,1,0,0


In [4]:
# Handle missing values
# Option 1: Drop rows with missing values
df = df.dropna()

In [5]:
#2.missing values
print("\nMissing Values per column:")
print(df.isnull().sum())


Missing Values per column:
PatientID                    0
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyC

In [6]:
#3.Summary statistics
print("\nSummary statistics (numeric columns):")
print(df.describe().T)


Summary statistics (numeric columns):
                            count         mean         std          min  \
PatientID                  2149.0  5825.000000  620.507185  4751.000000   
Age                        2149.0    74.908795    8.990221    60.000000   
Gender                     2149.0     0.506282    0.500077     0.000000   
Ethnicity                  2149.0     0.697534    0.996128     0.000000   
EducationLevel             2149.0     1.286645    0.904527     0.000000   
BMI                        2149.0    27.655697    7.217438    15.008851   
Smoking                    2149.0     0.288506    0.453173     0.000000   
AlcoholConsumption         2149.0    10.039442    5.757910     0.002003   
PhysicalActivity           2149.0     4.920202    2.857191     0.003616   
DietQuality                2149.0     4.993138    2.909055     0.009385   
SleepQuality               2149.0     7.051081    1.763573     4.002629   
FamilyHistoryAlzheimers    2149.0     0.252210    0.434382   

## Preprocessing

In [7]:
# Split features and target
X = df.drop('Diagnosis', axis=1)
y = df['Diagnosis']

# Train-test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
#Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
#Balancing the Dataset (SMOTE: Oversampling using imblearn library)
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

## Ensemble algorithms

In [10]:
#Logistic Regression 
lr_balanced = LogisticRegression(random_state=42)
lr_balanced.fit(X_train_balanced, y_train_balanced)
y_pred_lr_balanced = lr_balanced.predict(X_test_scaled)

print("\nLogistic Regression Performance (With Balancing):")
print(classification_report(y_test, y_pred_lr_balanced))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr_balanced))


Logistic Regression Performance (With Balancing):
              precision    recall  f1-score   support

           0       0.89      0.84      0.87       277
           1       0.74      0.80      0.77       153

    accuracy                           0.83       430
   macro avg       0.81      0.82      0.82       430
weighted avg       0.83      0.83      0.83       430

Confusion Matrix:
[[234  43]
 [ 30 123]]


In [11]:
#logistic regression
lr_balanced = LogisticRegression(random_state=42)
lr_balanced.fit(X_train_balanced, y_train_balanced)
y_pred_lr_balanced = lr_balanced.predict(X_test_scaled)

print("\nLogisitic Regression Performance (With Balancing):")
print(classification_report(y_test, y_pred_lr_balanced))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr_balanced))


Logisitic Regression Performance (With Balancing):
              precision    recall  f1-score   support

           0       0.89      0.84      0.87       277
           1       0.74      0.80      0.77       153

    accuracy                           0.83       430
   macro avg       0.81      0.82      0.82       430
weighted avg       0.83      0.83      0.83       430

Confusion Matrix:
[[234  43]
 [ 30 123]]


In [12]:
# Random forest
rf_balanced = RandomForestClassifier(random_state=42)
rf_balanced.fit(X_train_balanced, y_train_balanced)
y_pred_rf_balanced = rf_balanced.predict(X_test_scaled)

print("\nRandom forest performance (with balancing):")
print(classification_report(y_test, y_pred_rf_balanced))
print("\nConfusion Matrix")
print(confusion_matrix(y_test, y_pred_rf_balanced))


Random forest performance (with balancing):
              precision    recall  f1-score   support

           0       0.92      0.97      0.95       277
           1       0.95      0.86      0.90       153

    accuracy                           0.93       430
   macro avg       0.94      0.92      0.92       430
weighted avg       0.93      0.93      0.93       430


Confusion Matrix
[[270   7]
 [ 22 131]]


In [13]:
#Hyperparameter Tuning (Random Forest) 
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}


grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5)
grid_search_rf.fit(X_train_balanced, y_train_balanced)

print("\nBest Parameters for Random Forest:")
print(grid_search_rf.best_params_)

# Evaluate tuned Random Forest
best_rf = grid_search_rf.best_estimator_
y_pred_best_rf = best_rf.predict(X_test_scaled)

print("\nTuned Random Forest Performance:")
print(classification_report(y_test, y_pred_best_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_best_rf))


Best Parameters for Random Forest:
{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}

Tuned Random Forest Performance:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       277
           1       0.97      0.90      0.93       153

    accuracy                           0.95       430
   macro avg       0.96      0.94      0.95       430
weighted avg       0.95      0.95      0.95       430

Confusion Matrix:
[[272   5]
 [ 15 138]]


In [14]:
#Gradient Boosting Trees (GBT)
gbt = GradientBoostingClassifier(random_state=42)
gbt.fit(X_train_balanced, y_train_balanced)
y_pred_gbt = gbt.predict(X_test_scaled)

print("\nGradient Boosting Trees Performance:")
print(classification_report(y_test, y_pred_gbt))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_gbt))


Gradient Boosting Trees Performance:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       277
           1       0.94      0.94      0.94       153

    accuracy                           0.96       430
   macro avg       0.95      0.95      0.95       430
weighted avg       0.96      0.96      0.96       430

Confusion Matrix:
[[268   9]
 [  9 144]]


## Evaluation

In [15]:
#Cross-Validation (Random Forest)
cv_scores_rf = cross_val_score(best_rf, X_train_balanced, y_train_balanced, cv=5, scoring="accuracy")
print("\nRandom Forest Cross-Validation Scores:", cv_scores_rf)
print("Mean CV Accuracy:", cv_scores_rf.mean())


Random Forest Cross-Validation Scores: [0.94382022 0.96179775 0.9505618  0.96853933 0.96171171]
Mean CV Accuracy: 0.9572861625670616
