In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from numpy import arange
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, SelectFromModel, f_classif
from xgboost import XGBClassifier
from sklearn import set_config
from sklearn.decomposition import PCA
set_config(transform_output="pandas")

In [50]:
ad_df = pd.read_csv('/Users/stojanstojkovic/Downloads/alzheimers_disease_data.csv')
ad_df

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,6895,61,0,0,1,39.121757,0,1.561126,4.049964,6.555306,...,0,0,4.492838,1,0,0,0,0,1,XXXConfid
2145,6896,75,0,0,2,17.857903,0,18.767261,1.360667,2.904662,...,0,1,9.204952,0,0,0,0,0,1,XXXConfid
2146,6897,77,0,0,1,15.476479,0,4.594670,9.886002,8.120025,...,0,0,5.036334,0,0,0,0,0,1,XXXConfid
2147,6898,78,1,3,1,15.299911,0,8.674505,6.354282,1.263427,...,0,0,3.785399,0,0,0,0,1,1,XXXConfid


In [52]:
X = ad_df.drop(columns=['DoctorInCharge']).set_index('PatientID')
y = X.pop('Diagnosis')              

In [54]:
X.columns

Index(['Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI', 'Smoking',
       'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'FamilyHistoryAlzheimers', 'CardiovascularDisease', 'Diabetes',
       'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP', 'DiastolicBP',
       'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
       'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion',
       'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
       'Forgetfulness'],
      dtype='object')

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

In [10]:
profile = ProfileReport(ad_df, title="AD Report")
profile.to_file("ad_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [58]:
cat_cols = ['Ethnicity']
num_cols = [col for col in X.columns if col != 'Ethnicity']

In [60]:
# Use One-Hot Encoding for nominal categories with few unique values or binary categories (if not already in 0/1).
# Don’t use One-Hot Encoding for ordinal features or when the feature already has a numeric representation that conveys order.

cat_pipe = make_pipeline(
    OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist', min_frequency=0.03)
)
num_pipe = make_pipeline('passthrough') 

In [62]:
preprocessor = ColumnTransformer(
    transformers=[
    ('num_pipe', num_pipe, num_cols),
    ('cat_pipe', cat_pipe, cat_cols),
    ]
)

**DecisionTreeClassifier**

In [179]:
dt_pipeline = make_pipeline(preprocessor,
                            DecisionTreeClassifier())
dt_param_grid = {
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": arange(0.01, 0.11, 0.01)
}

dt_search = GridSearchCV(dt_pipeline,
                      dt_param_grid,
                      cv=5,
                      verbose=1)

dt_search.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [180]:
dt_search.best_params_
dt_search.best_score_
print(f"Best Parameters: {dt_search.best_params_}")
print(f"Best Score: {dt_search.best_score_:.4f}")

Best Parameters: {'decisiontreeclassifier__max_depth': 6, 'decisiontreeclassifier__min_samples_leaf': 0.01}
Best Score: 0.9296


In [183]:
y_pred_tree_train = dt_search.predict(X_train)
y_pred_tree_test = dt_search.predict(X_test)

In [185]:
train_acc = accuracy_score(y_true = y_train,
            y_pred = y_pred_tree_train
                          )
test_acc = accuracy_score(y_true = y_test,
           y_pred = y_pred_tree_test
                         )
print(f"DT Training accuracy_score: {train_acc}\nDT Testing accuracy_score: {test_acc}")

DT Training accuracy_score: 0.9429901105293775
DT Testing accuracy_score: 0.9209302325581395


**RandomForestClassifier**

In [188]:
rf_full_pipeline = make_pipeline(preprocessor,
                                 RandomForestClassifier())

rf_param_grid = {
    'randomforestclassifier__n_estimators': range(50, 201, 25), 
    'randomforestclassifier__max_depth': range(2, 14, 2),
    'randomforestclassifier__min_samples_leaf': arange(0.01, 0.11, 0.01)   
}                                                                                     

rf_search = GridSearchCV(rf_full_pipeline,
                         rf_param_grid,
                         cv=5,                                                        
                         verbose=1,
                         n_jobs=-1) #especially important for random forest

rf_search.fit(X_train, y_train)

Fitting 5 folds for each of 420 candidates, totalling 2100 fits


In [190]:
rf_search.best_params_
rf_search.best_score_
print(f"Best Parameters: {rf_search.best_params_}")
print(f"Best Score: {rf_search.best_score_:.4f}")

Best Parameters: {'randomforestclassifier__max_depth': 8, 'randomforestclassifier__min_samples_leaf': 0.01, 'randomforestclassifier__n_estimators': 50}
Best Score: 0.8988


In [193]:
y_pred_tree_train = rf_search.predict(X_train)
y_pred_tree_test = rf_search.predict(X_test)

train_acc = accuracy_score(y_true = y_train,
            y_pred = y_pred_tree_train
                          )
test_acc = accuracy_score(y_true = y_test,
           y_pred = y_pred_tree_test
                         )
print(f"RF Training accuracy_score: {train_acc}\nRF Testing accuracy_score: {test_acc}")

RF Training accuracy_score: 0.9179755671902269
RF Testing accuracy_score: 0.8744186046511628


**KNeighborsClassifier**

In [147]:
knn_full_pipeline = make_pipeline(preprocessor,
                                  StandardScaler(),
                                  KNeighborsClassifier())
knn_param_grid = {
    'standardscaler': [StandardScaler(), RobustScaler(), MinMaxScaler()], 
    'kneighborsclassifier__n_neighbors': range(1, 10),                    
    'kneighborsclassifier__weights': ['uniform', 'distance']              
}                                                                         

knn_search = GridSearchCV(knn_full_pipeline,
                          knn_param_grid,
                          cv=5,                                           
                          verbose=1,
                          n_jobs=-1)

knn_search.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [153]:
knn_search.best_params_
knn_search.best_score_
print(f"Best Parameters: {knn_search.best_params_}")
print(f"Best Score: {knn_search.best_score_:.4f}")

Best Parameters: {'kneighborsclassifier__n_neighbors': 9, 'kneighborsclassifier__weights': 'uniform', 'standardscaler': RobustScaler()}
Best Score: 0.7638


In [206]:
y_pred_tree_train = knn_search.predict(X_train)
y_pred_tree_test = knn_search.predict(X_test)

train_acc = accuracy_score(y_true = y_train,
            y_pred = y_pred_tree_train
                          )
test_acc = accuracy_score(y_true = y_test,
           y_pred = y_pred_tree_test
                         )
print(f"KNN Training accuracy_score: {train_acc}\nKNN Testing accuracy_score: {test_acc}")

KNN Training accuracy_score: 0.8080279232111692
KNN Testing accuracy_score: 0.7023255813953488


**SVC**

In [209]:
svm_full_pipeline = make_pipeline(preprocessor,
                                  StandardScaler(),
                                  PCA(n_components=5), #PCA should be done after scaling
                                  SVC(random_state=42))
svm_param_grid = {
    'svc__C': [10**e for e in range(-2,2)],                          #4 options
    'svc__kernel': ['linear', 'poly', 'rbf'],                        #3 options
    'svc__degree': range(2, 5),                                      #3 options
    'svc__gamma': ['scale', 'auto'] + [10**e for e in range(-3, -1)] #4 options
}                                                                    #144 fits per fold

svm_search = GridSearchCV(svm_full_pipeline,
                          svm_param_grid,
                          cv=5,                                      #720 fits total
                          verbose=2,
                          n_jobs=-1)

svm_search.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [210]:
svm_search.best_params_
svm_search.best_score_
print(f"Best Parameters: {svm_search.best_params_}")
print(f"Best Score: {svm_search.best_score_:.4f}")

Best Parameters: {'svc__C': 1, 'svc__degree': 2, 'svc__gamma': 'auto', 'svc__kernel': 'rbf'}
Best Score: 0.6626


In [211]:
y_pred_tree_train = svm_search.predict(X_train)
y_pred_tree_test = svm_search.predict(X_test)

train_acc = accuracy_score(y_true = y_train,
            y_pred = y_pred_tree_train
                          )
test_acc = accuracy_score(y_true = y_test,
           y_pred = y_pred_tree_test
                         )
print(f"SVM Training accuracy_score: {train_acc}\nSVM Testing accuracy_score: {test_acc}")

SVM Training accuracy_score: 0.6945898778359512
SVM Testing accuracy_score: 0.6627906976744186


***KNN - Feature Selection - SelectFromModel***
   

In [18]:
knn_full_pipeline = make_pipeline(preprocessor,
                                  StandardScaler(),
                                  SelectFromModel(RandomForestClassifier()),
                                  KNeighborsClassifier())
knn_param_grid = {
    'standardscaler': [StandardScaler(), RobustScaler(), MinMaxScaler()], 
    'kneighborsclassifier__n_neighbors': range(1, 10),                  
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'selectfrommodel__estimator__n_estimators': [100, 200],
    'selectfrommodel__estimator__max_depth': [10, 20]
}                                                                       

knn_search = GridSearchCV(knn_full_pipeline,
                          knn_param_grid,
                          cv=5,                                           
                          verbose=1,
                          n_jobs=-1)

knn_search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [20]:
knn_search.best_params_
knn_search.best_score_
print(f"Best Parameters: {knn_search.best_params_}")
print(f"Best Score: {knn_search.best_score_:.4f}")

Best Parameters: {'kneighborsclassifier__n_neighbors': 8, 'kneighborsclassifier__weights': 'uniform', 'selectfrommodel__estimator__max_depth': 10, 'selectfrommodel__estimator__n_estimators': 100, 'standardscaler': MinMaxScaler()}
Best Score: 0.8825


In [22]:
y_pred_tree_train = knn_search.predict(X_train)
y_pred_tree_test = knn_search.predict(X_test)

train_acc = accuracy_score(y_true = y_train,
            y_pred = y_pred_tree_train
                          )
test_acc = accuracy_score(y_true = y_test,
           y_pred = y_pred_tree_test
                         )
print(f"KNN Training accuracy_score: {train_acc}\nKNN Testing accuracy_score: {test_acc}")

KNN Training accuracy_score: 0.9342641070389761
KNN Testing accuracy_score: 0.9255813953488372


***KNN with SelectKBest***

In [232]:
knn_full_pipeline = make_pipeline(
    preprocessor,
    StandardScaler(),
    SelectKBest(score_func=f_classif),  # Using SelectKBest instead of SelectFromModel
    KNeighborsClassifier()
)

# Define the parameter grid
knn_param_grid = {
    'standardscaler': [StandardScaler(), RobustScaler(), MinMaxScaler()], 
    'selectkbest__k': [5, 10, 15, 20],  # Number of best features to select
    'kneighborsclassifier__n_neighbors': range(1, 10),                  
    'kneighborsclassifier__weights': ['uniform', 'distance'],
}                                                                       

# Perform Grid Search
knn_search = GridSearchCV(knn_full_pipeline,
                          knn_param_grid,
                          cv=5,                                           
                          verbose=1,
                          n_jobs=-1)

# Fit the model
knn_search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [234]:
knn_search.best_params_
knn_search.best_score_
print(f"Best Parameters: {knn_search.best_params_}")
print(f"Best Score: {knn_search.best_score_:.4f}")

Best Parameters: {'kneighborsclassifier__n_neighbors': 8, 'kneighborsclassifier__weights': 'distance', 'selectkbest__k': 5, 'standardscaler': StandardScaler()}
Best Score: 0.9261


In [236]:
y_pred_tree_train = knn_search.predict(X_train)
y_pred_tree_test = knn_search.predict(X_test)

train_acc = accuracy_score(y_true = y_train,
            y_pred = y_pred_tree_train
                          )
test_acc = accuracy_score(y_true = y_test,
           y_pred = y_pred_tree_test
                         )
print(f"KNN Training accuracy_score: {train_acc}\nKNN Testing accuracy_score: {test_acc}")

KNN Training accuracy_score: 1.0
KNN Testing accuracy_score: 0.9209302325581395


In [119]:
from sklearn.inspection import permutation_importance
import pandas as pd

# Get the best estimator from GridSearchCV
best_model = knn_search.best_estimator_

# Calculate permutation importance (works for any model, including KNN)
results = permutation_importance(best_model, X_train, y_train, scoring='accuracy')

# Create a DataFrame for better readability
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': results.importances_mean  # Mean importance score across all folds
})

# Sort the DataFrame by importance in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Display the ranked feature importance
print("Ranked Feature Importance (Permutation Importance):")
print(importance_df)

Ranked Feature Importance (Permutation Importance):
                      Feature  Importance
23       FunctionalAssessment    0.171844
26                        ADL    0.159162
22                       MMSE    0.114136
24           MemoryComplaints    0.103781
25         BehavioralProblems    0.070739
0                         Age    0.000000
1                      Gender    0.000000
30  DifficultyCompletingTasks    0.000000
29         PersonalityChanges    0.000000
28             Disorientation    0.000000
27                  Confusion    0.000000
21   CholesterolTriglycerides    0.000000
20             CholesterolHDL    0.000000
19             CholesterolLDL    0.000000
18           CholesterolTotal    0.000000
17                DiastolicBP    0.000000
16                 SystolicBP    0.000000
15               Hypertension    0.000000
14                 HeadInjury    0.000000
13                 Depression    0.000000
12                   Diabetes    0.000000
11      CardiovascularDi

***XGBoost***

In [102]:
xgb_full_pipeline = make_pipeline(preprocessor, 
                                  XGBClassifier(scale_pos_weight=1.86, eval_metric='logloss'))

xgb_param_grid = {
    'xgbclassifier__n_estimators': range(100, 501, 50),
    'xgbclassifier__max_depth': range(2, 14, 2),
    'xgbclassifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'xgbclassifier__subsample': [0.6, 0.8, 1.0]
}

xgb_search = GridSearchCV(xgb_full_pipeline,
                          xgb_param_grid,
                          cv=5,
                          verbose=1,
                          n_jobs=-1)
xgb_search.fit(X_train, y_train)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


In [104]:
xgb_search.best_params_
xgb_search.best_score_
print(f"Best Parameters: {xgb_search.best_params_}")
print(f"Best Score: {xgb_search.best_score_:.4f}")

Best Parameters: {'xgbclassifier__learning_rate': 0.01, 'xgbclassifier__max_depth': 6, 'xgbclassifier__n_estimators': 250, 'xgbclassifier__subsample': 0.8}
Best Score: 0.9517


In [106]:
y_pred_tree_train = xgb_search.predict(X_train)
y_pred_tree_test = xgb_search.predict(X_test)

train_acc = accuracy_score(y_true = y_train,
            y_pred = y_pred_tree_train
                          )
test_acc = accuracy_score(y_true = y_test,
           y_pred = y_pred_tree_test
                         )
print(f"XGB Training accuracy_score: {train_acc}\nXBG Testing accuracy_score: {test_acc}")

XGB Training accuracy_score: 0.9691681210005817
XBG Testing accuracy_score: 0.9558139534883721


In [108]:
# Get the best estimator from the GridSearchCV
best_model = xgb_search.best_estimator_

# Access the booster (the trained model)
booster = best_model.named_steps['xgbclassifier'].get_booster()

# Get the feature importance using the 'weight' type (you can change to 'gain' or 'cover' if desired)
importance_scores = booster.get_score(importance_type='weight')

# Convert the dictionary to a pandas DataFrame for better readability
importance_df = pd.DataFrame(list(importance_scores.items()), columns=['Feature', 'Importance'])

# Sort the DataFrame by importance in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Display the sorted feature importance
print("Ranked Feature Importance:")
print(importance_df)

Ranked Feature Importance:
                                Feature  Importance
21                       num_pipe__MMSE      1082.0
25                        num_pipe__ADL       982.0
22       num_pipe__FunctionalAssessment       911.0
24         num_pipe__BehavioralProblems       699.0
23           num_pipe__MemoryComplaints       553.0
7                 num_pipe__DietQuality       536.0
19             num_pipe__CholesterolHDL       468.0
6            num_pipe__PhysicalActivity       450.0
17           num_pipe__CholesterolTotal       412.0
0                         num_pipe__Age       407.0
18             num_pipe__CholesterolLDL       373.0
15                 num_pipe__SystolicBP       364.0
20   num_pipe__CholesterolTriglycerides       358.0
3                         num_pipe__BMI       353.0
8                num_pipe__SleepQuality       324.0
5          num_pipe__AlcoholConsumption       231.0
16                num_pipe__DiastolicBP       226.0
32                cat_pipe__Ethnicity

In [123]:
xgb_param_grid = {
    'xgbclassifier__n_estimators': range(100, 501, 25),  # More trees
    'xgbclassifier__max_depth': [3, 5, 7, 9],  # Tree depth
    'xgbclassifier__learning_rate': [0.005, 0.01, 0.03, 0.05, 0.1],  # Learning rate
    'xgbclassifier__subsample': [0.6, 0.8, 1.0],  # Row sampling
    'xgbclassifier__colsample_bytree': [0.5, 0.7, 1.0],  # Feature sampling
    'xgbclassifier__gamma': [0, 0.1, 0.3, 0.5],  # Regularization
    'xgbclassifier__reg_lambda': [0, 0.1, 1, 10],  # L2 Regularization
    'xgbclassifier__reg_alpha': [0, 0.1, 1, 10]  # L1 Regularization
}

# Create the RandomizedSearchCV object
xgb_random_search = RandomizedSearchCV(
    xgb_full_pipeline,  # Pipeline with XGBClassifier
    param_distributions=xgb_param_grid,  # Hyperparameters to search
    n_iter=30,  # Number of random combinations to test (can be increased for better results)
    scoring='roc_auc',  # Scoring metric (change to 'accuracy', 'f1', etc. if needed)
    cv=5,  # 5-fold cross-validation
    verbose=1,  # Print progress
    n_jobs=-1,  # Use all CPU cores
    random_state=42  # For reproducibility
)

# Fit the model
xgb_random_search.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters:", xgb_random_search.best_params_)

# Get the best model
best_xgb = xgb_random_search.best_estimator_

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'xgbclassifier__subsample': 0.6, 'xgbclassifier__reg_lambda': 10, 'xgbclassifier__reg_alpha': 0, 'xgbclassifier__n_estimators': 325, 'xgbclassifier__max_depth': 3, 'xgbclassifier__learning_rate': 0.01, 'xgbclassifier__gamma': 0.1, 'xgbclassifier__colsample_bytree': 0.7}
