In [1]:
import pandas as pd

In [11]:
df_merged = pd.read_csv("../data/preprocessed-data/selected_features/ADNI_MERGE_FINAL_with_RAW_DX.csv")
df_base = pd.read_csv("../data/preprocessed-data/ADNI_MERGE_processed.csv")

In [12]:
def show_col (df):
    return df.columns.tolist()

In [13]:
print("Merged DataSet:\n")
print(show_col(df_merged),"\n\n")
print(df_merged.info,"\n")
print("Base DataSet:\n")
print(show_col(df_base),"\n\n")
print(df_base.info,"\n")

Merged DataSet:

['RID', 'PTID', 'COLPROT', 'ORIGPROT', 'EXAMDATE', 'DX_bl', 'APOE4', 'AV45', 'CDRSB_x', 'DIGITSCOR', 'EcogPtMem', 'EcogSPMem', 'EcogSPLang', 'EcogSPVisspat', 'EcogSPPlan', 'EcogSPOrgan', 'EcogSPDivatt', 'EcogSPTotal', 'FLDSTRENG', 'FSVERSION', 'Hippocampus', 'WholeBrain', 'Entorhinal', 'Fusiform', 'ICV', 'DX', 'DIGITSCOR_bl', 'mPACCtrailsB_bl', 'Ventricles_bl', 'WholeBrain_bl', 'Fusiform_bl', 'MOCA_bl', 'EcogPtMem_bl', 'EcogPtLang_bl', 'EcogPtPlan_bl', 'EcogPtOrgan_bl', 'EcogPtDivatt_bl', 'EcogPtTotal_bl', 'EcogSPMem_bl', 'EcogSPLang_bl', 'EcogSPVisspat_bl', 'EcogSPPlan_bl', 'EcogSPOrgan_bl', 'EcogSPDivatt_bl', 'EcogSPTotal_bl', 'ABETA_bl', 'PIB_bl', 'AV45_bl', 'Years_bl', 'Month_bl', 'PTETHCAT_Not Hisp/Latino', 'PTRACCAT_Asian', 'PTRACCAT_Black', 'PTRACCAT_White', 'GENOTYPE', 'APTESTDT', 'APVOLUME', 'APRECEIVE', 'APAMBTEMP', 'RID.1', 'PHC_Visit', 'PHC_Age_Cognition', 'PHC_Diagnosis', 'PHC_Sex', 'PHC_Race', 'PHC_Ethnicity', 'PHC_Education', 'PHC_MEM', 'PHC_MEM_SE', 'PH

In [15]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/72.0 MB 599.9 kB/s eta 0:02:00
   ---------------------------------------- 0.5/72.0 MB 599.9 kB/s eta 0:02:00
   ---------------------------------------- 0.8/72.0 MB 610.3 kB/s eta 0:01:57
   ---------------------------------------- 0.8/72.0 MB 610.3 kB/s eta 0:01:57
    --------------------------------------- 1.0/72.0 MB 629.1 kB/s eta 0:01:53
    --------------------------------------- 1.3/72.0 MB 657.8 kB/s eta 0:01:48
    -------------------------

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report, accuracy_score

# --- Configuration ---
DATA_PATH = "../data/preprocessed-data/selected_features/ADNI_MERGE_FINAL_with_RAW_DX.csv"
TARGET_COLUMN = 'DX'
RANDOM_SEED = 42
# Columns to exclude from features (IDs, dates, etc.)
ID_COLS = [
    'RID', 'PTID', 'EXAMDATE', 'DX_bl', 'RID.1', 'ID', 
    'SITEID', 'USERDATE2', 'APTESTDT'
]
# We use 'f1_weighted' for GridSearchCV to account for class imbalance
SCORING_METRIC = 'f1_weighted' 

# --- 1. Data Loading and Preprocessing ---

print("--- 1. Data Loading and Preprocessing ---")
try:
    df = pd.read_csv(DATA_PATH)
    # Filter out rows where the target 'DX' is missing
    df = df.dropna(subset=[TARGET_COLUMN])
    print(f"Data loaded successfully. Cleaned shape: {df.shape}")
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please check the path.")
    # Exiting if data isn't found
    exit()

# --- A. Encode Target Variable (DX) ---
le = LabelEncoder()
df['DX_Encoded'] = le.fit_transform(df[TARGET_COLUMN])
target_classes = le.classes_
print(f"Target classes encoded: {target_classes}")

# --- B. Feature Selection and Cleanup ---
TARGET = 'DX_Encoded'
features = [col for col in df.columns if col not in ID_COLS and col != TARGET_COLUMN and col != TARGET]
X = df[features]
y = df[TARGET]

# Drop columns with > 50% missing values
nan_threshold = 0.5 * len(X)
X = X.dropna(axis=1, thresh=nan_threshold)
print(f"Features remaining after dropping >50% NaN columns: {len(X.columns)}")

# Impute remaining missing values with the median
for col in X.columns:
    if X[col].dtype in ['float64', 'int64'] and X[col].isnull().any():
        X[col] = X[col].fillna(X[col].median())

# Convert Boolean columns to integers and keep only numeric features
X = X.replace({True: 1, False: 0}).select_dtypes(include=np.number)
print(f"Final feature matrix shape: {X.shape}")


# --- 2. Data Split (Train, Validation, Test) ---

# First split: Train (80%) and Temp (20%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
)

# Second split: Validation (10%) and Test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_SEED, stratify=y_temp
)

print("\n--- Data Split Shapes ---")
print(f"Training set:   {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set:       {X_test.shape}")


# --- 3. Hyperparameter Tuning using Grid Search CV (F1-Score) ---

print(f"\n--- 3. Starting Grid Search CV (Scoring: {SCORING_METRIC}) ---")

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.9],
    'colsample_bytree': [0.7, 0.9]
}

# Initialize the Base Model
base_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(target_classes),
    random_state=RANDOM_SEED,
    use_label_encoder=False,
    eval_metric='merror'
)

# Initialize GridSearchCV with f1_weighted scoring
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring=SCORING_METRIC,  # Key change: Use f1_weighted
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Run the search
grid_search.fit(X_train, y_train)

# Output best parameters and score
print("\n--- Grid Search Results ---")
print(f"Best Hyperparameters found: {grid_search.best_params_}")
print(f"Best CV Score ({SCORING_METRIC.upper()}): {grid_search.best_score_:.4f}")

# --- 4. Final Evaluation on the Test Set ---

print("\n--- 4. Final Evaluation on UNSEEN Test Set ---")

best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)

# Calculate both weighted F1-score and Accuracy for comparison
final_f1_score = f1_score(y_test, y_pred_test, average='weighted')
final_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Final Test Weighted F1-Score: {final_f1_score:.4f}")
print(f"Final Test Accuracy: {final_accuracy:.4f}")

# Classification Report (best way to see per-class performance)
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test, target_names=target_classes))

# Top Feature Importances
feature_importances = pd.Series(best_model.feature_importances_, index=X.columns)
print("\nTop 10 Feature Importances:")
print(feature_importances.nlargest(10))

--- 1. Data Loading and Preprocessing ---
Data loaded successfully. Cleaned shape: (16420, 270)
Target classes encoded: ['CN' 'Dementia' 'MCI']
Features remaining after dropping >50% NaN columns: 189
Final feature matrix shape: (16420, 189)


  X = X.replace({True: 1, False: 0}).select_dtypes(include=np.number)



--- Data Split Shapes ---
Training set:   (13136, 189)
Validation set: (1642, 189)
Test set:       (1642, 189)

--- 3. Starting Grid Search CV (Scoring: f1_weighted) ---
Fitting 3 folds for each of 108 candidates, totalling 324 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Grid Search Results ---
Best Hyperparameters found: {'colsample_bytree': 0.9, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.9}
Best CV Score (F1_WEIGHTED): 0.9451

--- 4. Final Evaluation on UNSEEN Test Set ---
Final Test Weighted F1-Score: 0.9514
Final Test Accuracy: 0.9513

Classification Report (Test Set):
              precision    recall  f1-score   support

          CN       0.95      0.95      0.95       402
    Dementia       0.90      0.94      0.92       245
         MCI       0.96      0.95      0.96       995

    accuracy                           0.95      1642
   macro avg       0.94      0.95      0.94      1642
weighted avg       0.95      0.95      0.95      1642


Top 10 Feature Importances:
CDRSB_x          0.123118
PHC_Diagnosis    0.083210
CDGLOBAL         0.071861
DXMDUE           0.066708
DXAPP            0.054951
DIAGNOSIS        0.050790
DXMPTR1          0.039373
FSVERSION        0.021677
DXMPTR6          0.016810
CDSOURCE    

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import label_binarize

# --- Configuration for df_base ---
DATA_PATH_BASE = "../data/preprocessed-data/selected_features/ADNI_BASE_FINAL.csv"
TARGET_COLUMN = 'DX'
RANDOM_SEED = 42
SCORING_METRIC = 'f1_weighted' 
# ID/Date/Metadata columns specific to df_base to exclude from features
ID_COLS_BASE = [
    'COLPROT', 'ORIGPROT', 'EXAMDATE', 'EXAMDATE_bl', 
    'update_stamp', 'DX_bl', 'VISCODE_num'
]

# --- 1. Data Loading and Preprocessing  ---

print("--- 1. Data Loading and Preprocessing (Base DataSet) ---")
try:
    df = pd.read_csv(DATA_PATH_BASE)
    df = df.dropna(subset=[TARGET_COLUMN])
    print(f"Base DataSet loaded successfully. Cleaned shape: {df.shape}")
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH_BASE}. Please check the path.")
    exit()

# --- A. Encode Target Variable (DX) ---
le = LabelEncoder()
df['DX_Encoded'] = le.fit_transform(df[TARGET_COLUMN])
target_classes = le.classes_
n_classes = len(target_classes)
print(f"Target classes encoded ({n_classes}): {target_classes}")

# --- B. Feature Selection and Cleanup ---
TARGET = 'DX_Encoded'
features = [col for col in df.columns if col not in ID_COLS_BASE and col != TARGET_COLUMN and col != TARGET]
X = df[features]
y = df[TARGET]

# Drop columns with > 50% missing values
nan_threshold = 0.5 * len(X)
X = X.dropna(axis=1, thresh=nan_threshold)
print(f"Features remaining after dropping >50% NaN columns: {len(X.columns)}")

# Convert Boolean columns to integers and keep only numeric features
X = X.replace({True: 1, False: 0}).select_dtypes(include=np.number)
print(f"Final feature matrix shape: {X.shape}")


# --- 2. Data Split (Train, Validation, Test)  ---

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_SEED, stratify=y_temp
)


# --- 3. Hyperparameter Tuning using Grid Search CV (F1-Score)  ---

print(f"\n--- 3. Starting Grid Search CV (Scoring: {SCORING_METRIC}) ---")

param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.9],
    'colsample_bytree': [0.7, 0.9]
}

base_model = XGBClassifier(
    objective='multi:softprob', # Use softprob to get probabilities for ROC AUC
    num_class=n_classes,
    random_state=RANDOM_SEED,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring=SCORING_METRIC,
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("\n--- Grid Search Results ---")
print(f"Best Hyperparameters found: {grid_search.best_params_}")
print(f"Best CV Score ({SCORING_METRIC.upper()}): {grid_search.best_score_:.4f}")

# --- 4. Final Evaluation on the Test Set (F1 & ROC AUC)  ---

print("\n--- 4. Final Evaluation on UNSEEN Test Set ---")

best_model = grid_search.best_estimator_

# Get predicted class labels (for Accuracy, F1)
y_pred_test = best_model.predict(X_test)
# Get predicted probabilities (required for ROC AUC)
y_prob_test = best_model.predict_proba(X_test)

# --- A. F1-Score & Accuracy ---
final_f1_score = f1_score(y_test, y_pred_test, average='weighted')
final_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Final Test Weighted F1-Score: {final_f1_score:.4f}")
print(f"Final Test Accuracy: {final_accuracy:.4f}")

# --- B. Multi-Class ROC AUC Score ---
# Binarize the true labels for the OvR calculation
y_test_binarized = label_binarize(y_test, classes=np.arange(n_classes))

try:
    # Calculate micro-averaged ROC AUC
    # Micro-average is preferred for imbalanced data as it aggregates the contribution of all classes
    micro_auc = roc_auc_score(y_test_binarized, y_prob_test, average='micro')
    
    # Calculate macro-averaged ROC AUC
    # Macro-average treats all classes equally
    macro_auc = roc_auc_score(y_test_binarized, y_prob_test, average='macro')
    
    print(f"\nFinal Test ROC AUC (Micro-Average): {micro_auc:.4f}")
    print(f"Final Test ROC AUC (Macro-Average): {macro_auc:.4f}")

except ValueError as e:
    print(f"\nCould not compute ROC AUC. Error: {e}")
    print("This often happens if the model's predictions only contain a single class.")


# --- C. Classification Report & Feature Importances ---
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test, target_names=target_classes))

feature_importances = pd.Series(best_model.feature_importances_, index=X.columns)
print("\nTop 10 Feature Importances:")
print(feature_importances.nlargest(10))

--- 1. Data Loading and Preprocessing (Base DataSet) ---
Base DataSet loaded successfully. Cleaned shape: (16421, 118)
Target classes encoded (3): ['CN' 'Dementia' 'MCI']
Features remaining after dropping >50% NaN columns: 110
Final feature matrix shape: (16421, 106)

--- 3. Starting Grid Search CV (Scoring: f1_weighted) ---
Fitting 3 folds for each of 108 candidates, totalling 324 fits


  X = X.replace({True: 1, False: 0}).select_dtypes(include=np.number)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Grid Search Results ---
Best Hyperparameters found: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 500, 'subsample': 0.7}
Best CV Score (F1_WEIGHTED): 0.9424

--- 4. Final Evaluation on UNSEEN Test Set ---
Final Test Weighted F1-Score: 0.9556
Final Test Accuracy: 0.9556

Final Test ROC AUC (Micro-Average): 0.9942
Final Test ROC AUC (Macro-Average): 0.9932

Classification Report (Test Set):
              precision    recall  f1-score   support

          CN       0.97      0.94      0.96       402
    Dementia       0.91      0.92      0.92       245
         MCI       0.96      0.97      0.96       996

    accuracy                           0.96      1643
   macro avg       0.95      0.94      0.95      1643
weighted avg       0.96      0.96      0.96      1643


Top 10 Feature Importances:
CDRSB           0.172893
mPACCtrailsB    0.086407
CDRSB_bl        0.055830
mPACCdigit      0.049990
FAQ             0.049743
LDELTOTAL_BL    0.030638
MMSE     