# ADNI MERGE FINAL with RAW DX

**XGBoost:**

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from joblib import dump

# =========================================================
# XGBoost classifier for the DX column of
# ADNI_MERGE_FINAL_with_RAW_DX.csv.
#
# Steps: load the CSV, drop ID-like columns and unlabeled rows, encode the
# target, make a stratified 80/20 split, impute/encode the features, train,
# report test metrics, and persist the fitted pipeline + label encoder.
# =========================================================

# =========================================================
# 1. Load data
# =========================================================
csv_path = "ADNI_MERGE_FINAL_with_RAW_DX.csv"   # put the CSV in the same folder or give full path
df = pd.read_csv(csv_path)

print("Shape:", df.shape)
print("Columns:", list(df.columns))

# =========================================================
# 2. Choose target column & drop ID-like columns from features
# =========================================================
# Use DX as the label (change to "DIAGNOSIS" if you prefer)
TARGET_COL = "DX"

# Some columns that are clearly IDs / meta and shouldn't be used as predictors
ID_COLUMNS = [
    "RID", "PTID", "RID.1", "ID", "SITEID", "USERDATE2"
]

# Rows without a label cannot be used for supervised training. If they are
# left in, the missing value is either encoded as a spurious extra class or
# makes the label encoder fail (depending on dtype / scikit-learn version).
n_rows_before = len(df)
df = df.dropna(subset=[TARGET_COL])
print("Dropped rows with missing target:", n_rows_before - len(df))

# Keep only those ID columns that actually exist in the dataframe
id_cols_present = [c for c in ID_COLUMNS if c in df.columns]

# Separate X and y
# NOTE(review): the column list printed by this cell includes 'DX_bl' and
# 'PHC_Diagnosis'; these look diagnosis-derived and may leak the target into
# the features -- confirm, and add them to the dropped columns if so.
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL] + id_cols_present)

print("Features shape:", X.shape)
print("Target value counts:")
print(y.value_counts())

# =========================================================
# 3. Encode target labels (handles string labels like CN/MCI/AD)
# =========================================================
label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)

num_classes = len(label_encoder.classes_)
print("Number of classes in target:", num_classes)
print("Classes:", label_encoder.classes_)

# Decide XGBoost objective based on number of classes
if num_classes <= 2:
    objective = "binary:logistic"
    eval_metric = "logloss"
else:
    objective = "multi:softprob"
    eval_metric = "mlogloss"

# =========================================================
# 4. Split into train / test
# =========================================================
# NOTE(review): presumably the file holds several visits per participant
# (RID). If so, this row-level split can put the same participant in both
# train and test and inflate the scores -- consider a group-aware split on
# RID. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc  # preserves class distribution
)

# =========================================================
# 5. Build preprocessing for numeric & categorical features
# =========================================================
numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=["number"]).columns.tolist()

print("Numeric columns:", len(numeric_cols))
print("Categorical columns:", len(categorical_cols))

# Numeric features: fill gaps with the column median.
numeric_transformer = SimpleImputer(strategy="median")

# Categorical features: fill gaps with the mode, then map categories to
# integers; categories unseen during fit are encoded as -1 at predict time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# =========================================================
# 6. Define XGBoost model
# =========================================================
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective=objective,
    eval_metric=eval_metric,
    # fast on CPU; on GPU keep "hist" and add device="cuda"
    # (XGBoost >= 2.0, where "gpu_hist" is deprecated)
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
)

if num_classes > 2:
    xgb_params["num_class"] = num_classes

xgb_model = XGBClassifier(**xgb_params)

# Full pipeline: preprocessing + model
clf = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", xgb_model),
    ]
)

# =========================================================
# 7. Train
# =========================================================
print("\nTraining XGBoost model...")
clf.fit(X_train, y_train)

# =========================================================
# 8. Evaluate
# =========================================================
print("\nEvaluating on test set...")
y_pred_enc = clf.predict(X_test)

# Decoded (original-label) versions of predictions and ground truth; not
# used further in this cell.
y_pred_labels = label_encoder.inverse_transform(y_pred_enc)
y_test_labels = label_encoder.inverse_transform(y_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred_enc))

# classification_report needs string names, and an explicit label list keeps
# names and rows aligned even if a class is absent from the test split.
class_names = [str(c) for c in label_encoder.classes_]

print("\nClassification report (per class):")
print(classification_report(
    y_test,
    y_pred_enc,
    labels=list(range(num_classes)),
    target_names=class_names
))

# =========================================================
# 9. Save model + label encoder
# =========================================================
dump(clf, "adni_xgb_pipeline.joblib")
dump(label_encoder, "adni_label_encoder.joblib")
print("\nSaved pipeline to 'adni_xgb_pipeline.joblib' and label encoder to 'adni_label_encoder.joblib'.")


Shape: (16420, 151)
Columns: ['RID', 'PTID', 'COLPROT', 'ORIGPROT', 'EXAMDATE', 'DX_bl', 'APOE4', 'AV45', 'CDRSB_x', 'DIGITSCOR', 'EcogPtMem', 'EcogSPMem', 'EcogSPLang', 'EcogSPVisspat', 'EcogSPPlan', 'EcogSPOrgan', 'EcogSPDivatt', 'EcogSPTotal', 'FLDSTRENG', 'FSVERSION', 'Hippocampus', 'WholeBrain', 'Entorhinal', 'Fusiform', 'ICV', 'DX', 'DIGITSCOR_bl', 'mPACCtrailsB_bl', 'Ventricles_bl', 'WholeBrain_bl', 'Fusiform_bl', 'MOCA_bl', 'EcogPtMem_bl', 'EcogPtLang_bl', 'EcogPtPlan_bl', 'EcogPtOrgan_bl', 'EcogPtDivatt_bl', 'EcogPtTotal_bl', 'EcogSPMem_bl', 'EcogSPLang_bl', 'EcogSPVisspat_bl', 'EcogSPPlan_bl', 'EcogSPOrgan_bl', 'EcogSPDivatt_bl', 'EcogSPTotal_bl', 'ABETA_bl', 'PIB_bl', 'AV45_bl', 'Years_bl', 'Month_bl', 'PTETHCAT_Not Hisp/Latino', 'PTRACCAT_Asian', 'PTRACCAT_Black', 'PTRACCAT_White', 'GENOTYPE', 'APTESTDT', 'APVOLUME', 'APRECEIVE', 'APAMBTEMP', 'RID.1', 'PHC_Visit', 'PHC_Age_Cognition', 'PHC_Diagnosis', 'PHC_Sex', 'PHC_Race', 'PHC_Ethnicity', 'PHC_Education', 'PHC_MEM', 'PHC_

**Random Forest:**

In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from joblib import dump

# =========================================================
# Random Forest classifier for the DX column of
# ADNI_MERGE_FINAL_with_RAW_DX.csv.
#
# Steps: load the CSV, drop ID-like columns and unlabeled rows, encode the
# target, make a stratified 80/20 split, impute/encode the features, train,
# report test metrics, and persist the fitted pipeline + label encoder.
# =========================================================

# =========================================================
# 1. Load data
# =========================================================
csv_path = "ADNI_MERGE_FINAL_with_RAW_DX.csv"
df = pd.read_csv(csv_path)

print("Shape:", df.shape)
print("Columns:", list(df.columns))

# =========================================================
# 2. Target + drop ID-like columns
# =========================================================
TARGET_COL = "DX"

ID_COLUMNS = [
    "RID", "PTID", "RID.1", "ID", "SITEID", "USERDATE2"
]

# Rows without a label cannot be used for supervised training. If they are
# left in, the missing value is either encoded as a spurious extra class or
# makes the label encoder fail (depending on dtype / scikit-learn version).
n_rows_before = len(df)
df = df.dropna(subset=[TARGET_COL])
print("Dropped rows with missing target:", n_rows_before - len(df))

# Only drop the ID columns that this file actually contains.
id_cols_present = [c for c in ID_COLUMNS if c in df.columns]

# NOTE(review): the column list printed by this cell includes 'DX_bl' and
# 'PHC_Diagnosis'; these look diagnosis-derived and may leak the target into
# the features -- confirm, and add them to the dropped columns if so.
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL] + id_cols_present)

print("Features shape:", X.shape)
print("Target value counts:")
print(y.value_counts())

# =========================================================
# 3. Encode target (DX)
# =========================================================
label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)

print("Number of classes:", len(label_encoder.classes_))
print("Classes:", label_encoder.classes_)

# =========================================================
# 4. Train / test split
# =========================================================
# NOTE(review): presumably the file holds several visits per participant
# (RID). If so, this row-level split can put the same participant in both
# train and test and inflate the scores -- consider a group-aware split on
# RID. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc
)

# =========================================================
# 5. Preprocessing: numeric & categorical
# =========================================================
numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=["number"]).columns.tolist()

print("Numeric columns:", len(numeric_cols))
print("Categorical columns:", len(categorical_cols))

# Numeric features: fill gaps with the column median.
numeric_transformer = SimpleImputer(strategy="median")

# Categorical features: fill gaps with the mode, then map categories to
# integers; categories unseen during fit are encoded as -1 at predict time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# =========================================================
# 6. Random Forest model
# =========================================================
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,          # let trees grow fully; you can tune this
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    n_jobs=-1,
    random_state=42,
    class_weight="balanced"  # helps if classes are imbalanced
)

clf = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", rf_model),
    ]
)

# =========================================================
# 7. Train
# =========================================================
print("\nTraining Random Forest on ADNI_MERGE_FINAL_with_RAW_DX.csv ...")
clf.fit(X_train, y_train)

# =========================================================
# 8. Evaluate
# =========================================================
y_pred = clf.predict(X_test)

# classification_report needs string names, and an explicit label list keeps
# names and rows aligned even if a class is absent from the test split.
class_names = [str(c) for c in label_encoder.classes_]

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=list(range(len(class_names))),
        target_names=class_names
    )
)

# =========================================================
# 9. Save model + encoder
# =========================================================
dump(clf, "rf_adni_raw_dx_pipeline.joblib")
dump(label_encoder, "rf_adni_raw_dx_label_encoder.joblib")
print("\nSaved pipeline to 'rf_adni_raw_dx_pipeline.joblib' and label encoder to 'rf_adni_raw_dx_label_encoder.joblib'.")

Shape: (16420, 151)
Columns: ['RID', 'PTID', 'COLPROT', 'ORIGPROT', 'EXAMDATE', 'DX_bl', 'APOE4', 'AV45', 'CDRSB_x', 'DIGITSCOR', 'EcogPtMem', 'EcogSPMem', 'EcogSPLang', 'EcogSPVisspat', 'EcogSPPlan', 'EcogSPOrgan', 'EcogSPDivatt', 'EcogSPTotal', 'FLDSTRENG', 'FSVERSION', 'Hippocampus', 'WholeBrain', 'Entorhinal', 'Fusiform', 'ICV', 'DX', 'DIGITSCOR_bl', 'mPACCtrailsB_bl', 'Ventricles_bl', 'WholeBrain_bl', 'Fusiform_bl', 'MOCA_bl', 'EcogPtMem_bl', 'EcogPtLang_bl', 'EcogPtPlan_bl', 'EcogPtOrgan_bl', 'EcogPtDivatt_bl', 'EcogPtTotal_bl', 'EcogSPMem_bl', 'EcogSPLang_bl', 'EcogSPVisspat_bl', 'EcogSPPlan_bl', 'EcogSPOrgan_bl', 'EcogSPDivatt_bl', 'EcogSPTotal_bl', 'ABETA_bl', 'PIB_bl', 'AV45_bl', 'Years_bl', 'Month_bl', 'PTETHCAT_Not Hisp/Latino', 'PTRACCAT_Asian', 'PTRACCAT_Black', 'PTRACCAT_White', 'GENOTYPE', 'APTESTDT', 'APVOLUME', 'APRECEIVE', 'APAMBTEMP', 'RID.1', 'PHC_Visit', 'PHC_Age_Cognition', 'PHC_Diagnosis', 'PHC_Sex', 'PHC_Race', 'PHC_Ethnicity', 'PHC_Education', 'PHC_MEM', 'PHC_

# ADNI MERGE Processed

**XGBoost:**

In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from joblib import dump

# =========================================================
# XGBoost classifier for the DX column of ADNI_MERGE_processed.csv.
#
# Steps: load the CSV, drop unlabeled rows (and any listed ID columns),
# encode the target, make a stratified 80/20 split, impute/encode the
# features, train, report test metrics, and persist the fitted pipeline +
# label encoder.
# =========================================================

# =========================================================
# 1. Load data
# =========================================================
csv_path = "ADNI_MERGE_processed.csv"   # make sure the file is here or use full path
df = pd.read_csv(csv_path)

print("Shape:", df.shape)
print("Columns:", list(df.columns))

# =========================================================
# 2. Choose target column & (optionally) drop ID/meta columns
# =========================================================
TARGET_COL = "DX"   # change to "DIAGNOSIS" if needed

# If you later discover any "ID-like" columns here, add them to this list
ID_COLUMNS = [
    # e.g. "RID", "PTID" etc., if they exist in this file
]

# Rows without a label cannot be used for supervised training. If they are
# left in, the missing value is either encoded as a spurious extra class or
# makes the label encoder fail (depending on dtype / scikit-learn version).
n_rows_before = len(df)
df = df.dropna(subset=[TARGET_COL])
print("Dropped rows with missing target:", n_rows_before - len(df))

# Only drop the ID columns that this file actually contains.
id_cols_present = [c for c in ID_COLUMNS if c in df.columns]

# NOTE(review): the column list printed by this cell includes 'DX_bl'; it
# looks diagnosis-derived and may leak the target into the features --
# confirm, and add it to the dropped columns if so.
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL] + id_cols_present)

print("Features shape:", X.shape)
print("Target value counts:")
print(y.value_counts())

# =========================================================
# 3. Encode target labels
# =========================================================
label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)

num_classes = len(label_encoder.classes_)
print("Number of classes in target:", num_classes)
print("Classes:", label_encoder.classes_)

# Pick the XGBoost objective / metric pair matching the number of classes.
if num_classes <= 2:
    objective = "binary:logistic"
    eval_metric = "logloss"
else:
    objective = "multi:softprob"
    eval_metric = "mlogloss"

# =========================================================
# 4. Train / test split
# =========================================================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc,
)

# =========================================================
# 5. Preprocessing: numeric & categorical
# =========================================================
numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=["number"]).columns.tolist()

print("Numeric columns:", len(numeric_cols))
print("Categorical columns:", len(categorical_cols))

# Numeric features: fill gaps with the column median.
numeric_transformer = SimpleImputer(strategy="median")

# Categorical features: fill gaps with the mode, then map categories to
# integers; categories unseen during fit are encoded as -1 at predict time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# =========================================================
# 6. XGBoost model
# =========================================================
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective=objective,
    eval_metric=eval_metric,
    # fast on CPU; on GPU keep "hist" and add device="cuda"
    # (XGBoost >= 2.0, where "gpu_hist" is deprecated)
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
)

if num_classes > 2:
    xgb_params["num_class"] = num_classes

xgb_model = XGBClassifier(**xgb_params)

clf = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", xgb_model),
    ]
)

# =========================================================
# 7. Train
# =========================================================
print("\nTraining XGBoost model on ADNI_MERGE_processed.csv...")
clf.fit(X_train, y_train)

# =========================================================
# 8. Evaluate
# =========================================================
print("\nEvaluating on test set...")
y_pred_enc = clf.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred_enc))

print("\nClass mapping (encoded -> original DX value):")
for i, cls in enumerate(label_encoder.classes_):
    print(f"{i} -> {cls}")

# Convert class names to strings for classification_report
class_names = [str(c) for c in label_encoder.classes_]

# An explicit label list keeps names and rows aligned even if a class is
# absent from the test split.
print("\nClassification report:")
print(
    classification_report(
        y_test,
        y_pred_enc,
        labels=list(range(num_classes)),
        target_names=class_names
    )
)

# =========================================================
# 9. Save model + encoder
# =========================================================
dump(clf, "adni_processed_xgb_pipeline.joblib")
dump(label_encoder, "adni_processed_label_encoder.joblib")
print("\nSaved pipeline to 'adni_processed_xgb_pipeline.joblib' and label encoder to 'adni_processed_label_encoder.joblib'.")

Shape: (16421, 115)
Columns: ['COLPROT', 'ORIGPROT', 'EXAMDATE', 'DX_bl', 'AGE', 'PTGENDER', 'PTEDUCAT', 'PTMARRY', 'APOE4', 'FDG', 'PIB', 'AV45', 'FBB', 'ABETA', 'TAU', 'PTAU', 'CDRSB', 'ADAS11', 'ADAS13', 'ADASQ4', 'MMSE', 'RAVLT_immediate', 'RAVLT_learning', 'RAVLT_forgetting', 'RAVLT_perc_forgetting', 'LDELTOTAL', 'DIGITSCOR', 'TRABSCOR', 'FAQ', 'MOCA', 'EcogPtMem', 'EcogPtLang', 'EcogPtVisspat', 'EcogPtPlan', 'EcogPtOrgan', 'EcogPtDivatt', 'EcogPtTotal', 'EcogSPMem', 'EcogSPLang', 'EcogSPVisspat', 'EcogSPPlan', 'EcogSPOrgan', 'EcogSPDivatt', 'EcogSPTotal', 'FLDSTRENG', 'FSVERSION', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', 'Fusiform', 'MidTemp', 'ICV', 'DX', 'mPACCdigit', 'mPACCtrailsB', 'EXAMDATE_bl', 'CDRSB_bl', 'ADAS11_bl', 'ADAS13_bl', 'ADASQ4_bl', 'MMSE_bl', 'RAVLT_immediate_bl', 'RAVLT_learning_bl', 'RAVLT_forgetting_bl', 'RAVLT_perc_forgetting_bl', 'LDELTOTAL_BL', 'DIGITSCOR_bl', 'TRABSCOR_bl', 'FAQ_bl', 'mPACCdigit_bl', 'mPACCtrailsB_bl', 'FLDSTRENG_bl', 'FS

**Random Forest:**

In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from joblib import dump

# =========================================================
# Random Forest classifier for the DX column of ADNI_MERGE_processed.csv.
#
# Steps: load the CSV, drop unlabeled rows (and any listed ID columns),
# encode the target, make a stratified 80/20 split, impute/encode the
# features, train, report test metrics, and persist the fitted pipeline +
# label encoder.
# =========================================================

# =========================================================
# 1. Load data
# =========================================================
csv_path = "ADNI_MERGE_processed.csv"
df = pd.read_csv(csv_path)

print("Shape:", df.shape)
print("Columns:", list(df.columns))

# =========================================================
# 2. Target + (optional) ID columns
# =========================================================
TARGET_COL = "DX"

ID_COLUMNS = [
    # Add any ID-like columns here if present (e.g. "RID", "PTID", etc.)
]

# Rows without a label cannot be used for supervised training. If they are
# left in, the missing value is either encoded as a spurious extra class or
# makes the label encoder fail (depending on dtype / scikit-learn version).
n_rows_before = len(df)
df = df.dropna(subset=[TARGET_COL])
print("Dropped rows with missing target:", n_rows_before - len(df))

# Only drop the ID columns that this file actually contains.
id_cols_present = [c for c in ID_COLUMNS if c in df.columns]

# NOTE(review): the column list printed by this cell includes 'DX_bl'; it
# looks diagnosis-derived and may leak the target into the features --
# confirm, and add it to the dropped columns if so.
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL] + id_cols_present)

print("Features shape:", X.shape)
print("Target value counts:")
print(y.value_counts())

# =========================================================
# 3. Encode target
# =========================================================
label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)

print("Number of classes:", len(label_encoder.classes_))
print("Classes:", label_encoder.classes_)

# =========================================================
# 4. Train / test split
# =========================================================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc
)

# =========================================================
# 5. Preprocessing
# =========================================================
numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=["number"]).columns.tolist()

print("Numeric columns:", len(numeric_cols))
print("Categorical columns:", len(categorical_cols))

# Numeric features: fill gaps with the column median.
numeric_transformer = SimpleImputer(strategy="median")

# Categorical features: fill gaps with the mode, then map categories to
# integers; categories unseen during fit are encoded as -1 at predict time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# =========================================================
# 6. Random Forest model
# =========================================================
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    n_jobs=-1,
    random_state=42,
    class_weight="balanced"
)

clf = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", rf_model),
    ]
)

# =========================================================
# 7. Train
# =========================================================
print("\nTraining Random Forest on ADNI_MERGE_processed.csv ...")
clf.fit(X_train, y_train)

# =========================================================
# 8. Evaluate
# =========================================================
print("\nEvaluating on test set...")
y_pred_enc = clf.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred_enc))

print("\nClass mapping (encoded -> original DX value):")
for i, cls in enumerate(label_encoder.classes_):
    print(f"{i} -> {cls}")

# Convert class names to strings for classification_report
class_names = [str(c) for c in label_encoder.classes_]

# An explicit label list keeps names and rows aligned even if a class is
# absent from the test split.
print("\nClassification report:")
print(
    classification_report(
        y_test,
        y_pred_enc,
        labels=list(range(len(class_names))),
        target_names=class_names
    )
)

# =========================================================
# 9. Save model + encoder
# =========================================================
dump(clf, "rf_adni_processed_pipeline.joblib")
dump(label_encoder, "rf_adni_processed_label_encoder.joblib")
print("\nSaved pipeline to 'rf_adni_processed_pipeline.joblib' and label encoder to 'rf_adni_processed_label_encoder.joblib'.")


Shape: (16421, 115)
Columns: ['COLPROT', 'ORIGPROT', 'EXAMDATE', 'DX_bl', 'AGE', 'PTGENDER', 'PTEDUCAT', 'PTMARRY', 'APOE4', 'FDG', 'PIB', 'AV45', 'FBB', 'ABETA', 'TAU', 'PTAU', 'CDRSB', 'ADAS11', 'ADAS13', 'ADASQ4', 'MMSE', 'RAVLT_immediate', 'RAVLT_learning', 'RAVLT_forgetting', 'RAVLT_perc_forgetting', 'LDELTOTAL', 'DIGITSCOR', 'TRABSCOR', 'FAQ', 'MOCA', 'EcogPtMem', 'EcogPtLang', 'EcogPtVisspat', 'EcogPtPlan', 'EcogPtOrgan', 'EcogPtDivatt', 'EcogPtTotal', 'EcogSPMem', 'EcogSPLang', 'EcogSPVisspat', 'EcogSPPlan', 'EcogSPOrgan', 'EcogSPDivatt', 'EcogSPTotal', 'FLDSTRENG', 'FSVERSION', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', 'Fusiform', 'MidTemp', 'ICV', 'DX', 'mPACCdigit', 'mPACCtrailsB', 'EXAMDATE_bl', 'CDRSB_bl', 'ADAS11_bl', 'ADAS13_bl', 'ADASQ4_bl', 'MMSE_bl', 'RAVLT_immediate_bl', 'RAVLT_learning_bl', 'RAVLT_forgetting_bl', 'RAVLT_perc_forgetting_bl', 'LDELTOTAL_BL', 'DIGITSCOR_bl', 'TRABSCOR_bl', 'FAQ_bl', 'mPACCdigit_bl', 'mPACCtrailsB_bl', 'FLDSTRENG_bl', 'FS