In [1]:
# --------------------------------------------------------------
#  Dementia Risk Prediction â€“ Non-Medical Features Only
#  (XGBoost / LightGBM + Random Forest baseline)
# --------------------------------------------------------------

import pandas as pd
import numpy as np
import warnings, os
warnings.filterwarnings('ignore')

In [2]:
# ---------- 1. Load data ----------
df = pd.read_csv('Dementia Prediction Dataset.csv')

In [3]:
# ---------- 2. Compute age ----------
df['AGE'] = df['VISITYR'] - df['BIRTHYR']

In [4]:
# ---------- 3. Feature list (non-medical, self-knowable) ----------
features = [
    # Demographics
    'AGE', 'SEX', 'HISPANIC', 'RACE', 'EDUC', 'MARISTAT', 'INDEPEND', 'RESIDENC', 'HANDED',
    # Lifestyle
    'TOBAC30', 'TOBAC100', 'SMOKYRS', 'PACKSPER', 'QUITSMOK', 'ALCOCCAS', 'ALCFREQ',
    # Social
    'NACCLIVS', 'INLIVWTH', 'INVISITS', 'INCALLS', 'INRELY',
    # Family history
    'NACCFAM', 'NACCMOM', 'NACCDAD',
    # Simple self-reported diagnoses (yes/no or year)
    'CVHATT','HATTMULT','CVAFIB','CVANGIO','CVBYPASS','CVPACDEF','CVCHF','CVANGINA','CVHVALVE',
    'CBSTROKE','STROKMUL','CBTIA','TIAMULT','PD','SEIZURES',
    'TBI','TBIBRIEF','TBIEXTEN','TBIWOLOS',
    'DIABETES','HYPERTEN','HYPERCHO','THYROID','ARTHRIT',
    'INCONTU','INCONTF','APNEA',
    'ALCOHOL','ABUSOTHR','PTSD','BIPOLAR','SCHIZ','ANXIETY','DEP2YRS',
    # Height / weight / vision / hearing
    'HEIGHT','WEIGHT','VISION','HEARING'
]
target = 'DEMENTED'

# Keep only rows with a label
df = df[features + [target]].dropna(subset=[target])

In [5]:
# ---------- 4. Replace NACC special missing codes with NaN ----------
missing_codes = [-4, 8, 9, 88, 99, 888, 999, 8888, 9999]
for col in df.columns:
    df[col] = df[col].replace(missing_codes, np.nan)

In [6]:
# ---------- 5. Separate numeric / categorical ----------
numeric_cols   = df[features].select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols = [c for c in features if c not in numeric_cols]

In [7]:
# ---------- 6. Train / test split ----------
from sklearn.model_selection import train_test_split
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

In [10]:
# --------------------------------------------------------------
# 7. Model pipelines (no imputer needed for XGBoost / LightGBM)
# --------------------------------------------------------------
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# One-hot for categorical columns only
cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocess = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, categorical_cols)
    ], remainder='passthrough'   # numeric columns stay untouched
)

# ---------- Random Forest (baseline) ----------
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1
)

rf_pipe = Pipeline(steps=[('prep', preprocess),
                          ('clf', rf)])

# ---------- XGBoost ----------
import xgboost as xgb
xgb_clf = xgb.XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss',
    use_label_encoder=False
)

xgb_pipe = Pipeline(steps=[('prep', preprocess),
                           ('clf', xgb_clf)])

# ---------- LightGBM ----------
import lightgbm as lgb
lgb_clf = lgb.LGBMClassifier(
    n_estimators=400,
    max_depth=-1,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

lgb_pipe = Pipeline(steps=[('prep', preprocess),
                           ('clf', lgb_clf)])

In [11]:
# --------------------------------------------------------------
# 8. Hyper-parameter tuning (RandomizedSearchCV)
# --------------------------------------------------------------
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

def tune_and_evaluate(pipe, param_dist, name):
    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_dist,
        n_iter=30,
        cv=5,
        scoring='roc_auc',
        random_state=42,
        n_jobs=-1,
        verbose=1
    )
    search.fit(X_train, y_train)
    best = search.best_estimator_
    print(f"\n=== {name} BEST AUC (CV): {search.best_score_:.4f} ===")
    return best

# ---- RF params ----
rf_params = {
    'clf__n_estimators': randint(200, 800),
    'clf__max_depth'   : [None] + list(randint(5, 30).rvs(5)),
    'clf__min_samples_split': randint(2, 10),
    'clf__min_samples_leaf' : randint(1, 5)
}

# ---- XGBoost params ----
xgb_params = {
    'clf__n_estimators'     : randint(200, 800),
    'clf__max_depth'        : randint(3, 10),
    'clf__learning_rate'    : uniform(0.01, 0.2),
    'clf__subsample'        : uniform(0.6, 0.4),
    'clf__colsample_bytree' : uniform(0.6, 0.4)
}

# ---- LightGBM params ----
lgb_params = {
    'clf__n_estimators'     : randint(200, 800),
    'clf__max_depth'        : randint(3, 12),
    'clf__learning_rate'    : uniform(0.01, 0.2),
    'clf__subsample'        : uniform(0.6, 0.4),
    'clf__colsample_bytree' : uniform(0.6, 0.4),
    'clf__num_leaves'       : randint(20, 150)
}

print("Tuning Random Forest...")
rf_best = tune_and_evaluate(rf_pipe, rf_params, "Random Forest")

print("\nTuning XGBoost...")
xgb_best = tune_and_evaluate(xgb_pipe, xgb_params, "XGBoost")

print("\nTuning LightGBM...")
lgb_best = tune_and_evaluate(lgb_pipe, lgb_params, "LightGBM")

Tuning Random Forest...
Fitting 5 folds for each of 30 candidates, totalling 150 fits

=== Random Forest BEST AUC (CV): 0.9474 ===

Tuning XGBoost...
Fitting 5 folds for each of 30 candidates, totalling 150 fits

=== XGBoost BEST AUC (CV): 0.9455 ===

Tuning LightGBM...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LightGBM] [Info] Number of positive: 46072, number of negative: 110084
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033517 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 978
[LightGBM] [Info] Number of data points in the train set: 156156, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.295038 -> initscore=-0.871038
[LightGBM] [Info] Start training from score -0.871038

=== LightGBM BEST AUC (CV): 0.9448 ===


In [12]:
# --------------------------------------------------------------
# 9. Final evaluation on hold-out test set
# --------------------------------------------------------------
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

def evaluate(model, name):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    print(f"\n--- {name} (test) ---")
    print(f"Accuracy : {acc:.4f}")
    print(f"ROC-AUC  : {auc:.4f}")
    print(classification_report(y_test, y_pred, digits=4))
    return auc

auc_rf  = evaluate(rf_best,  "Random Forest")
auc_xgb = evaluate(xgb_best, "XGBoost")
auc_lgb = evaluate(lgb_best, "LightGBM")

# Pick the winner
best_model = max([(auc_rf, rf_best, "Random Forest"),
                  (auc_xgb, xgb_best, "XGBoost"),
                  (auc_lgb, lgb_best, "LightGBM")], key=lambda x: x[0])[1]

print("\n" + "="*60)
print(f"BEST MODEL SELECTED: {best_model.named_steps['clf'].__class__.__name__}")
print("="*60)


--- Random Forest (test) ---
Accuracy : 0.9079
ROC-AUC  : 0.9495
              precision    recall  f1-score   support

           0     0.9339    0.9355    0.9347     27522
           1     0.8453    0.8418    0.8435     11518

    accuracy                         0.9079     39040
   macro avg     0.8896    0.8887    0.8891     39040
weighted avg     0.9078    0.9079    0.9078     39040


--- XGBoost (test) ---
Accuracy : 0.9066
ROC-AUC  : 0.9443
              precision    recall  f1-score   support

           0     0.9299    0.9381    0.9340     27522
           1     0.8490    0.8311    0.8400     11518

    accuracy                         0.9066     39040
   macro avg     0.8895    0.8846    0.8870     39040
weighted avg     0.9061    0.9066    0.9063     39040


--- LightGBM (test) ---
Accuracy : 0.9068
ROC-AUC  : 0.9440
              precision    recall  f1-score   support

           0     0.9308    0.9375    0.9341     27522
           1     0.8481    0.8334    0.8407     11

In [13]:
# --------------------------------------------------------------
# 10. Save the final model (optional)
# --------------------------------------------------------------
import joblib
joblib.dump(best_model, 'dementia_risk_nonmedical_best.pkl')
print("\nModel saved as 'dementia_risk_nonmedical_best.pkl'")


Model saved as 'dementia_risk_nonmedical_best.pkl'


In [14]:
# --------------------------------------------------------------
# 11. Example: predict risk % for a new person
# --------------------------------------------------------------
def predict_risk(row_dict):
    """
    row_dict: dict with the same column names as `features`.
    Returns risk percentage (0-100).
    """
    new = pd.DataFrame([row_dict])
    prob = best_model.predict_proba(new)[0, 1]
    return round(prob * 100, 2)

# ---- Demo ----
demo = {
    'AGE': 78, 'SEX': 2, 'HISPANIC': 0, 'RACE': 1, 'EDUC': 12,
    'MARISTAT': 1, 'INDEPEND': 1, 'RESIDENC': 1, 'HANDED': 1,
    'TOBAC30': 0, 'TOBAC100': 1, 'SMOKYRS': 20, 'PACKSPER': 1, 'QUITSMOK': 2000,
    'ALCOCCAS': 1, 'ALCFREQ': 2,
    'NACCLIVS': 1, 'INLIVWTH': 1, 'INVISITS': 3, 'INCALLS': 4, 'INRELY': 1,
    'NACCFAM': 1, 'NACCMOM': 1, 'NACCDAD': 0,
    'CVHATT': 0, 'HATTMULT': 0, 'CVAFIB': 0, 'CVANGIO': 0, 'CVBYPASS': 0,
    'CVPACDEF': 0, 'CVCHF': 0, 'CVANGINA': 0, 'CVHVALVE': 0,
    'CBSTROKE': 0, 'STROKMUL': 0, 'CBTIA': 0, 'TIAMULT': 0,
    'PD': 0, 'SEIZURES': 0,
    'TBI': 0, 'TBIBRIEF': 0, 'TBIEXTEN': 0, 'TBIWOLOS': 0,
    'DIABETES': 1, 'HYPERTEN': 1, 'HYPERCHO': 1, 'THYROID': 0, 'ARTHRIT': 1,
    'INCONTU': 0, 'INCONTF': 0, 'APNEA': 0,
    'ALCOHOL': 0, 'ABUSOTHR': 0, 'PTSD': 0, 'BIPOLAR': 0, 'SCHIZ': 0,
    'ANXIETY': 0, 'DEP2YRS': 0,
    'HEIGHT': 165, 'WEIGHT': 68, 'VISION': 0, 'HEARING': 0
}
risk_pct = predict_risk(demo)
print(f"\nDemo person dementia risk: {risk_pct}%")


Demo person dementia risk: 26.19%
