# Model_X (version 3)

After the previous version, we noticed that this dataset contains more than one row per person (the same patient appears in multiple rows).

# 1. Setup and Data Loading

In [2]:
# --- 1. Install Necessary Libraries ---
# (Run these lines once if you don't have them in your new environment)
# !pip install xgboost catboost shap

# --- 2. Import All Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Preprocessing tools
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Model building and evaluation
# --- We now import GroupShuffleSplit for the patient-aware split ---
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool

# --- 3. Load Data ---
# IMPORTANT: Update this path to where your CSV file is located
DATA_FILE_PATH = './Dataset/Dementia Prediction Dataset.csv' 
TARGET_COLUMN = 'DEMENTED'
GROUP_COLUMN = 'NACCID'  # This is the Patient ID

try:
    # Use low_memory=False to avoid DtypeWarning during dtype inference.
    data = pd.read_csv(DATA_FILE_PATH, low_memory=False)

    # Actually verify that the required columns exist before reporting them
    # as "found". Raising KeyError here routes a bad file into the same
    # fallback path as a missing file.
    missing_columns = [
        col for col in (TARGET_COLUMN, GROUP_COLUMN) if col not in data.columns
    ]
    if missing_columns:
        raise KeyError(f"Missing required column(s): {missing_columns}")
except (FileNotFoundError, KeyError) as load_error:
    print("Error: Could not load data or find all necessary columns.")
    print("Please check DATA_FILE_PATH, TARGET_COLUMN, and GROUP_COLUMN variables.")
    print(f"Details: {load_error}")
    # Create a small dummy dataset for demonstration.
    # It has repeated NACCID values (several rows per patient) and contains
    # the special "unknown" codes (999, 99, 9, 888.8) used elsewhere in this
    # notebook's cleaning step.
    data = pd.DataFrame({
        'NACCID': ['A', 'A', 'B', 'C', 'C', 'C', 'D', 'E', 'E', 'F'],
        'NACCAGE': [65, 66, 70, 80, 81, 82, 75, 999, 68, 69],
        'SEX': [1, 1, 2, 1, 1, 1, 2, 1, 2, 1],
        'EDUC': [12, 12, 16, 8, 8, 8, 20, 99, 14, 16],
        'MARISTAT': [1, 1, 2, 1, 1, 5, 5, 9, 2, 1],
        'RACE': [1, 1, 1, 2, 2, 2, 5, 99, 1, 1],
        'INRELTO': [1, 1, 2, 9, 3, 3, 3, 1, 1, 2],
        'CVHATT': [0, 0, 1, 2, 2, 2, 0, 9, 0, 1],
        'CBSTROKE': [0, 0, 0, 1, 1, 1, 0, 9, 0, 0],
        'DIABETES': [0, 1, 2, 1, 1, 1, 0, 9, 1, 0],
        'HYPERTEN': [1, 1, 1, 0, 0, 0, 0, 9, 1, 1],
        'HYPERCHO': [1, 1, 0, 2, 2, 9, 0, 0, 1, 0],
        'TBI': [0, 0, 0, 0, 0, 0, 0, 9, 1, 0],
        'DEP2YRS': [0, 0, 1, 0, 0, 0, 0, 9, 1, 0],
        'NACCBMI': [25.1, 25.5, 28.9, 32.0, 32.1, 32.2, 22.4, 888.8, 26.0, 27.0],
        'DEMENTED': [0, 0, 1, 1, 1, 1, 0, 1, 0, 1],
    })
    print(f"Loaded dummy data for demonstration. Shape: {data.shape}")
else:
    # Only reached when the file was read AND both required columns exist.
    print(f"Successfully loaded data. Shape: {data.shape}")
    print(f"Target column '{TARGET_COLUMN}' found.")
    print(f"Group column '{GROUP_COLUMN}' found.")

Successfully loaded data. Shape: (195196, 1024)
Target column 'DEMENTED' found.
Group column 'NACCID' found.


# 2. Preprocessing and Feature Engineering

In [None]:
# --- 1. Define Feature Sets ---
# Based on hackathon rules: A1, A2, A5, B1 are allowed.

# Numerical features that will be imputed (median) and scaled
numeric_features = [
    'NACCAGE',  # Subject's age
    'EDUC',     # Subject's years of education
    'NACCBMI',  # Subject's Body Mass Index
]

# Categorical features that will be imputed (most frequent) and one-hot encoded
categorical_features = [
    'SEX',       # Subject's sex
    'MARISTAT',  # Marital status
    'RACE',      # Subject's race
    'INRELTO',   # Co-participant's relationship to subject
]

# Self-reported health history (Form A5)
# We will binarize these and engineer a new feature
health_history_features = [
    'CVHATT',    # Heart attack
    'CBSTROKE',  # Stroke
    'DIABETES',  # Diabetes
    'HYPERTEN',  # Hypertension
    'HYPERCHO',  # Hypercholesterolemia
    'TBI',       # Traumatic Brain Injury
    'DEP2YRS',   # Depression in last 2 years
]

# --- 2. Clean Missing/Unknown Values ---
# We replace all non-standard "missing" or "unknown" codes with np.nan
# This is critical for scikit-learn's imputers to work.
# Maps column name -> list of sentinel values that mean "missing/unknown".
missing_values_map = {
    'NACCAGE': [999],
    'EDUC': [99],
    'NACCBMI': [888.8, 888],
    'MARISTAT': [9],
    'RACE': [99],
    'INRELTO': [9],
    # For Form A5, 9 means "Unknown". Built from health_history_features so
    # the two definitions cannot drift apart; each column gets its own list.
    **{col: [9] for col in health_history_features},
}
# NOTE(review): only the codes listed above are treated as missing. If the
# dataset uses further sentinel codes (e.g. negative "not available" values),
# they would pass through as real values -- confirm against the data dictionary.

# Swap each column's sentinel codes for NaN; columns absent from the
# dataset are skipped silently.
for col, missing_vals in missing_values_map.items():
    if col in data.columns:
        data[col] = data[col].replace(missing_vals, np.nan)

print("Replaced special 'missing' codes with NaN.")

# --- 3. Feature Engineering ---
# Work only with the health-history columns that actually exist, so that the
# binarization loop and the comorbidity sum agree on the column set (indexing
# `data` with an absent column would raise KeyError).
present_health_features = [
    col for col in health_history_features if col in data.columns
]

# 3a. Binarize Health History
# We map 0=Absent to 0, and 1=Recent/Active or 2=Remote/Inactive to 1 (Present).
# Series.map turns every value not in the mapping (including NaN) into NaN.
for col in present_health_features:
    data[col] = data[col].map({0: 0, 1: 1, 2: 1})

# 3b. Create 'ComorbidityCount'
# This counts how many conditions are present.
# We fill NaNs with 0 (assuming 'unknown' means 'absent') before summing.
data['ComorbidityCount'] = data[present_health_features].fillna(0).sum(axis=1)

# Add our new engineered feature to the numeric list (guarded so re-running
# this step does not register the column twice).
if 'ComorbidityCount' not in numeric_features:
    numeric_features.append('ComorbidityCount')
print("Engineered 'ComorbidityCount' feature.")

# --- 4. Separate Features (X) and Target (y) ---
if TARGET_COLUMN not in data.columns:
    print(f"FATAL ERROR: Target column '{TARGET_COLUMN}' not found!")
else:
    # X contains the raw features to be processed
    X = data[numeric_features + categorical_features]
    # y contains the final target
    y = data[TARGET_COLUMN]
    # groups contains the Patient IDs for the split
    groups = data[GROUP_COLUMN]

    # --- 5. Define Preprocessing Pipelines ---
    # Numeric columns: fill gaps with the column median, then standardize.
    numeric_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Categorical columns: fill gaps with the most frequent category, then
    # one-hot encode into a dense array. Categories unseen during fit are
    # ignored at transform time instead of raising.
    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    # --- 6. Create the ColumnTransformer ---
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, numeric_features),
            ('cat', categorical_pipeline, categorical_features),
        ],
        remainder='drop',  # Drop any columns we didn't explicitly select
    )

    # --- 7. Apply Preprocessing ---
    # Fit and transform the entire dataset to prepare it.
    # NOTE(review): the imputer/scaler statistics are learned from ALL rows.
    # If a train/test split is performed after this step, those statistics
    # include test rows; consider fitting on the training rows only.
    X_processed = preprocessor.fit_transform(X)

    # Get the new feature names (e.g., after one-hot encoding)
    feature_names = preprocessor.get_feature_names_out()

    # Convert the processed data back to a DataFrame (useful for SHAP later).
    # Reuse X's index so rows stay label-aligned with `y` and `groups`
    # even if `data` does not carry a default 0..n-1 index.
    X_processed_df = pd.DataFrame(
        X_processed, columns=feature_names, index=X.index
    )

    print("\n--- Preprocessing Complete ---")
    print(f"Shape of processed features (X): {X_processed.shape}")
    print(f"Shape of target (y): {y.shape}")
    print(f"Shape of groups (groups): {groups.shape}")
    print("\nProcessed features (first 5 rows):")
    print(X_processed_df.head())