In [2]:
# Project Title: Alzheimer's Disease Detection using Machine Learning
# Author: [Samyak Kumar Bhardwaj]
# Date: September 2025

# =============================================================================

# Step 1: Import necessary libraries
# We use pandas for data manipulation, numpy for numerical operations,
# and scikit-learn for machine learning tasks.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# =============================================================================
# Step 2: Load the dataset
# We load the data from the CSV file into a pandas DataFrame.
# =============================================================================
try:
    df = pd.read_csv('oasis_longitudinal.csv')
except FileNotFoundError:
    print("Error: 'oasis_longitudinal.csv' not found.")
    print("Please download the dataset from Kaggle and place it in this directory.")
    exit()

# =============================================================================
# Step 3: Data Preprocessing
# =============================================================================
print("Preprocessing data...")

# Dropping columns that are not useful for our model
# 'Subject ID' and 'MRI ID' are unique identifiers, not features.
# 'Hand' has a single value 'R' for all entries, so it's not useful for prediction.
df.drop(['Subject ID', 'MRI ID', 'Hand'], axis=1, inplace=True)

# Handle missing values: We will fill missing values in 'SES' (Socioeconomic Status)
# and 'MMSE' (Mini-Mental State Examination) columns with the median value.
# The SimpleImputer in the pipeline will handle this automatically.

# Converting categorical features to numerical
# The 'Group' column (target variable) has 'Demented' and 'Nondemented' as text.
# We convert them to 1 and 0, which is required for our model.
df['Group'] = df['Group'].replace({'Demented': 1, 'Nondemented': 0, 'Converted': 1})
df['M/F'] = df['M/F'].replace({'M': 1, 'F': 0}) # Convert Male/Female to 1/0

# Separate features (X) and target variable (y)
# X contains the input features (e.g., Age, EDUC, MMSE), and y is what we want to predict (Demented or not).
X = df.drop('Group', axis=1)
y = df['Group']

# Defining a list of numerical and categorical features
# This helps us apply different preprocessing steps to different types of data.
numerical_features = ['Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF']
categorical_features = ['M/F']

# Create a preprocessing pipeline
# A pipeline combines multiple steps into a single object. This makes the code
# cleaner and prevents data leakage (using information from the test set during training).
# For numerical features, we fill missing values with the median and then scale the data.
# For categorical features, we simply fill missing values with the most frequent value.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent'))
        ]), categorical_features)
    ])

# Split the data into training and testing sets
# We use 80% of the data for training the model and 20% for testing its performance.
# This ensures that our model is evaluated on data it has never seen before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# =============================================================================
# Step 4: Model Training
# We use Logistic Regression, a simple and highly interpretable model.
# It's a great choice because we can easily explain how each feature contributes
# to the final prediction.
# =============================================================================
print("Training the Logistic Regression model...")

# Create the final model pipeline
# This pipeline first applies the preprocessing steps and then trains the model.
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', LogisticRegression(solver='liblinear', random_state=42))])

# Train the model on the training data
model_pipeline.fit(X_train, y_train)

# =============================================================================
# Step 5: Model Evaluation
# We test the model's performance on the unseen data (test set).
# =============================================================================
print("Evaluating the model...")

# Make predictions on the test set
y_pred = model_pipeline.predict(X_test)

# Calculate and print the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Print a detailed classification report
# This report gives more insight into the model's performance for each class.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Nondemented', 'Demented']))

# =============================================================================
# Step 6: Explainability - Interpreting the Results
# =============================================================================
print("\n--- Model Explainability ---")
print("Interpreting the model's coefficients:")

# Get the trained classifier from the pipeline
classifier = model_pipeline.named_steps['classifier']

# Get the names of the features after preprocessing
feature_names_out = model_pipeline.named_steps['preprocessor'].get_feature_names_out()

# Create a DataFrame to easily view the coefficients
coefficients = pd.DataFrame(data=classifier.coef_[0], index=feature_names_out, columns=['Coefficient'])
coefficients.sort_values(by='Coefficient', ascending=False, inplace=True)

print(coefficients)

# Explanation of coefficients:
print("\nExplanation:")
print("A positive coefficient indicates that as the feature's value increases, the likelihood of the person being 'Demented' (1) increases.")
print("A negative coefficient indicates that as the feature's value increases, the likelihood of the person being 'Nondemented' (0) increases.")
print("The magnitude of the coefficient shows how much that feature influences the prediction. A larger magnitude means greater influence.")
print("\nThis simple table allows you to explain which factors your model considers most important for its prediction.")

# For example, you can see that a higher 'MMSE' score (cognitive test) has a
# negative coefficient, which means a higher score makes the model
# less likely to predict dementia, which makes perfect clinical sense!
# Conversely, a higher 'Age' has a positive coefficient, indicating that age is a
# key risk factor.

# =============================================================================
# Step 7: Simple Prediction Example
# creating a new, hypothetical patient's data to test the model in a practical way.
# =============================================================================
print("\n--- Example Prediction ---")
# Example data for a new patient
new_patient = pd.DataFrame([{
    'M/F': 1, # Using 1 for Male
    'Age': 75,
    'EDUC': 12,
    'SES': 2,
    'MMSE': 25,
    'CDR': 0.5,
    'eTIV': 1600,
    'nWBV': 0.75,
    'ASF': 1.10
}])

# Make the prediction
prediction = model_pipeline.predict(new_patient)

# Interpret the prediction
prediction_label = 'Demented' if prediction[0] == 1 else 'Nondemented'
print(f"Prediction for the new patient: The model predicts the patient is '{prediction_label}'.")


Preprocessing data...
Training the Logistic Regression model...
Evaluating the model...
Model Accuracy: 0.88

Classification Report:
              precision    recall  f1-score   support

 Nondemented       0.79      0.97      0.87        32
    Demented       0.97      0.81      0.89        43

    accuracy                           0.88        75
   macro avg       0.88      0.89      0.88        75
weighted avg       0.90      0.88      0.88        75


--- Model Explainability ---
Interpreting the model's coefficients:
           Coefficient
num__CDR      3.812860
cat__M/F      0.687035
num__ASF     -0.077939
num__Age     -0.125597
num__nWBV    -0.272523
num__EDUC    -0.427096
num__eTIV    -0.443344
num__SES     -0.657163
num__MMSE    -1.062584

Explanation:
A positive coefficient indicates that as the feature's value increases, the likelihood of the person being 'Demented' (1) increases.
A negative coefficient indicates that as the feature's value increases, the likelihood of the 

  df['Group'] = df['Group'].replace({'Demented': 1, 'Nondemented': 0, 'Converted': 1})
  df['M/F'] = df['M/F'].replace({'M': 1, 'F': 0}) # Convert Male/Female to 1/0
