<a href="https://colab.research.google.com/github/shreya-gh/shreya-gh/blob/main/dementia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# The `kagglehub` module imported here is also used by the next cell to
# download the dataset, so this cell has to run first.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: run this cell to import the Kaggle data sources; it can be
# deleted once the data is available.
# NOTE: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing.
# NOTE(review): the path returned below is not referenced by the later
# cells, which read the CSV from "/kaggle/input/nacccsv/" directly.

nacc_dataset_handle = "r3v3nant/nacccsv"
r3v3nant_nacccsv_path = kagglehub.dataset_download(nacc_dataset_handle)

print("Data source import complete.")


In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

# 1. Load Dataset (Selected Features Only)
print("Loading dataset...")
selected_columns = ["SEX", "HISPANIC", "HISPOR", "RACE", "RACEX", "RACESEC", "RACESECX", "RACETER", "RACETERX",
                    "PRIMLANG", "PRIMLANX", "EDUC", "MARISTAT", "NACCLIVS", "INDEPEND", "RESIDENC", "HANDED", "NACCFAM",
                    "NACCMOM", "NACCDAD", "NACCAM", "NACCAMX", "NACCAMS", "NACCOM", "NACCFADM", "NACCFFTD", "ANYMEDS",
                    "TOBAC100", "TOBAC30", "PACKSPER", "ALCOCCAS", "ALCFREQ", "CVHATT", "HATTMULT", "CVAFIB", "CVANGIO",
                    "CVBYPASS", "CVPACDEF", "CVPACE", "CVCHF", "CVANGINA", "CVHVALVE", "CVOTHR", "CBSTROKE", "STROKMUL",
                    "PD", "SEIZURES", "NACCTBI", "TBI", "TBIBRIEF", "TRAUMBRF", "TBIEXTEN", "TRAUMEXT", "TBIWOLOS", "TRAUMCHR",
                    "DIABETES", "DIABTYPE", "HYPERTEN", "HYPERCHO", "B12DEF", "THYROID", "ARTHRIT", "ARTHTYPE", "ARTHUPEX",
                    "ARTHLOEX", "ARTHSPIN", "ARTHUNK", "INCONTU", "INCONTF", "APNEA", "RBD", "INSOMN", "OTHSLEEP", "ALCOHOL",
                    "ABUSOTHR", "ABUSX", "PTSD", "BIPOLAR", "SCHIZ", "DEP2YRS", "DEPOTHR", "ANXIETY", "OCD", "NPSYDEV",
                    "PSYCDIS", "HEIGHT", "WEIGHT", "BPSYS", "BPDIAS", "HRATE", "VISION", "VISCORR", "VISWCORR", "HEARING",
                    "HEARAID", "HEARWAID", "NACCAGE", "NACCNIHR", "NACCBMI", "NACCMOCA", "NACCNE4S", "THYDIS", "NACCUDSD"]

df = pd.read_csv("/kaggle/input/nacccsv/investigator_nacc68.csv", usecols=selected_columns)

# Convert the sentinel codes that are unambiguous in every column to NaN.
missing_values = [8888, 9999, 888, 999, -4.4, -4]
df.replace(missing_values, np.nan, inplace=True)

# 9 is NOT replaced globally: in numeric measurements it is a real value
# (e.g. 9 years of education, a MoCA score of 9), so treating it as missing
# everywhere silently deletes valid data. It is only treated as a code in the
# remaining (coded) columns, mirroring the per-column handling used by the
# later modelling cell of this notebook.
# NOTE(review): HEIGHT, WEIGHT and BPDIAS are assumed to be plain measurements
# like the others listed here -- confirm against the data dictionary.
continuous_columns = ["EDUC", "HEIGHT", "WEIGHT", "BPSYS", "BPDIAS", "HRATE",
                      "NACCAGE", "NACCBMI", "NACCMOCA"]
code9_columns = [column for column in df.columns if column not in continuous_columns]
df[code9_columns] = df[code9_columns].replace(9, np.nan)

# These two scores use their own two-digit codes (same values as the later
# modelling cell replaces for these columns).
df["EDUC"] = df["EDUC"].replace(99, np.nan)
df["NACCMOCA"] = df["NACCMOCA"].replace([88, 99], np.nan)


# 2. Remove Records with Age Less Than 60
print("Removing records with age less than 60...")
df = df[df["NACCAGE"] >= 60]  # Keep only records where age is 60 or older (NaN ages fail the test and are dropped)
print(f"Number of records after removing age < 60: {len(df)}")

# Define target variable: 1 when NACCUDSD == 4, 0 when it is 1, 2 or 3.
# Any other value (including NaN) has no label and maps to NaN.
y = df["NACCUDSD"].map(lambda x: 1 if x == 4 else 0 if x in [1, 2, 3] else np.nan)

# Filter df and y with the SAME mask. Previously y was computed before rows
# were dropped from df, so X and y could end up with different lengths and y
# could carry NaN labels into the stratified split.
labelled = y.notna()
df = df[labelled]
y = y[labelled].astype(int)
X = df.drop(columns=["NACCUDSD"])

# Keep numeric predictors only (free-text columns are read as object dtype).
X = X.select_dtypes(exclude=["object"])

# 3. Drop Features with >50% Missing Values

print("Dropping features with >50% missing values...")
missing_percent = X.isna().mean()  # per-column fraction of NaN entries
X = X.loc[:, missing_percent.le(0.5)]
print(f"Features Remaining After Dropping >50% Missing: {X.shape[1]}")

# 4. Stratified train / validation / test split
print("Splitting dataset into train, validation, and test sets...")
# First hold out 20% as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# ... then take 25% of the remaining 80% as validation: a 60-20-20 split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, stratify=y_train, random_state=42
)

# ================================
# 5. Handle Missing Values (MICE)
# ================================
print("Performing missing value imputation...")
imputer = IterativeImputer(max_iter=30, tol=1e-4, random_state=42)
imputed_columns = X_train.columns
# The imputer is fitted on the training split only; validation and test are
# transformed with the fitted imputer. Each result is re-wrapped in a
# DataFrame, which gives it a fresh default index.
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=imputed_columns)
X_val, X_test = (
    pd.DataFrame(imputer.transform(split), columns=imputed_columns)
    for split in (X_val, X_test)
)


# 6. Standardization (scaler statistics come from the training split only)
print("Applying standardization...")
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    pd.DataFrame(scaler.transform(split), columns=imputed_columns)
    for split in (X_train, X_val, X_test)
)


# 7.  SMOTE (Handling Imbalance)
# Only the training split is resampled; validation and test are left as-is.

print("Applying SMOTE to balance classes...")
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:")
smote_class_counts = pd.Series(y_train).value_counts()
print(smote_class_counts)

# 8. Random UnderSampling (After SMOTE)
# NOTE(review): SMOTE with its default settings presumably already equalizes
# the class counts, in which case this step cannot change them -- compare the
# two printed distributions to confirm whether it is needed.
print("Applying Random UnderSampling to balance classes further...")
under_sampler = RandomUnderSampler(random_state=42)
X_train, y_train = under_sampler.fit_resample(X_train, y_train)

print("Class distribution after undersampling:")
undersampled_class_counts = pd.Series(y_train).value_counts()
print(undersampled_class_counts)

# ================================
# 9. Remove Low-Variance Features (Updated Variance Threshold)
# ================================
# NOTE(review): X_train was standardized with StandardScaler before this step,
# so feature variances are already close to 1 and a 0.05 threshold likely
# removes few or no features -- confirm this is the intended order.
print("Removing low-variance features with a threshold of 0.05...")
var_thresh = VarianceThreshold(threshold=0.05)  # Updated threshold
var_thresh.fit(X_train)
selected_features = X_train.columns[var_thresh.get_support()]
# Apply the mask learned on the training split to all three splits.
X_train_var, X_val_var, X_test_var = (
    pd.DataFrame(var_thresh.transform(split), columns=selected_features)
    for split in (X_train, X_val, X_test)
)

print(f"Features After Variance Check: {len(selected_features)}")
print("Selected Features After Variance Threshold:")
print(selected_features)

# ================================
# 10. Feature Selection with Permutation Importance
# ================================
print("Performing feature selection with Permutation Importance...")
# Train a Random Forest on the variance-filtered training features.
# Correlated features have not been removed at this point; that filter runs
# after this selection step.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_var, y_train)

# Score each feature by the drop in model performance on the validation split
# when that feature's values are shuffled (10 shuffles per feature).
perm_importance = permutation_importance(rf_model, X_val_var, y_val, n_repeats=10, random_state=42)

# Create a DataFrame for permutation importance results, most important first
perm_importance_df = pd.DataFrame({
    "Feature": X_train_var.columns,
    "Importance": perm_importance.importances_mean,
    "Std": perm_importance.importances_std
}).sort_values(by="Importance", ascending=False)

# Display top 30 features by permutation importance
print("Top 30 Features by Permutation Importance:")
print(perm_importance_df.head(30))

# Select top 30% of features based on permutation importance.
# int() truncates, so with fewer than four candidate features the count would
# be 0 and every later step would receive an empty frame; keep at least one.
num_selected = max(1, int(len(perm_importance_df) * 0.3))
selected_features = perm_importance_df.iloc[:num_selected]["Feature"].tolist()
X_train_selected = X_train_var[selected_features]
X_val_selected = X_val_var[selected_features]
X_test_selected = X_test_var[selected_features]

print(f"Features After Permutation Importance Selection: {len(selected_features)}")
print("Selected Features After Permutation Importance:")
print(selected_features)


# 11. Remove Highly Correlated Features

print("Removing highly correlated features after feature selection...")
corr_matrix = X_train_selected.corr().abs()
# Keep only the strict upper triangle so each feature pair is inspected once
# and the diagonal (self-correlation) is ignored.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(upper_mask)
# A feature is dropped when it correlates above 0.85 (absolute value) with any
# feature that precedes it in the column order.
high_corr_features = [
    column for column in upper_tri.columns if (upper_tri[column] > 0.85).any()
]
X_train_final = X_train_selected.drop(columns=high_corr_features)
X_val_final = X_val_selected.drop(columns=high_corr_features, errors="ignore")
X_test_final = X_test_selected.drop(columns=high_corr_features, errors="ignore")

final_feature_names = list(X_train_final.columns)
print(f"Final Features After Collinearity Check: {X_train_final.shape[1]}")
print("Final Selected Features:")
print(final_feature_names)


# 12. Cross-validation on the (resampled) training set
# NOTE(review): y_train / X_train_final come from the SMOTE-resampled data, so
# synthetic samples can land in both the fitting and the held-out part of a
# fold; these scores may be optimistic compared with the test-set results.
print("Performing cross-validation...")
rf_final = RandomForestClassifier(n_estimators=100, random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf_final, X_train_final, y_train, cv=cv, scoring="accuracy")

print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Cross-Validation Accuracy: {cv_scores.mean()}")


# 13. Final Model on Full Training Set
print("Training final model on the full training set...")
rf_final.fit(X_train_final, y_train)

# Evaluate on the validation set
y_val_pred = rf_final.predict(X_val_final)
val_report = classification_report(y_val, y_val_pred)
val_confusion = confusion_matrix(y_val, y_val_pred)
print("Validation Set Classification Report:")
print(val_report)
print("Validation Set Confusion Matrix:")
print(val_confusion)

# Evaluate on the held-out test set
print("Evaluating on the test set...")
y_test_pred = rf_final.predict(X_test_final)
test_report = classification_report(y_test, y_test_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)
print("Test Set Classification Report:")
print(test_report)
print("Test Set Confusion Matrix:")
print(test_confusion)


# Permutation Importance Graph
# One horizontal bar per feature scored in the permutation-importance step,
# with the standard deviation over the repeats as the error bar.
print("Plotting permutation feature importance graph...")
perm_fig, perm_ax = plt.subplots(figsize=(10, 8))
perm_ax.barh(
    perm_importance_df["Feature"],
    perm_importance_df["Importance"],
    xerr=perm_importance_df["Std"],
    color="skyblue",
)
perm_ax.set_xlabel("Permutation Importance")
perm_ax.set_ylabel("Feature")
perm_ax.set_title("Permutation Feature Importance")
perm_ax.invert_yaxis()  # first row (highest importance) at the top
plt.show()

# Random Forest Feature Importance Graph
# Uses the impurity-based importances of the final fitted model.
print("Plotting Random Forest feature importance graph...")
rf_importance = pd.DataFrame({
    "Feature": X_train_final.columns,
    "Importance": rf_final.feature_importances_
}).sort_values(by="Importance", ascending=False)

rf_fig, rf_ax = plt.subplots(figsize=(10, 8))
rf_ax.barh(rf_importance["Feature"], rf_importance["Importance"], color="lightgreen")
rf_ax.set_xlabel("Random Forest Feature Importance")
rf_ax.set_ylabel("Feature")
rf_ax.set_title("Random Forest Feature Importance")
rf_ax.invert_yaxis()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns


# 1. Load Dataset with Final Selected Features

print("Loading dataset with final selected features...")
selected_features = [
    'NACCBMI', 'DEP2YRS', 'NACCNE4S', 'NACCLIVS', 'TRAUMBRF', 'B12DEF', 'INCONTU',
    'NACCAGE', 'BPSYS', 'HRATE', 'SEIZURES', 'RESIDENC',
    'EDUC', 'CBSTROKE', 'PACKSPER', 'NACCUDSD', 'ALCOHOL', 'NACCFAM',
    'MARISTAT', 'SEX', 'VISWCORR', 'NACCMOCA', 'NACCMMSE', 'INDEPEND',
]

df = pd.read_csv("/kaggle/input/nacccsv/investigator_nacc68.csv", usecols=selected_features)

# Sentinel codes replaced with NaN in every column.
# NOTE(review): confirm no column uses a sentinel outside this list (for
# example a decimal code) -- check against the data dictionary.
missing_values = [8888, 9999, 888, 999, -4.4, -4]
df = df.replace(missing_values, np.nan)

# Additional replacement for missing values: columns where 9 is replaced
columns_to_replace = [
    'RESIDENC', 'DEP2YRS', 'NACCNE4S', 'TRAUMBRF', 'B12DEF', 'SEIZURES',
    'MARISTAT', 'ALCOHOL', 'CBSTROKE', 'NACCLIVS', 'INCONTU', 'INDEPEND',
]
df[columns_to_replace] = df[columns_to_replace].replace(9, np.nan)

# Columns with their own codes, handled one column at a time
column_specific_codes = {
    "EDUC": 99,
    "PACKSPER": [8, 9],
    "VISWCORR": [8, 9],
    "NACCMMSE": [95, 96, 97, 98],
    "NACCMOCA": [88, 99],
}
for column, codes in column_specific_codes.items():
    df[column] = df[column].replace(codes, np.nan)

# Remove records of those aged below 60 (rows with a NaN age are dropped too)
df = df[df["NACCAGE"] >= 60]

print(f"Dataset Shape (Rows, Columns): {df.shape}")

# Binary target: 1 when NACCUDSD == 4, 0 when it is 1, 2 or 3. Anything else
# (including NaN) is unlabeled and maps to NaN instead of passing through as
# an extra class.
y = df["NACCUDSD"].map(lambda x: 1 if x == 4 else 0 if x in [1, 2, 3] else np.nan)

# Drop unlabeled rows from df and y with the SAME mask. Previously y was
# built before rows were removed from df, so X and y could differ in length
# and y could carry NaN into the stratified split.
labelled = y.notna()
df = df[labelled]
y = y[labelled].astype(int)
X = df.drop(columns=["NACCUDSD"])

# Drop redundant features
X = X.drop(columns=["ALCOHOL"])

print("Splitting dataset into train, validation, and test sets...")
# 70% train; the held-out 30% is then halved into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print("Class Distribution Before SMOTE:")
train_class_counts = y_train.value_counts()
print(train_class_counts)

# 4. MICE (Imputation)
# ================================
print("Performing missing value imputation...")
imputer = IterativeImputer(max_iter=30, tol=1e-4, random_state=42)
model_columns = X_train.columns
# Fit on the training split only, then reuse the fitted imputer for the other
# splits. Re-wrapping in a DataFrame gives each split a fresh default index.
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=model_columns)
X_val, X_test = (
    pd.DataFrame(imputer.transform(split), columns=model_columns)
    for split in (X_val, X_test)
)

# 5. Standardization (scaler statistics come from the training split only)
print("Applying standardization...")
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    pd.DataFrame(scaler.transform(split), columns=model_columns)
    for split in (X_train, X_val, X_test)
)

# 6. SMOTE
# Oversamples the training split only; validation and test stay untouched.
print("Applying SMOTE to balance classes...")
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print("Class Distribution After SMOTE:")
balanced_class_counts = y_train.value_counts()
print(balanced_class_counts)


# 7. Stratified Cross-Validation
# rf_final is only scored here; the fit that is used for the validation and
# test predictions happens in the next step.
# NOTE(review): the folds are drawn from the SMOTE-resampled training data,
# so these scores may be optimistic relative to the validation/test metrics.
print("Performing stratified cross-validation for evaluation...")
rf_final = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=42,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    rf_final,
    X_train,
    y_train,
    cv=skf,
    scoring=['accuracy', 'f1', 'roc_auc'],
    return_train_score=True,
)

# Report the mean held-out score of each metric across the five folds.
for label, key in (
    ("Accuracy", "test_accuracy"),
    ("F1-Score", "test_f1"),
    ("ROC-AUC", "test_roc_auc"),
):
    print(f"Cross-Validation {label}: {np.mean(cv_results[key]):.4f}")


# 8. Fit on the resampled training set
print("Training final model on training set...")
rf_final.fit(X_train, y_train)

# 9. Validation metrics
print("Evaluating final model on validation set...")
y_val_pred = rf_final.predict(X_val)
# Second column of predict_proba: probability of the class listed second in
# the fitted model's classes (the label 1 for a 0/1 target).
y_val_proba = rf_final.predict_proba(X_val)[:, 1]

val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
val_roc_auc = roc_auc_score(y_val, y_val_proba)
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation F1-Score: {val_f1:.4f}")
print(f"Validation ROC-AUC Score: {val_roc_auc:.4f}")


# 10. Feature Importance Graph
# Vertical bars of the fitted forest's feature importances, largest first.
print("Plotting feature importance...")
importances = rf_final.feature_importances_
feature_names = X_train.columns
indices = np.argsort(importances)[::-1]  # positions sorted by descending importance

bar_positions = range(X_train.shape[1])
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
importance_ax.set_title("Feature Importances")
importance_ax.bar(bar_positions, importances[indices], align="center")
importance_ax.set_xticks(bar_positions)
importance_ax.set_xticklabels(list(feature_names[indices]), rotation=90)
importance_ax.set_xlabel("Feature")
importance_ax.set_ylabel("Importance")
importance_fig.tight_layout()
plt.show()

# 11. Evaluate on Test Set

print("Evaluating final model on test set...")
y_test_pred = rf_final.predict(X_test)
y_test_proba = rf_final.predict_proba(X_test)[:, 1]

test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_proba)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1-Score: {test_f1:.4f}")
print(f"Test ROC-AUC Score: {test_roc_auc:.4f}")

# 12. Confusion matrix heatmap (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_test_pred)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_title("Confusion Matrix (Test Set)")
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
plt.show()


# 13. INDEPEND Feature Analysis
# ================================
# X_test holds standardized values at this point, so describing or plotting it
# directly would report z-scores while the labels below speak of the level of
# independence. Undo the scaling first so the numbers are in the column's
# original units. Values are still the imputed ones, so previously missing
# entries may be non-integer.
test_df = pd.DataFrame(scaler.inverse_transform(X_test), columns=X_test.columns)

print("\n Statistics for INDEPEND:")
print(test_df["INDEPEND"].describe())

# X_test (and therefore test_df) has a fresh 0..n-1 index, so the labels are
# re-indexed the same way before being attached row by row.
test_df["NACCUDSD"] = y_test.reset_index(drop=True)

plt.figure(figsize=(8, 6))
sns.boxplot(x="NACCUDSD", y="INDEPEND", data=test_df)
plt.title("INDEPEND by Dementia Status (NACCUDSD)")
plt.xlabel("Dementia Status (0 = No, 1 = Yes)")
plt.ylabel("Level of Independence (INDEPEND)")
plt.show()

print("\nMean INDEPEND by Dementia Status:")
print(test_df.groupby("NACCUDSD")["INDEPEND"].mean())


In [None]:
import pandas as pd
import numpy as np

# Load dataset with all columns (no usecols filter)
full_csv_path = "/kaggle/input/nacccsv/investigator_nacc68.csv"
df = pd.read_csv(full_csv_path)

# Print initial dataset shape as (rows, columns)
initial_shape = df.shape
print(f"Initial dataset shape: {initial_shape}")

