In [None]:
import pandas as pd
import numpy as np
import os

from scipy.stats import zscore #tells how far datapoint is from mean (error measurement)
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

from sklearn.decomposition import PCA #dimensionality reduction
from sklearn.impute import SimpleImputer #Replaces missing values with some kind of standard or qualitative data
from sklearn.preprocessing import OneHotEncoder #Converts categorical data into numerical 
from sklearn.preprocessing import StandardScaler #standardizes features by scaling to unit variance and removing mean

from sklearn.linear_model import LogisticRegression #imports model 
#Import other models we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import PrecisionRecallDisplay
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier


from sklearn.model_selection import cross_validate #estimates skill of model on new data
from sklearn.metrics import roc_auc_score #compares true positives / false positives
from sklearn import metrics
from sklearn.inspection import permutation_importance

import joblib
import pickle

In [None]:
# Load the compiled, merged NHANES table from the processed-data folder.
merged_nhanes_df = pd.read_csv(filepath_or_buffer='data/processed/merged_nhanes.csv')

In [None]:
# Replace each cognitive test score column with its z-score (distance from the
# column mean in units of the column standard deviation).
# NOTE(review): scipy's zscore propagates NaN by default, so a column holding any
# missing value would come back entirely NaN — confirm these columns are complete.
for score_column in ('CFDCST1', 'CFDCSR', 'CFDDS', 'CFDAST'):
    merged_nhanes_df[score_column] = zscore(merged_nhanes_df[score_column])

In [None]:
# Composite score: row-wise mean of the four standardised cognitive test columns.
cognitive_score_columns = ['CFDCST1', 'CFDCSR', 'CFDDS', 'CFDAST']
composite_score = merged_nhanes_df[cognitive_score_columns].mean(axis=1)

# Label a row True when its composite score lies below the 10th percentile
# of all composite scores.
decline_cutoff = composite_score.quantile(0.1)
merged_nhanes_df['cognitive_decline_label'] = composite_score < decline_cutoff
merged_nhanes_df

In [None]:
# Show every column name of the merged frame as a plain Python list.
list(merged_nhanes_df.columns)

In [None]:
# Target column (kept as a one-element list, as later cells index the frame with it).
label = ['cognitive_decline_label']

# Candidate predictor columns, listed by their NHANES variable codes.
candidate_features = [
    'MCQ053', 'SLQ060', 'RIAGENDR', 'RIDAGEYR',
    'RIDRETH3', 'INDHHIN2', 'BPQ020', 'BPQ080',
    'DIQ010', 'PAD680', 'CDQ001', 'DPQ020',
    'MCQ080', 'MCQ084', 'MCQ140', 'MCQ160c',
    'MCQ160f', 'MCQ220', 'HUQ050', 'DUQ272',
    'DBQ700',
]

# Keep only the candidates that actually exist in the merged frame; missing ones
# are dropped silently (uncomment the next line to inspect them).
# excluded_features = [name for name in candidate_features if name not in merged_nhanes_df.columns]
available_columns = set(merged_nhanes_df.columns)
features = [name for name in candidate_features if name in available_columns]
len(features)

In [None]:
# Round every feature value down to a whole number and store it with pandas'
# nullable 'Int64' dtype.
for column_name in features:
    floored_values = np.floor(merged_nhanes_df[column_name])
    merged_nhanes_df[column_name] = floored_values.astype('Int64')

In [None]:
# Feature matrix and target vector; the single-column label frame is flattened
# from 2D to 1D.
X = merged_nhanes_df[features]
y = merged_nhanes_df[label].to_numpy().ravel()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Persist the train/test split so other notebooks can reuse exactly the same rows.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
split_artifacts = {
    'data/processed/X_train.sav': X_train,
    'data/processed/X_test.sav': X_test,
    'data/processed/y_train.sav': y_train,
    'data/processed/y_test.sav': y_test,
}
for artifact_path, artifact in split_artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [None]:
# Ensure the output folder for fitted pipelines exists. `exist_ok=True` makes the
# call a no-op when the directory is already there, without a separate
# check-then-create step.
os.makedirs('models', exist_ok=True)

In [None]:
# Logistic regression: fill missing codes with each column's most frequent value,
# one-hot encode the codes, then fit the classifier.
lr_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(handle_unknown='ignore')),
    ('clf', LogisticRegression(max_iter=1000)),
]
lr_pipe = Pipeline(lr_steps)
lr_pipe.fit(X_train, y_train)

# ROC AUC on the training rows, using the predicted probability of the positive class.
lr_train_proba = lr_pipe.predict_proba(X_train)[:, 1]
lr_training_score = roc_auc_score(y_train, lr_train_proba)

# 5-fold cross-validated ROC AUC on the training set.
lr_scores = cross_validate(lr_pipe, X_train, y_train, cv=5, scoring='roc_auc')

joblib.dump(lr_pipe, 'models/logistic_regression_pipeline.sav')

# Training AUC, then the mean and standard deviation of the cross-validated AUC.
lr_cv_auc = lr_scores['test_score']
for reported_value in (lr_training_score, lr_cv_auc.mean(), lr_cv_auc.std()):
    print(reported_value)

In [None]:
# Random forest: fill missing codes with each column's most frequent value, then classify.
rf_pipe = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                    ('clf', RandomForestClassifier(warm_start=True))])
pipe = rf_pipe  # alias kept from the original cell in case other cells still reference `pipe`
rf_pipe.fit(X_train, y_train)

# ROC AUC on the training rows and 5-fold cross-validated ROC AUC.
rf_training_score = roc_auc_score(y_train, rf_pipe.predict_proba(X_train)[:, 1])
rf_scores = cross_validate(rf_pipe, X_train, y_train, cv=5, scoring='roc_auc')

# Hard predictions on the held-out rows.
rf_predicts = rf_pipe.predict(X_test)
print(metrics.classification_report(y_test, rf_predicts))

# Save this cell's own fitted pipeline so the file content matches its name.
joblib.dump(rf_pipe, 'models/random_forest_pipeline.sav')

print(rf_training_score)
print(rf_scores['test_score'].mean())
print(rf_scores['test_score'].std())

# Confusion matrix for the random forest model.
# confusion_matrix orders rows (actual) and columns (predicted) by sorted label,
# so position 0 is False (no cognitive decline) and position 1 is True.
rf_cm = confusion_matrix(y_test, rf_predicts)
rf_cm_df = pd.DataFrame(
    rf_cm,
    index=['No CD (actual)', 'CD (actual)'],
    columns=['No CD (pred)', 'CD (pred)'])
plt.figure(figsize=(10, 7))
sns.heatmap(rf_cm_df, annot=True, fmt='d')  # fmt='d' prints counts as plain integers
plt.show()

In [None]:
# k-nearest neighbours: fill missing codes with each column's most frequent value,
# one-hot encode the codes, then classify using the 2 nearest neighbours.
knn_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(handle_unknown='ignore')),
    ('clf', KNeighborsClassifier(n_neighbors=2))
])
knn_pipe.fit(X_train, y_train)

# ROC AUC on the training rows and 5-fold cross-validated ROC AUC.
knn_training_score = roc_auc_score(y_train, knn_pipe.predict_proba(X_train)[:, 1])
knn_scores = cross_validate(knn_pipe, X_train, y_train, cv=5, scoring='roc_auc')

# Hard predictions on the held-out rows.
knn_predicts = knn_pipe.predict(X_test)
print(metrics.classification_report(y_test, knn_predicts))

# Save this cell's own fitted pipeline so the file content matches its name.
joblib.dump(knn_pipe, 'models/k_nearest_neighbors_pipeline.sav')

print(knn_training_score)
print(knn_scores['test_score'].mean())
print(knn_scores['test_score'].std())


# Confusion matrix for the k-nearest-neighbours model.
# confusion_matrix orders rows (actual) and columns (predicted) by sorted label,
# so position 0 is False (no cognitive decline) and position 1 is True.
knn_cm = confusion_matrix(y_test, knn_predicts)
knn_cm_df = pd.DataFrame(
    knn_cm,
    index=['No CD (actual)', 'CD (actual)'],
    columns=['No CD (pred)', 'CD (pred)'])
plt.figure(figsize=(10, 7))
sns.heatmap(knn_cm_df, annot=True, fmt='d')  # fmt='d' prints counts as plain integers
plt.show()

In [None]:
# Gradient boosting (XGBoost): fill missing codes with each column's most frequent
# value, one-hot encode the codes, then classify.
xgb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(handle_unknown='ignore')),
    ('clf', xgb.XGBClassifier())
])

xgb_pipe.fit(X_train, y_train)

# ROC AUC on the training rows and 5-fold cross-validated ROC AUC.
xgb_training_score = roc_auc_score(y_train, xgb_pipe.predict_proba(X_train)[:, 1])
xgb_scores = cross_validate(xgb_pipe, X_train, y_train, cv=5, scoring='roc_auc')

# Hard predictions on the held-out rows.
xgb_predicts = xgb_pipe.predict(X_test)
print(metrics.classification_report(y_test, xgb_predicts))

# Save this cell's own fitted pipeline so the file content matches its name.
joblib.dump(xgb_pipe, 'models/gradient_boost_pipeline.sav')

print(xgb_training_score)
print(xgb_scores['test_score'].mean())
print(xgb_scores['test_score'].std())

# Confusion matrix for the XGBoost model.
# confusion_matrix orders rows (actual) and columns (predicted) by sorted label,
# so position 0 is the negative class (no cognitive decline) and position 1 the positive.
xgb_cm = confusion_matrix(y_test, xgb_predicts)
xgb_cm_df = pd.DataFrame(
    xgb_cm,
    index=['No CD (actual)', 'CD (actual)'],
    columns=['No CD (pred)', 'CD (pred)'])
plt.figure(figsize=(10, 7))
sns.heatmap(xgb_cm_df, annot=True, fmt='d')  # fmt='d' prints counts as plain integers
plt.show()

In [None]:
# Multinomial naive Bayes: fill missing codes with each column's most frequent
# value, one-hot encode the codes, then classify.
nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(handle_unknown='ignore')),
    ('clf', MultinomialNB())
])
nb_pipe.fit(X_train, y_train)

# ROC AUC on the training rows and 5-fold cross-validated ROC AUC.
nb_training_score = roc_auc_score(y_train, nb_pipe.predict_proba(X_train)[:, 1])
nb_scores = cross_validate(nb_pipe, X_train, y_train, cv=5, scoring='roc_auc')

# Save this cell's own fitted pipeline so the file content matches its name.
joblib.dump(nb_pipe, 'models/naive_bayes_pipeline.sav')

print(nb_training_score)
print(nb_scores['test_score'].mean())
print(nb_scores['test_score'].std())

In [None]:
# Decision tree: fill missing codes with each column's most frequent value,
# one-hot encode the codes, then classify.
dt_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(handle_unknown='ignore')),
    ('clf', DecisionTreeClassifier())
])
dt_pipe.fit(X_train, y_train)

# ROC AUC on the training rows and 5-fold cross-validated ROC AUC, both computed
# for the decision-tree pipeline itself.
dt_training_score = roc_auc_score(y_train, dt_pipe.predict_proba(X_train)[:, 1])
dt_scores = cross_validate(dt_pipe, X_train, y_train, cv=5, scoring='roc_auc')

# Hard predictions on the held-out rows, used for the confusion matrix below.
dt_predicts = dt_pipe.predict(X_test)

# Save this cell's own fitted pipeline so the file content matches its name.
joblib.dump(dt_pipe, 'models/decision_tree_pipeline.sav')

print(dt_training_score)
print(dt_scores['test_score'].mean())
print(dt_scores['test_score'].std())

# Confusion matrix for the decision tree model.
# confusion_matrix orders rows (actual) and columns (predicted) by sorted label,
# so position 0 is False (no cognitive decline) and position 1 is True.
dt_cm = confusion_matrix(y_test, dt_predicts)
dt_cm_df = pd.DataFrame(
    dt_cm,
    index=['No CD (actual)', 'CD (actual)'],
    columns=['No CD (pred)', 'CD (pred)'])
plt.figure(figsize=(10, 7))
sns.heatmap(dt_cm_df, annot=True, fmt='d')  # fmt='d' prints counts as plain integers
plt.show()
dt_scores