In [3]:
import pandas as pd
import numpy as np

from  sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [4]:
all_data_df = pd.read_csv('https://raw.githubusercontent.com/albemlee/ai4all_nhanes/main/notebooks/all_data_df.csv')
all_data_df.index = all_data_df['SEQN']
all_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 29902 entries, 62161.0 to 93698.0
Columns: 674 entries, SEQN to WHQ150
dtypes: float64(672), object(2)
memory usage: 154.0+ MB


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Create Label

In [5]:
# Select mental health screener columns and drop data that are empty
mental_health_df = all_data_df.loc[:, 'DPQ010':'DPQ100'].dropna(how='all')
all_data_df = all_data_df.loc[mental_health_df.index]

In [6]:
def mh(x):
    if x == '\.':
        return 'missing'
    elif x == 1:
        return 'several days'
    elif x == 2:
        return 'more than half the days'
    elif x == 3:
        return 'nearly every day'
    elif x == 7:
        return 'refused'
    elif x == 9:
        return "don't know"
    else:
        return 'not at all'

for col in mental_health_df.columns:
    mental_health_df[col] = mental_health_df[col].apply(lambda x: mh(x))

mental_health_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 15513 entries, 62161.0 to 93702.0
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   DPQ010  15513 non-null  object
 1   DPQ020  15513 non-null  object
 2   DPQ030  15513 non-null  object
 3   DPQ040  15513 non-null  object
 4   DPQ050  15513 non-null  object
 5   DPQ060  15513 non-null  object
 6   DPQ070  15513 non-null  object
 7   DPQ080  15513 non-null  object
 8   DPQ090  15513 non-null  object
 9   DPQ100  15513 non-null  object
dtypes: object(10)
memory usage: 1.3+ MB


In [7]:
def calc(row):
    sum = 0
    for i in ['DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 
              'DPQ050', 'DPQ060', 'DPQ070','DPQ080', 
              'DPQ090', 'DPQ100']:
        if row[i] == 'several days':
            sum += 1
        if row[i] == 'more than half the days':
            sum += 2
        if row[i] == 'nearly every day':
            sum += 3
    return sum

In [8]:
# Use 10 as threshold for depression
mental_health_df['labels_raw'] = mental_health_df.apply(calc, axis=1)
mental_health_df['labels'] = mental_health_df['labels_raw'].apply(lambda x: 1 if x >= 10 else 0)

# Select Features

In [9]:
features = [
    #sleep disorder
    'SEQN','SLQ050', 
    #early childhood
    'ECD010','ECQ020','ECD070A','ECD070B','ECQ080','ECQ090','WHQ030E','MCQ080E',
    'ECQ150', 
    #alcohol issues
    'ALQ101','ALQ110','ALQ120Q','ALQ120U','ALQ130','ALQ141Q','ALQ141U','ALQ151',
    #health status
    'HSD010','HSQ500','HSQ510','HSQ520','HSQ571','HSQ580','HSQ590','HSAQUEX',
    #hospital access
    'HUQ010','HUQ020','HUQ030','HUQ071','HUD080','HUQ090', 
 
]

final_features = []

for feature in features:
    if feature not in final_features and feature != 'SEQN':
        final_features.append(feature)
        
print (len(final_features))

32


# Split Data

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [10]:
X = all_data_df[final_features]
y = mental_health_df['labels']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Train Model

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

Gradient Boosting Classifier

In [12]:
gdclass_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                       max_depth=1, random_state=42))
])
gdclass_pipe.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('enc', OneHotEncoder(handle_unknown='ignore', sparse=False)),
                ('red', PCA(n_components=10)),
                ('clf',
                 GradientBoostingClassifier(learning_rate=1.0, max_depth=1,
                                            random_state=42))])

# Model Evaluation

In [13]:
gdclass_training_score = roc_auc_score(y_train.values, gdclass_pipe.predict_proba(X_train)[:,1])
gdclass_validation_score = roc_auc_score(y_val.values, gdclass_pipe.predict_proba(X_val)[:,1])

In [14]:
pd.DataFrame.from_dict(
    {
        'gradient boosting classifier': {
            'training score': gdclass_training_score,
            'validation score': gdclass_validation_score
        }, 
    }, 
    orient='index')

Unnamed: 0,training score,validation score
gradient boosting classifier,0.835242,0.810376


# Feature Importance

https://scikit-learn.org/stable/modules/permutation_importance.html

In [15]:
r = permutation_importance(
    gdclass_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)

In [16]:
feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

In [17]:
feature_importances.sort_values('importance_means', ascending = False)

Unnamed: 0,importance_means,importances_std
HUQ020,0.002579,0.002099
SLQ050,0.002257,0.002347
HUQ010,0.002257,0.00194
HSQ510,0.002063,0.00099
ALQ151,0.001934,0.002307
HSQ590,0.001741,0.001191
HUQ090,0.001741,0.001081
HSQ500,0.001547,0.00231
ALQ120Q,0.001096,0.001825
ALQ141U,0.001032,0.001359


# Hyperparameter Tuning

- https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/
- https://medium.com/@kocur4d/hyper-parameter-tuning-with-pipelines-5310aff069d6

In [24]:
grid_params = {
  'clf__learning_rate': [0.0024, 0.029, 0.1],
  'clf__n_estimators': [8, 109, 1000],   
}

In [25]:
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)
grid_search = GridSearchCV(
    estimator=gdclass_pipe, 
    param_grid=grid_params, 
    n_jobs=-1, 
    cv=cv, 
    scoring='roc_auc',
    error_score=0
)
grid_result = grid_search.fit(X_train, y_train)


In [26]:
grid_result.best_score_

0.8013721886759293

In [27]:
grid_result.best_params_

{'clf__learning_rate': 0.1, 'clf__n_estimators': 1000}

In [28]:
gdclass_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                       max_depth=1, random_state=42))
])
gdclass_pipe.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('enc', OneHotEncoder(handle_unknown='ignore', sparse=False)),
                ('red', PCA(n_components=10)),
                ('clf',
                 GradientBoostingClassifier(learning_rate=1.0, max_depth=1,
                                            random_state=42))])

In [29]:
gdclass_training_score = roc_auc_score(y_train.values, gdclass_pipe.predict_proba(X_train)[:, 1])
gdclass_validation_score = roc_auc_score(y_val.values, gdclass_pipe.predict_proba(X_val)[:, 1])
print(gdclass_training_score)
gdclass_validation_score

0.83514840642087


0.8056080583750578

## Repeat

In [50]:
grid_params = {
  'clf__learning_rate': [0.0028, 0.0314, 0.1],
  'clf__n_estimators': [7.8, 103, 1000],   
}

In [51]:
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)
grid_search = GridSearchCV(
    estimator=gdclass_pipe, 
    param_grid=grid_params, 
    n_jobs=-1, 
    cv=cv, 
    scoring='roc_auc',
    error_score=0
)
grid_result = grid_search.fit(X_train, y_train)


In [52]:
grid_result.best_score_

0.801234920351984

In [53]:
grid_result.best_params_

{'clf__learning_rate': 0.1, 'clf__n_estimators': 1000}

In [54]:
gdclass_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                       max_depth=1, random_state=42))
])
gdclass_pipe.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('enc', OneHotEncoder(handle_unknown='ignore', sparse=False)),
                ('red', PCA(n_components=10)),
                ('clf',
                 GradientBoostingClassifier(learning_rate=1.0, max_depth=1,
                                            random_state=42))])

In [55]:
gdclass_training_score = roc_auc_score(y_train.values, gdclass_pipe.predict_proba(X_train)[:, 1])
gdclass_validation_score = roc_auc_score(y_val.values, gdclass_pipe.predict_proba(X_val)[:, 1])
print(gdclass_training_score)
gdclass_validation_score

0.8353305959575154


0.8122643052513926