In [1]:
#default
import pandas as pd
import numpy as np

#pipeline for convenience
from sklearn.pipeline import Pipeline

#pre-formatting
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

#models
from sklearn.dummy import DummyClassifier

#evaluating models
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance

#evaluating features
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

#ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
all_data_df = pd.read_csv('all_data_df.csv')
all_data_df.index = all_data_df['SEQN']
all_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 29902 entries, 62161.0 to 93698.0
Columns: 674 entries, SEQN to WHQ150
dtypes: float64(672), object(2)
memory usage: 154.0+ MB


## Create Label

In [3]:
mental_health_df = all_data_df.loc[:, 'DPQ010':'DPQ100'].dropna(how='all')
all_data_df = all_data_df.loc[mental_health_df.index]

In [4]:
def mh(x):
    if x == '\.':
        return 'missing'
    elif x == 1:
        return 'several days'
    elif x == 2:
        return 'more than half the days'
    elif x == 3:
        return 'nearly every day'
    elif x == 7:
        return 'refused'
    elif x == 9:
        return "don't know"
    else:
        return 'not at all'

for col in mental_health_df.columns:
    mental_health_df[col] = mental_health_df[col].apply(lambda x: mh(x))

In [5]:
def calc(row):
    sum = 0
    for i in ['DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 
              'DPQ050', 'DPQ060', 'DPQ070','DPQ080', 
              'DPQ090', 'DPQ100']:
        if row[i] == 'several days':
            sum += 1
        if row[i] == 'more than half the days':
            sum += 2
        if row[i] == 'nearly every day':
            sum += 3
    return sum

mental_health_df['labels_raw'] = mental_health_df.apply(calc, axis=1)
mental_health_df['labels'] = mental_health_df['labels_raw'].apply(lambda x: 1 if x >= 10 else 0)

mental_health_df.head()

Unnamed: 0_level_0,DPQ010,DPQ020,DPQ030,DPQ040,DPQ050,DPQ060,DPQ070,DPQ080,DPQ090,DPQ100,labels_raw,labels
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
62161.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
62169.0,several days,not at all,several days,several days,several days,more than half the days,not at all,several days,not at all,not at all,7,0
62172.0,several days,more than half the days,several days,several days,several days,not at all,not at all,not at all,not at all,several days,7,0
62174.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
62176.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0


## Select Features

In [6]:
features_ = [
    #diabetes
    'DID040','DIQ160','DIQ170','DIQ172','DIQ175A','DIQ175B','DIQ175C',
    'DIQ175D','DIQ175E','DIQ175F','DIQ175G','DIQ175H','DIQ175I','DIQ175J','DIQ175K',
    'DIQ175L', 'DIQ175M','DIQ175N','DIQ175O','DIQ175P','DIQ175Q','DIQ175R','DIQ175S',
    'DIQ175T','DIQ175U','DIQ175V','DIQ175W','DIQ180','DIQ050','DID060','DIQ060U',
    'DIQ070','DIQ230','DIQ240','DID250','DID260','DIQ260U','DIQ275','DIQ280','DIQ291',
    'DIQ300S','DIQ300D','DID310S','DID310D','DID320','DID330','DID341','DID350',
    'DIQ350U','DIQ360','DIQ080', 
    #sleep disorder
    'SLQ050', 
    #physical activity
    'PAQ605','PAQ610','PAD615','PAQ620','PAQ625','PAD630','PAQ635','PAQ640','PAD645',
    'PAQ650','PAQ655','PAD660','PAQ665','PAQ670','PAD675','PAD680','PAQ706','PAQ710',
    'PAQ715', 
    #weight history
    'WHD010','WHD020','WHD050','WHQ060','WHD080A','WHD080B',
    'WHD080C','WHD080D','WHD080E','WHD080F','WHD080G','WHD080H','WHD080I','WHD080J',
    'WHD080K','WHD080M','WHD080N','WHD080O','WHD080P','WHD080Q','WHD080R','WHD080S',
    'WHD080T','WHD080L','WHD110','WHD120','WHD130','WHD140','WHQ150', 
    #early childhood
    'ECD010','ECQ020','ECD070A','ECD070B','ECQ080','ECQ090','WHQ030E','MCQ080E',
    'ECQ150', 
    #alcohol issues
    'ALQ101','ALQ110','ALQ120Q','ALQ120U','ALQ141Q','ALQ141U','ALQ151',
    #early childhood
    'ECD010','ECQ020','ECD070A','ECD070B','ECQ080','ECQ090','WHQ030E','MCQ080E','ECQ150', 
    #hospital access
    'HUQ010','HUQ020','HUQ030','HUQ071','HUD080','HUQ090', 
    #health status
    'HSD010','HSQ500','HSQ510','HSQ520','HSQ571','HSQ580','HSQ590','HSAQUEX',  
    #income
    'INQ012','INQ030','INQ060','INQ080','INQ090','INQ132','INQ150',
    'IND235','INDFMMPI','INDFMMPC', 
    #housing
    'HOD050','HOQ065', 
    #occupation
    'OCD150','OCQ180','OCQ210','OCQ260','OCD270','OCQ380','OCD390G','OCD395', 
    #diet nutrition
    'DBQ010','DBD030','DBD041','DBD050','DBD055','DBD061','DBQ073A','DBQ073B','DBQ073C',
    'DBQ073D','DBQ073E','DBQ073U','DBQ700','DBQ197','DBQ223A','DBQ223B','DBQ223C',
    'DBQ223D','DBQ223E','DBQ223U','DBQ229','DBQ235A','DBQ235B','DBQ235C','DBQ301',
    'DBQ330','DBQ360','DBQ370','DBD381','DBQ390','DBQ400','DBD411','DBQ421','DBQ424',
    'DBD895','DBD900','DBD905','DBD910', 
    #drug use
    'DUQ200','DUQ210','DUQ211','DUQ213','DUQ215Q','DUQ215U','DUQ217','DUQ219','DUQ220Q',
    'DUQ220U','DUQ230','DUQ240','DUQ250','DUQ260','DUQ270Q','DUQ270U','DUQ272','DUQ280',
    'DUQ290','DUQ300','DUQ310Q','DUQ310U','DUQ320','DUQ330','DUQ340','DUQ350Q','DUQ350U',
    'DUQ352','DUQ360','DUQ370','DUQ380A','DUQ380B','DUQ380C','DUQ380D','DUQ380E',
    'DUQ390','DUQ400Q','DUQ400U','DUQ410','DUQ420', 
    
]

features = []

for feature in features_:
    if feature not in features and feature != 'SEQN':
        features.append(feature)

## Split Data

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X = all_data_df[features]
y = mental_health_df['labels']
print(X.shape)
y.shape

(15513, 228)


(15513,)

In [9]:
# train: 80%
# test: 10%
# val: 10%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=55)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=55)

## Dummy Classifier

In [10]:
dummy_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', DummyClassifier(strategy='stratified', random_state=55))
])
dummy_pipe.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('enc', OneHotEncoder(handle_unknown='ignore', sparse=False)),
                ('red', PCA(n_components=10)),
                ('clf',
                 DummyClassifier(random_state=55, strategy='stratified'))])

In [11]:
dummy_training_score = roc_auc_score(y_train.values, dummy_pipe.predict_proba(X_train)[:, 1])
dummy_validation_score = roc_auc_score(y_val.values, dummy_pipe.predict_proba(X_val)[:, 1])
print(dummy_training_score)
dummy_validation_score

0.5032121476581153


0.5105304456999372

## Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [13]:
rand_for_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', RandomForestClassifier())
])
rand_for_pipe.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('enc', OneHotEncoder(handle_unknown='ignore', sparse=False)),
                ('red', PCA(n_components=10)),
                ('clf', RandomForestClassifier())])

In [14]:
randfor_training_score = roc_auc_score(y_train.values, rand_for_pipe.predict_proba(X_train)[:, 1])
randfor_validation_score = roc_auc_score(y_val.values, rand_for_pipe.predict_proba(X_val)[:, 1])
print(randfor_training_score)
randfor_validation_score

1.0


0.7455482318476667

### Hyperparameter Tuning

In [None]:
rand_for_pipe.get_params().keys()

In [None]:
grid_params = {
  'clf__max_depth': [2, 3],  
  'clf__n_estimators': [10, 100, 1000],  
}

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)
grid_search = GridSearchCV(
    estimator=rand_for_pipe, 
    param_grid=grid_params, 
    n_jobs=-1, 
    cv=cv, 
    scoring='roc_auc',
    error_score=0
)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
print(grid_result.best_score_)
grid_result.best_params_

### Test the Hyperparameters

In [None]:
rand_for_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', RandomForestClassifier(max_depth=3, n_estimators=1000))
])
rand_for_pipe.fit(X_train, y_train)

In [None]:
randfor_training_score = roc_auc_score(y_train.values, rand_for_pipe.predict_proba(X_train)[:, 1])
randfor_validation_score = roc_auc_score(y_val.values, rand_for_pipe.predict_proba(X_val)[:, 1])
print(randfor_training_score)
randfor_validation_score

### Repeat

In [None]:
grid_params = {
  'clf__max_depth': [4, 5],  
  'clf__n_estimators': [10, 100, 1000],  
}

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)
grid_search = GridSearchCV(
    estimator=rand_for_pipe, 
    param_grid=grid_params, 
    n_jobs=-1, 
    cv=cv, 
    scoring='roc_auc',
    error_score=0
)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
print(grid_result.best_score_)
grid_result.best_params_

In [15]:
rand_for_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', RandomForestClassifier(max_depth=5, n_estimators=1000))
])
rand_for_pipe.fit(X_train, y_train)

randfor_training_score = roc_auc_score(y_train.values, rand_for_pipe.predict_proba(X_train)[:, 1])
randfor_validation_score = roc_auc_score(y_val.values, rand_for_pipe.predict_proba(X_val)[:, 1])
print(randfor_training_score)
randfor_validation_score

0.835330369476281


0.7526679221594476

### And Repeat

In [None]:
grid_params = {
  'clf__max_depth': [9, 10],  
  'clf__n_estimators': [100, 1000],  
}

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)
grid_search = GridSearchCV(
    estimator=rand_for_pipe, 
    param_grid=grid_params, 
    n_jobs=-1, 
    cv=cv, 
    scoring='roc_auc',
    error_score=0
)
grid_result = grid_search.fit(X_train, y_train)
print(grid_result.best_score_)
grid_result.best_params_

In [None]:
grid_params = {
  'clf__max_depth': [6, 7, 8, 9],  
  'clf__n_estimators': [1000],  
}

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)
grid_search = GridSearchCV(
    estimator=rand_for_pipe, 
    param_grid=grid_params, 
    n_jobs=-1, 
    cv=cv, 
    scoring='roc_auc',
    error_score=0
)
grid_result = grid_search.fit(X_train, y_train)
print(grid_result.best_score_)
grid_result.best_params_

In [None]:
grid_params = {
  'clf__max_depth': [5, 7],  
  'clf__n_estimators': [1000],  
}

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)
grid_search = GridSearchCV(
    estimator=rand_for_pipe, 
    param_grid=grid_params, 
    n_jobs=-1, 
    cv=cv, 
    scoring='roc_auc',
    error_score=0
)
grid_result = grid_search.fit(X_train, y_train)
print(grid_result.best_score_)
grid_result.best_params_

In [16]:
rand_for_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', RandomForestClassifier(max_depth=7, n_estimators=1000))
])
rand_for_pipe.fit(X_train, y_train)

randfor_training_score = roc_auc_score(y_train.values, rand_for_pipe.predict_proba(X_train)[:, 1])
randfor_validation_score = roc_auc_score(y_val.values, rand_for_pipe.predict_proba(X_val)[:, 1])
print(randfor_training_score)
randfor_validation_score

0.8905683864319413


0.7601276417660597

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
naive_bay_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
naive_bay_pipe.fit(X_train, y_train)

In [None]:
naivebay_training_score = roc_auc_score(y_train.values, naive_bay_pipe.predict_proba(X_train)[:, 1])
naivebay_validation_score = roc_auc_score(y_val.values, naive_bay_pipe.predict_proba(X_val)[:, 1])
print(naivebay_training_score)
naivebay_validation_score

### Hyperparameter Tuning

In [None]:
naive_bay_pipe.get_params().keys()

In [None]:
# not in use
grid_params = {
    'uhh...'
    # I don't quite understand the doc, but I don't think it needs hyperparameter tuning...
    # ...maybe?
}

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)
grid_search = GridSearchCV(
    estimator=naive_bay_pipe, 
    param_grid=grid_params, 
    n_jobs=-1, 
    cv=cv, 
    scoring='roc_auc',
    error_score=0
)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# not in use
print(grid_result.best_score_)
grid_result.best_params_

## ~~Isotonic Regression~~ Our broken hopes and dreams

In [None]:
# not in use
from sklearn.isotonic import IsotonicRegression
from sklearn.datasets import make_regression

In [None]:
# not in use
iso_reg_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', IsotonicRegression())
])
iso_reg_pipe.fit(X_train, y_train)

In [None]:
# not in use
isoreg_training_score = roc_auc_score(y_train.values, iso_reg_pipe.predict_proba(X_train)[:, 1])
isoreg_validation_score = roc_auc_score(y_val.values, iso_reg_pipe.predict_proba(X_val)[:, 1])
print(isoreg_training_score)
isoreg_validation_score

## Reference Links

Ignore warnings - https://stackoverflow.com/questions/14463277/how-to-disable-python-warnings \
What is PCA - https://www.upgrad.com/blog/pca-in-machine-learning/ \
Random Forest doc - https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html \
Naive Bayes doc - https://scikit-learn.org/stable/modules/naive_bayes.html \
Gaussian Naive Bayes doc - https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html \
Isotonic Regression doc - https://scikit-learn.org/stable/modules/generated/sklearn.isotonic.IsotonicRegression.html \
Isotonic Regression info - https://scikit-learn.org/stable/modules/isotonic.html#isotonic \
Why we can't use isoreg - **data dimensions are unsuitable**