In [1]:
import pandas as pd
import numpy as np

from  sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance

In [3]:
all_data_df = pd.read_csv('all_data_df.csv')
all_data_df.index = all_data_df['SEQN']

mental_health_df = all_data_df.loc[:, 'DPQ010':'DPQ100'].dropna(how='all')
all_data_df = all_data_df.loc[mental_health_df.index]

def mh(x):
    if x == '\.':
        return 'missing'
    elif x == 1:
        return 'several days'
    elif x == 2:
        return 'more than half the days'
    elif x == 3:
        return 'nearly every day'
    elif x == 7:
        return 'refused'
    elif x == 9:
        return "don't know"
    else:
        return 'not at all'

for col in mental_health_df.columns:
    mental_health_df[col] = mental_health_df[col].apply(lambda x: mh(x))
    
def calc(row):
    sum = 0
    for i in ['DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 
              'DPQ050', 'DPQ060', 'DPQ070','DPQ080', 
              'DPQ090', 'DPQ100']:
        if row[i] == 'several days':
            sum += 1
        if row[i] == 'more than half the days':
            sum += 2
        if row[i] == 'nearly every day':
            sum += 3
    return sum

mental_health_df['labels_raw'] = mental_health_df.apply(calc, axis=1)
mental_health_df['labels'] = mental_health_df['labels_raw'].apply(lambda x: 1 if x >= 10 else 0)

mental_health_df.to_csv('mental_health_df.csv')
mental_health_df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,DPQ010,DPQ020,DPQ030,DPQ040,DPQ050,DPQ060,DPQ070,DPQ080,DPQ090,DPQ100,labels_raw,labels
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
62161.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
62169.0,several days,not at all,several days,several days,several days,more than half the days,not at all,several days,not at all,not at all,7,0
62172.0,several days,more than half the days,several days,several days,several days,not at all,not at all,not at all,not at all,several days,7,0
62174.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
62176.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
93691.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
93695.0,not at all,several days,not at all,not at all,not at all,not at all,not at all,not at all,not at all,more than half the days,3,0
93696.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
93697.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0


In [25]:
features = [
    #diabetes
    'DIQ010','DID040','DIQ160','DIQ170','DIQ172','DIQ175A','DIQ175B','DIQ175C',
    'DIQ175D','DIQ175E','DIQ175F','DIQ175G','DIQ175H','DIQ175I','DIQ175J','DIQ175K',
    'DIQ175L', 'DIQ175M','DIQ175N','DIQ175O','DIQ175P','DIQ175Q','DIQ175R','DIQ175S',
    'DIQ175T','DIQ175U','DIQ175V','DIQ175W','DIQ180','DIQ050','DID060','DIQ060U',
    'DIQ070','DIQ230','DIQ240','DID250','DID260','DIQ260U','DIQ275','DIQ280','DIQ291',
    'DIQ300S','DIQ300D','DID310S','DID310D','DID320','DID330','DID341','DID350',
    'DIQ350U','DIQ360','DIQ080', 
    #sleep disorder
    'SEQN','SLQ050', 
    #physical activity
    'PAQ605','PAQ610','PAD615','PAQ620','PAQ625','PAD630','PAQ635','PAQ640','PAD645',
    'PAQ650','PAQ655','PAD660','PAQ665','PAQ670','PAD675','PAD680','PAQ706','PAQ710',
    'PAQ715', 
    #weight history
    'WHD010','WHD020','WHQ030','WHQ040','WHD050','WHQ060','WHQ070','WHD080A','WHD080B',
    'WHD080C','WHD080D','WHD080E','WHD080F','WHD080G','WHD080H','WHD080I','WHD080J',
    'WHD080K','WHD080M','WHD080N','WHD080O','WHD080P','WHD080Q','WHD080R','WHD080S',
    'WHD080T','WHD080L','WHD110','WHD120','WHD130','WHD140','WHQ150', 
    #early childhood
    'ECD010','ECQ020','ECD070A','ECD070B','ECQ080','ECQ090','WHQ030E','MCQ080E',
    'ECQ150', 
    #alcohol issues
    'ALQ101','ALQ110','ALQ120Q','ALQ120U','ALQ130','ALQ141Q','ALQ141U','ALQ151',
    #hospital access
    'HUQ010','HUQ020','HUQ030','HUQ071','HUD080','HUQ090', 
    #health status
    'HSD010','HSQ500','HSQ510','HSQ520','HSQ571','HSQ580','HSQ590','HSAQUEX',  
    #income
    #housing
    'SEQN','HOD050','HOQ065', 
    #occupation
    'OCD150','OCQ180','OCQ210','OCQ260','OCD270','OCQ380','OCD390G','OCD395', 
    #mental health
    'DPQ010','DPQ020','DPQ030','DPQ040','DPQ050','DPQ060','DPQ070','DPQ080','DPQ090','DPQ100', 
    #demographic
    #diet nutrition
    'DBQ010','DBD030','DBD041','DBD050','DBD055','DBD061','DBQ073A','DBQ073B','DBQ073C',
    'DBQ073D','DBQ073E','DBQ073U','DBQ700','DBQ197','DBQ223A','DBQ223B','DBQ223C',
    'DBQ223D','DBQ223E','DBQ223U','DBQ229','DBQ235A','DBQ235B','DBQ235C','DBQ301',
    'DBQ330','DBQ360','DBQ370','DBD381','DBQ390','DBQ400','DBD411','DBQ421','DBQ424',
    'DBD895','DBD900','DBD905','DBD910', 
    #drug use
    'DUQ200','DUQ210','DUQ211','DUQ213','DUQ215Q','DUQ215U','DUQ217','DUQ219','DUQ220Q',
    'DUQ220U','DUQ230','DUQ240','DUQ250','DUQ260','DUQ270Q','DUQ270U','DUQ272','DUQ280',
    'DUQ290','DUQ300','DUQ310Q','DUQ310U','DUQ320','DUQ330','DUQ340','DUQ350Q','DUQ350U',
    'DUQ352','DUQ360','DUQ370','DUQ380A','DUQ380B','DUQ380C','DUQ380D','DUQ380E',
    'DUQ390','DUQ400Q','DUQ400U','DUQ410','DUQ420', 
    
]

len(features)

235

In [33]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [34]:
nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('enc', OneHotEncoder(handle_unknown='ignore', sparse=False)),
                ('red', PCA(n_components=10)), ('clf', GaussianNB())])

In [35]:
nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.9676028012786014


0.9700446172998809

## Feature Importance

### Round 1

In [36]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)

In [37]:
feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

feature_importances.sort_values('importance_means', ascending = False)

Unnamed: 0,importance_means,importances_std
DPQ100,0.009671,0.002215
DPQ020,0.007608,0.001930
DPQ030,0.006705,0.001048
DPQ010,0.006125,0.001663
DPQ050,0.005932,0.001930
...,...,...
OCD390G,-0.000967,0.000828
HSQ590,-0.001161,0.001520
DIQ170,-0.001418,0.000562
DIQ172,-0.001870,0.000787


In [38]:
remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

128


['DIQ160',
 'DIQ170',
 'DIQ172',
 'DIQ175A',
 'DIQ175B',
 'DIQ175C',
 'DIQ175D',
 'DIQ175E',
 'DIQ175F',
 'DIQ175G',
 'DIQ175H',
 'DIQ175I',
 'DIQ175J',
 'DIQ175K',
 'DIQ175L',
 'DIQ175M',
 'DIQ175N',
 'DIQ175O',
 'DIQ175P',
 'DIQ175Q',
 'DIQ175R',
 'DIQ175S',
 'DIQ175T',
 'DIQ175U',
 'DIQ175V',
 'DIQ175W',
 'DIQ180',
 'DIQ050',
 'DID060',
 'DIQ060U',
 'DIQ240',
 'DIQ260U',
 'DID320',
 'DID330',
 'DID350',
 'SEQN',
 'PAQ605',
 'PAQ610',
 'PAD630',
 'PAQ706',
 'WHD020',
 'WHD050',
 'WHD080A',
 'WHD080B',
 'WHD080C',
 'WHD080D',
 'WHD080E',
 'WHD080F',
 'WHD080G',
 'WHD080H',
 'WHD080I',
 'WHD080J',
 'WHD080K',
 'WHD080M',
 'WHD080N',
 'WHD080O',
 'WHD080P',
 'WHD080Q',
 'WHD080R',
 'WHD080S',
 'WHD080T',
 'WHD080L',
 'WHD110',
 'WHD140',
 'WHQ150',
 'ECD010',
 'ECQ020',
 'ECD070A',
 'ECD070B',
 'ECQ080',
 'ECQ090',
 'WHQ030E',
 'MCQ080E',
 'ECQ150',
 'HSQ500',
 'HSQ571',
 'HSQ580',
 'HSAQUEX',
 'SEQN',
 'OCD150',
 'OCQ180',
 'OCQ210',
 'OCD390G',
 'DBQ010',
 'DBD030',
 'DBD041',
 'DBD05

In [39]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

90


['DIQ010',
 'DID040',
 'DID260',
 'DIQ275',
 'DIQ291',
 'DID310S',
 'DID310D',
 'DID341',
 'DIQ350U',
 'DIQ360',
 'DIQ080',
 'SLQ050',
 'PAQ620',
 'PAQ625',
 'PAQ635',
 'PAQ640',
 'PAD645',
 'PAQ650',
 'PAQ665',
 'PAQ670',
 'PAD675',
 'PAD680',
 'PAQ710',
 'PAQ715',
 'WHD010',
 'WHQ030',
 'WHQ040',
 'WHQ070',
 'WHD120',
 'WHD130',
 'ALQ101',
 'ALQ120U',
 'ALQ130',
 'ALQ141Q',
 'ALQ141U',
 'ALQ151',
 'HUQ010',
 'HUQ020',
 'HUQ030',
 'HUQ071',
 'HUQ090',
 'HSD010',
 'HSQ510',
 'HSQ520',
 'HSQ590',
 'HOD050',
 'HOQ065',
 'OCQ260',
 'OCQ380',
 'OCD395',
 'DPQ010',
 'DPQ020',
 'DPQ030',
 'DPQ040',
 'DPQ050',
 'DPQ060',
 'DPQ070',
 'DPQ080',
 'DPQ090',
 'DPQ100',
 'DBQ700',
 'DBQ197',
 'DBQ229',
 'DBQ235B',
 'DBQ235C',
 'DBD895',
 'DBD900',
 'DBD905',
 'DUQ200',
 'DUQ210',
 'DUQ211',
 'DUQ213',
 'DUQ215Q',
 'DUQ215U',
 'DUQ217',
 'DUQ219',
 'DUQ220Q',
 'DUQ230',
 'DUQ240',
 'DUQ260',
 'DUQ270Q',
 'DUQ272',
 'DUQ330',
 'DUQ340',
 'DUQ350Q',
 'DUQ370',
 'DUQ390',
 'DUQ400Q',
 'DUQ410',
 'DUQ42

In [40]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.9692019203304444


0.971734804220015

### Round 2

In [41]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

20


['DID040',
 'DID260',
 'DIQ275',
 'DIQ291',
 'DID310S',
 'DID310D',
 'DID341',
 'DIQ350U',
 'DIQ360',
 'DIQ080',
 'WHD120',
 'HSQ520',
 'DUQ230',
 'DUQ340',
 'DUQ350Q',
 'DUQ370',
 'DUQ390',
 'DUQ400Q',
 'DUQ410',
 'DUQ420']

In [32]:
print(features)

['DIQ010', 'DID040', 'DIQ160', 'DIQ170', 'DIQ172', 'DIQ175A', 'DIQ175B', 'DIQ175C', 'DIQ175D', 'DIQ175E', 'DIQ175F', 'DIQ175G', 'DIQ175H', 'DIQ175I', 'DIQ175J', 'DIQ175K', 'DIQ175L', 'DIQ175M', 'DIQ175N', 'DIQ175O', 'DIQ175P', 'DIQ175Q', 'DIQ175R', 'DIQ175S', 'DIQ175T', 'DIQ175U', 'DIQ175V', 'DIQ175W', 'DIQ180', 'DIQ050', 'DID060', 'DIQ060U', 'DIQ240', 'DID260', 'DIQ260U', 'DIQ275', 'DIQ291', 'DID310S', 'DID310D', 'DID320', 'DID330', 'DID341', 'DID350', 'DIQ350U', 'DIQ360', 'DIQ080', 'SEQN', 'SLQ050', 'PAQ605', 'PAQ610', 'PAQ620', 'PAQ625', 'PAD630', 'PAQ635', 'PAQ640', 'PAD645', 'PAQ650', 'PAQ665', 'PAQ670', 'PAD675', 'PAD680', 'PAQ706', 'PAQ710', 'PAQ715', 'WHD010', 'WHD020', 'WHQ030', 'WHQ040', 'WHD050', 'WHQ070', 'WHD080A', 'WHD080B', 'WHD080C', 'WHD080D', 'WHD080E', 'WHD080F', 'WHD080G', 'WHD080H', 'WHD080I', 'WHD080J', 'WHD080K', 'WHD080M', 'WHD080N', 'WHD080O', 'WHD080P', 'WHD080Q', 'WHD080R', 'WHD080S', 'WHD080T', 'WHD080L', 'WHD110', 'WHD120', 'WHD130', 'WHD140', 'WHQ150', 'EC

In [42]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

70


['DIQ010',
 'SLQ050',
 'PAQ620',
 'PAQ625',
 'PAQ635',
 'PAQ640',
 'PAD645',
 'PAQ650',
 'PAQ665',
 'PAQ670',
 'PAD675',
 'PAD680',
 'PAQ710',
 'PAQ715',
 'WHD010',
 'WHQ030',
 'WHQ040',
 'WHQ070',
 'WHD130',
 'ALQ101',
 'ALQ120U',
 'ALQ130',
 'ALQ141Q',
 'ALQ141U',
 'ALQ151',
 'HUQ010',
 'HUQ020',
 'HUQ030',
 'HUQ071',
 'HUQ090',
 'HSD010',
 'HSQ510',
 'HSQ590',
 'HOD050',
 'HOQ065',
 'OCQ260',
 'OCQ380',
 'OCD395',
 'DPQ010',
 'DPQ020',
 'DPQ030',
 'DPQ040',
 'DPQ050',
 'DPQ060',
 'DPQ070',
 'DPQ080',
 'DPQ090',
 'DPQ100',
 'DBQ700',
 'DBQ197',
 'DBQ229',
 'DBQ235B',
 'DBQ235C',
 'DBD895',
 'DBD900',
 'DBD905',
 'DUQ200',
 'DUQ210',
 'DUQ211',
 'DUQ213',
 'DUQ215Q',
 'DUQ215U',
 'DUQ217',
 'DUQ219',
 'DUQ220Q',
 'DUQ240',
 'DUQ260',
 'DUQ270Q',
 'DUQ272',
 'DUQ330']

In [22]:
# testing python to see if remove works properly
l = ['hello', 'hi', 'howdy']
print(l)
r = ['hello']
print(r)

for i in r:
    l.remove(i)

print(l)

['hello', 'hi', 'howdy']
['hello']
['hi', 'howdy']


In [43]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.9688420508991497


0.9720755677119776

### Round 3

In [44]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

19


['PAQ650',
 'PAQ710',
 'WHQ070',
 'ALQ130',
 'HUQ010',
 'HUQ020',
 'HUQ071',
 'HSD010',
 'HSQ510',
 'HOD050',
 'OCQ260',
 'OCQ380',
 'OCD395',
 'DPQ090',
 'DBQ235C',
 'DUQ220Q',
 'DUQ240',
 'DUQ260',
 'DUQ272']

In [45]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

51


['DIQ010',
 'SLQ050',
 'PAQ620',
 'PAQ625',
 'PAQ635',
 'PAQ640',
 'PAD645',
 'PAQ665',
 'PAQ670',
 'PAD675',
 'PAD680',
 'PAQ715',
 'WHD010',
 'WHQ030',
 'WHQ040',
 'WHD130',
 'ALQ101',
 'ALQ120U',
 'ALQ141Q',
 'ALQ141U',
 'ALQ151',
 'HUQ030',
 'HUQ090',
 'HSQ590',
 'HOQ065',
 'DPQ010',
 'DPQ020',
 'DPQ030',
 'DPQ040',
 'DPQ050',
 'DPQ060',
 'DPQ070',
 'DPQ080',
 'DPQ100',
 'DBQ700',
 'DBQ197',
 'DBQ229',
 'DBQ235B',
 'DBD895',
 'DBD900',
 'DBD905',
 'DUQ200',
 'DUQ210',
 'DUQ211',
 'DUQ213',
 'DUQ215Q',
 'DUQ215U',
 'DUQ217',
 'DUQ219',
 'DUQ270Q',
 'DUQ330']

In [46]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.9701043343834325


0.9729979008968894

### Round 4

In [47]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

7


['PAQ625', 'PAD645', 'WHD010', 'DUQ210', 'DUQ215Q', 'DUQ215U', 'DUQ217']

In [48]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

44


['DIQ010',
 'SLQ050',
 'PAQ620',
 'PAQ635',
 'PAQ640',
 'PAQ665',
 'PAQ670',
 'PAD675',
 'PAD680',
 'PAQ715',
 'WHQ030',
 'WHQ040',
 'WHD130',
 'ALQ101',
 'ALQ120U',
 'ALQ141Q',
 'ALQ141U',
 'ALQ151',
 'HUQ030',
 'HUQ090',
 'HSQ590',
 'HOQ065',
 'DPQ010',
 'DPQ020',
 'DPQ030',
 'DPQ040',
 'DPQ050',
 'DPQ060',
 'DPQ070',
 'DPQ080',
 'DPQ100',
 'DBQ700',
 'DBQ197',
 'DBQ229',
 'DBQ235B',
 'DBD895',
 'DBD900',
 'DBD905',
 'DUQ200',
 'DUQ211',
 'DUQ213',
 'DUQ219',
 'DUQ270Q',
 'DUQ330']

In [49]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.9689295379538754


0.9719392623151926

### Round 5

In [50]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

4


['PAD680', 'WHQ030', 'WHQ040', 'DBQ197']

In [51]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

40


['DIQ010',
 'SLQ050',
 'PAQ620',
 'PAQ635',
 'PAQ640',
 'PAQ665',
 'PAQ670',
 'PAD675',
 'PAQ715',
 'WHD130',
 'ALQ101',
 'ALQ120U',
 'ALQ141Q',
 'ALQ141U',
 'ALQ151',
 'HUQ030',
 'HUQ090',
 'HSQ590',
 'HOQ065',
 'DPQ010',
 'DPQ020',
 'DPQ030',
 'DPQ040',
 'DPQ050',
 'DPQ060',
 'DPQ070',
 'DPQ080',
 'DPQ100',
 'DBQ700',
 'DBQ229',
 'DBQ235B',
 'DBD895',
 'DBD900',
 'DBD905',
 'DUQ200',
 'DUQ211',
 'DUQ213',
 'DUQ219',
 'DUQ270Q',
 'DUQ330']

In [52]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.9712506853788595


0.9759193799013149

### Round 6

In [53]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

5


['PAQ620', 'ALQ141Q', 'HSQ590', 'DBQ700', 'DUQ270Q']

In [54]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

35


['DIQ010',
 'SLQ050',
 'PAQ635',
 'PAQ640',
 'PAQ665',
 'PAQ670',
 'PAD675',
 'PAQ715',
 'WHD130',
 'ALQ101',
 'ALQ120U',
 'ALQ141U',
 'ALQ151',
 'HUQ030',
 'HUQ090',
 'HOQ065',
 'DPQ010',
 'DPQ020',
 'DPQ030',
 'DPQ040',
 'DPQ050',
 'DPQ060',
 'DPQ070',
 'DPQ080',
 'DPQ100',
 'DBQ229',
 'DBQ235B',
 'DBD895',
 'DBD900',
 'DBD905',
 'DUQ200',
 'DUQ211',
 'DUQ213',
 'DUQ219',
 'DUQ330']

In [55]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.980722390532083


0.9848337528510546

### Round 7

In [56]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

11


['PAQ715',
 'ALQ101',
 'ALQ120U',
 'ALQ141U',
 'DBD895',
 'DBD900',
 'DBD905',
 'DUQ200',
 'DUQ211',
 'DUQ213',
 'DUQ219']

In [57]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

24


['DIQ010',
 'SLQ050',
 'PAQ635',
 'PAQ640',
 'PAQ665',
 'PAQ670',
 'PAD675',
 'WHD130',
 'ALQ151',
 'HUQ030',
 'HUQ090',
 'HOQ065',
 'DPQ010',
 'DPQ020',
 'DPQ030',
 'DPQ040',
 'DPQ050',
 'DPQ060',
 'DPQ070',
 'DPQ080',
 'DPQ100',
 'DBQ229',
 'DBQ235B',
 'DUQ330']

In [58]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.9805114780448774


0.9838977891264642

### Round 8

In [59]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

1


['DIQ010']

In [60]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

23


['SLQ050',
 'PAQ635',
 'PAQ640',
 'PAQ665',
 'PAQ670',
 'PAD675',
 'WHD130',
 'ALQ151',
 'HUQ030',
 'HUQ090',
 'HOQ065',
 'DPQ010',
 'DPQ020',
 'DPQ030',
 'DPQ040',
 'DPQ050',
 'DPQ060',
 'DPQ070',
 'DPQ080',
 'DPQ100',
 'DBQ229',
 'DBQ235B',
 'DUQ330']

In [61]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.981761828322647


0.9847610566394359

### Round 9

In [62]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

3


['WHD130', 'HUQ030', 'HOQ065']

In [63]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

20


['SLQ050',
 'PAQ635',
 'PAQ640',
 'PAQ665',
 'PAQ670',
 'PAD675',
 'ALQ151',
 'HUQ090',
 'DPQ010',
 'DPQ020',
 'DPQ030',
 'DPQ040',
 'DPQ050',
 'DPQ060',
 'DPQ070',
 'DPQ080',
 'DPQ100',
 'DBQ229',
 'DBQ235B',
 'DUQ330']

In [64]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.9832152512484598


0.9857333684698356

### Round 10

In [65]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

2


['HUQ090', 'DUQ330']

In [66]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

18


['SLQ050',
 'PAQ635',
 'PAQ640',
 'PAQ665',
 'PAQ670',
 'PAD675',
 'ALQ151',
 'DPQ010',
 'DPQ020',
 'DPQ030',
 'DPQ040',
 'DPQ050',
 'DPQ060',
 'DPQ070',
 'DPQ080',
 'DPQ100',
 'DBQ229',
 'DBQ235B']

In [67]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.9843817221851006


0.9866647886811999

### Round 11

In [68]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

0


[]

In [69]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

18


['SLQ050',
 'PAQ635',
 'PAQ640',
 'PAQ665',
 'PAQ670',
 'PAD675',
 'ALQ151',
 'DPQ010',
 'DPQ020',
 'DPQ030',
 'DPQ040',
 'DPQ050',
 'DPQ060',
 'DPQ070',
 'DPQ080',
 'DPQ100',
 'DBQ229',
 'DBQ235B']

In [70]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.9843815140477776


0.9866647886811999

### Round 12

In [72]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

0


[]

In [73]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

18


['SLQ050',
 'PAQ635',
 'PAQ640',
 'PAQ665',
 'PAQ670',
 'PAD675',
 'ALQ151',
 'DPQ010',
 'DPQ020',
 'DPQ030',
 'DPQ040',
 'DPQ050',
 'DPQ060',
 'DPQ070',
 'DPQ080',
 'DPQ100',
 'DBQ229',
 'DBQ235B']

In [74]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.9843819303224235


0.9866647886811999