In [1]:
import pandas as pd
import numpy as np

from  sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance

## Import + format data- and save it this time

In [2]:
all_data_df = pd.read_csv('all_data_df.csv')
all_data_df.index = all_data_df['SEQN']

mental_health_df = all_data_df.loc[:, 'DPQ010':'DPQ100'].dropna(how='all')
all_data_df = all_data_df.loc[mental_health_df.index]

def mh(x):
    if x == '\.':
        return 'missing'
    elif x == 1:
        return 'several days'
    elif x == 2:
        return 'more than half the days'
    elif x == 3:
        return 'nearly every day'
    elif x == 7:
        return 'refused'
    elif x == 9:
        return "don't know"
    else:
        return 'not at all'

for col in mental_health_df.columns:
    mental_health_df[col] = mental_health_df[col].apply(lambda x: mh(x))
    
def calc(row):
    sum = 0
    for i in ['DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 
              'DPQ050', 'DPQ060', 'DPQ070','DPQ080', 
              'DPQ090', 'DPQ100']:
        if row[i] == 'several days':
            sum += 1
        if row[i] == 'more than half the days':
            sum += 2
        if row[i] == 'nearly every day':
            sum += 3
    return sum

mental_health_df['labels_raw'] = mental_health_df.apply(calc, axis=1)
mental_health_df['labels'] = mental_health_df['labels_raw'].apply(lambda x: 1 if x >= 10 else 0)

mental_health_df.to_csv('mental_health_df.csv')
mental_health_df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,DPQ010,DPQ020,DPQ030,DPQ040,DPQ050,DPQ060,DPQ070,DPQ080,DPQ090,DPQ100,labels_raw,labels
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
62161.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
62169.0,several days,not at all,several days,several days,several days,more than half the days,not at all,several days,not at all,not at all,7,0
62172.0,several days,more than half the days,several days,several days,several days,not at all,not at all,not at all,not at all,several days,7,0
62174.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
62176.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
93691.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
93695.0,not at all,several days,not at all,not at all,not at all,not at all,not at all,not at all,not at all,more than half the days,3,0
93696.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
93697.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0


In [35]:
features = [
    #diabetes
    'DIQ170','DIQ175A','DIQ175B','DIQ175C',
    'DIQ175D','DIQ175E','DIQ175F','DIQ175G','DIQ175H','DIQ175I','DIQ175J','DIQ175K',
    'DIQ175L', 'DIQ175M','DIQ175N','DIQ175O','DIQ175P','DIQ175Q','DIQ175R','DIQ175S',
    'DIQ175T','DIQ175U','DIQ175V','DIQ175W',
    'DIQ260U',
    #sleep disorder
    #physical activity
    'PAQ706',
    #weight history
    'WHD080A','WHD080B',
    'WHD080C','WHD080D','WHD080E','WHD080F','WHD080G','WHD080H','WHD080I','WHD080J',
    'WHD080K','WHD080M','WHD080N','WHD080O','WHD080P','WHD080Q','WHD080R','WHD080S',
    'WHD080T','WHD080L',
    #early childhood
    'ECD010','ECQ020','ECD070A','ECD070B','ECQ080','ECQ090','WHQ030E','MCQ080E',
    'ECQ150', 
    #alcohol issues
    #early childhood
    #hospital access
    'HUQ071','HUQ090', 
    #health status
    'HSQ520','HSAQUEX',  
    #income
    #housing
    #occupation
    #diet nutrition
    'DBQ010','DBD030','DBD041','DBD050','DBD055','DBD061','DBQ073A','DBQ073B','DBQ073C',
    'DBQ073D','DBQ073E','DBQ073U','DBQ700','DBQ223A','DBQ223B','DBQ223C',
    'DBQ223D','DBQ223E','DBQ223U',
    'DBQ330','DBQ390','DBQ400','DBD411','DBQ421',
    #drug use
    'DUQ250','DUQ320',
    'DUQ380A','DUQ380B','DUQ380C','DUQ380D','DUQ380E',
]

print(len(features))

90


In [36]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [37]:
randfor_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', RandomForestClassifier(max_depth=7, n_estimators=1000, random_state=42))
])
randfor_pipe.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('enc', OneHotEncoder(handle_unknown='ignore', sparse=False)),
                ('red', PCA(n_components=10)),
                ('clf',
                 RandomForestClassifier(max_depth=7, n_estimators=1000,
                                        random_state=42))])

In [38]:
randfor_training_score = roc_auc_score(y_train.values, randfor_pipe.predict_proba(X_train)[:, 1])
randfor_validation_score = roc_auc_score(y_val.values, randfor_pipe.predict_proba(X_val)[:, 1])
print(randfor_training_score)
randfor_validation_score

0.7443638667779601


0.7050442083836906

## Feature Importance

### Round 1

In [39]:
r = permutation_importance(
    randfor_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)

In [40]:
feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

feature_importances.sort_values('importance_means', ascending = False)

Unnamed: 0,importance_means,importances_std
HUQ090,0.008640,0.001230
DBQ700,0.007286,0.001658
HUQ071,0.004191,0.000967
DIQ170,0.001999,0.001096
HSQ520,0.001805,0.000948
...,...,...
WHD080D,0.000000,0.000000
WHD080C,0.000000,0.000000
WHD080B,0.000000,0.000000
WHD080A,0.000000,0.000000


In [41]:
remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

78


['DIQ175A',
 'DIQ175B',
 'DIQ175C',
 'DIQ175D',
 'DIQ175E',
 'DIQ175F',
 'DIQ175G',
 'DIQ175H',
 'DIQ175I',
 'DIQ175J',
 'DIQ175K',
 'DIQ175L',
 'DIQ175M',
 'DIQ175N',
 'DIQ175O',
 'DIQ175P',
 'DIQ175Q',
 'DIQ175R',
 'DIQ175S',
 'DIQ175T',
 'DIQ175U',
 'DIQ175V',
 'DIQ175W',
 'PAQ706',
 'WHD080A',
 'WHD080B',
 'WHD080C',
 'WHD080D',
 'WHD080E',
 'WHD080F',
 'WHD080G',
 'WHD080H',
 'WHD080I',
 'WHD080J',
 'WHD080K',
 'WHD080M',
 'WHD080N',
 'WHD080O',
 'WHD080P',
 'WHD080Q',
 'WHD080R',
 'WHD080S',
 'WHD080T',
 'WHD080L',
 'ECD010',
 'ECQ020',
 'ECD070A',
 'ECD070B',
 'ECQ080',
 'ECQ090',
 'WHQ030E',
 'MCQ080E',
 'ECQ150',
 'HSAQUEX',
 'DBQ010',
 'DBD030',
 'DBD041',
 'DBD050',
 'DBD055',
 'DBD061',
 'DBQ073A',
 'DBQ073B',
 'DBQ073C',
 'DBQ073D',
 'DBQ073E',
 'DBQ073U',
 'DBQ223A',
 'DBQ223B',
 'DBQ223C',
 'DBQ223D',
 'DBQ223E',
 'DBQ223U',
 'DBQ400',
 'DUQ380A',
 'DUQ380B',
 'DUQ380C',
 'DUQ380D',
 'DUQ380E']

In [42]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

12


['DIQ170',
 'DIQ260U',
 'HUQ071',
 'HUQ090',
 'HSQ520',
 'DBQ700',
 'DBQ330',
 'DBQ390',
 'DBD411',
 'DBQ421',
 'DUQ250',
 'DUQ320']

In [45]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


rf_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', RandomForestClassifier(max_depth=7, n_estimators=1000, random_state=42))
])
rf_pipe.fit(X_train, y_train)


rf_training_score = roc_auc_score(y_train.values, rf_pipe.predict_proba(X_train)[:, 1])
rf_validation_score = roc_auc_score(y_val.values, rf_pipe.predict_proba(X_val)[:, 1])
print(rf_training_score)
rf_validation_score

0.7438470618051987


0.7027065708288276

### Round 2

In [46]:
r = permutation_importance(
    rf_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

4


['DBQ390', 'DBD411', 'DBQ421', 'DUQ320']

In [47]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

8


['DIQ170',
 'DIQ260U',
 'HUQ071',
 'HUQ090',
 'HSQ520',
 'DBQ700',
 'DBQ330',
 'DUQ250']

In [48]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


rf_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', RandomForestClassifier(max_depth=7, n_estimators=1000, random_state=42))
])
rf_pipe.fit(X_train, y_train)


rf_training_score = roc_auc_score(y_train.values, rf_pipe.predict_proba(X_train)[:, 1])
rf_validation_score = roc_auc_score(y_val.values, rf_pipe.predict_proba(X_val)[:, 1])
print(rf_training_score)
rf_validation_score

0.7421677058153777


0.7034130871354967

### Round 3

In [49]:
r = permutation_importance(
    rf_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

0


[]

In [50]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

8


['DIQ170',
 'DIQ260U',
 'HUQ071',
 'HUQ090',
 'HSQ520',
 'DBQ700',
 'DBQ330',
 'DUQ250']

In [51]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


rf_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', RandomForestClassifier(max_depth=7, n_estimators=1000, random_state=42))
])
rf_pipe.fit(X_train, y_train)


rf_training_score = roc_auc_score(y_train.values, rf_pipe.predict_proba(X_train)[:, 1])
rf_validation_score = roc_auc_score(y_val.values, rf_pipe.predict_proba(X_val)[:, 1])
print(rf_training_score)
rf_validation_score

0.7421677058153777


0.7034130871354967

### Round 4

In [52]:
r = permutation_importance(
    rf_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

0


[]

In [53]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

8


['DIQ170',
 'DIQ260U',
 'HUQ071',
 'HUQ090',
 'HSQ520',
 'DBQ700',
 'DBQ330',
 'DUQ250']

In [54]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


rf_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', RandomForestClassifier(max_depth=7, n_estimators=1000, random_state=42))
])
rf_pipe.fit(X_train, y_train)


rf_training_score = roc_auc_score(y_train.values, rf_pipe.predict_proba(X_train)[:, 1])
rf_validation_score = roc_auc_score(y_val.values, rf_pipe.predict_proba(X_val)[:, 1])
print(rf_training_score)
rf_validation_score

0.7421677058153777


0.7034130871354967