In [1]:
import pandas as pd
import numpy as np

from  sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance

In [2]:
all_data_df = pd.read_csv('all_data_df.csv')
all_data_df.index = all_data_df['SEQN']

mental_health_df = all_data_df.loc[:, 'DPQ010':'DPQ100'].dropna(how='all')
all_data_df = all_data_df.loc[mental_health_df.index]

def mh(x):
    if x == '\.':
        return 'missing'
    elif x == 1:
        return 'several days'
    elif x == 2:
        return 'more than half the days'
    elif x == 3:
        return 'nearly every day'
    elif x == 7:
        return 'refused'
    elif x == 9:
        return "don't know"
    else:
        return 'not at all'

for col in mental_health_df.columns:
    mental_health_df[col] = mental_health_df[col].apply(lambda x: mh(x))
    
def calc(row):
    sum = 0
    for i in ['DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 
              'DPQ050', 'DPQ060', 'DPQ070','DPQ080', 
              'DPQ090', 'DPQ100']:
        if row[i] == 'several days':
            sum += 1
        if row[i] == 'more than half the days':
            sum += 2
        if row[i] == 'nearly every day':
            sum += 3
    return sum

mental_health_df['labels_raw'] = mental_health_df.apply(calc, axis=1)
mental_health_df['labels'] = mental_health_df['labels_raw'].apply(lambda x: 1 if x >= 10 else 0)

mental_health_df.to_csv('mental_health_df.csv')
mental_health_df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,DPQ010,DPQ020,DPQ030,DPQ040,DPQ050,DPQ060,DPQ070,DPQ080,DPQ090,DPQ100,labels_raw,labels
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
62161.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
62169.0,several days,not at all,several days,several days,several days,more than half the days,not at all,several days,not at all,not at all,7,0
62172.0,several days,more than half the days,several days,several days,several days,not at all,not at all,not at all,not at all,several days,7,0
62174.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
62176.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
93691.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
93695.0,not at all,several days,not at all,not at all,not at all,not at all,not at all,not at all,not at all,more than half the days,3,0
93696.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0
93697.0,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,not at all,0,0


In [4]:
features = [
    #diabetes
    'DIQ010','DID040','DIQ160','DIQ170','DIQ172','DIQ175A','DIQ175B','DIQ175C',
    'DIQ175D','DIQ175E','DIQ175F','DIQ175G','DIQ175H','DIQ175I','DIQ175J','DIQ175K',
    'DIQ175L', 'DIQ175M','DIQ175N','DIQ175O','DIQ175P','DIQ175Q','DIQ175R','DIQ175S',
    'DIQ175T','DIQ175U','DIQ175V','DIQ175W','DIQ180','DIQ050','DID060','DIQ060U',
    'DIQ070','DIQ230','DIQ240','DID250','DID260','DIQ260U','DIQ275','DIQ280','DIQ291',
    'DIQ300S','DIQ300D','DID310S','DID310D','DID320','DID330','DID341','DID350',
    'DIQ350U','DIQ360','DIQ080', 
    #sleep disorder
    'SLQ050', 
    #physical activity
    'PAQ605','PAQ610','PAD615','PAQ620','PAQ625','PAD630','PAQ635','PAQ640','PAD645',
    'PAQ650','PAQ655','PAD660','PAQ665','PAQ670','PAD675','PAD680','PAQ706','PAQ710',
    'PAQ715', 
    #weight history
    'WHD010','WHD020','WHQ030','WHQ040','WHD050','WHQ060','WHQ070','WHD080A','WHD080B',
    'WHD080C','WHD080D','WHD080E','WHD080F','WHD080G','WHD080H','WHD080I','WHD080J',
    'WHD080K','WHD080M','WHD080N','WHD080O','WHD080P','WHD080Q','WHD080R','WHD080S',
    'WHD080T','WHD080L','WHD110','WHD120','WHD130','WHD140','WHQ150', 
    #early childhood
    'ECD010','ECQ020','ECD070A','ECD070B','ECQ080','ECQ090','WHQ030E','MCQ080E',
    'ECQ150', 
    #alcohol issues
    'ALQ101','ALQ110','ALQ120Q','ALQ120U','ALQ130','ALQ141Q','ALQ141U','ALQ151',
    #hospital access
    'HUQ010','HUQ020','HUQ030','HUQ071','HUD080','HUQ090', 
    #health status
    'HSD010','HSQ500','HSQ510','HSQ520','HSQ571','HSQ580','HSQ590','HSAQUEX',  
    #income
    #housing
    'HOD050','HOQ065', 
    #occupation
    'OCD150','OCQ180','OCQ210','OCQ260','OCD270','OCQ380','OCD390G','OCD395', 
    #mental health
    #demographic
    #diet nutrition
    'DBQ010','DBD030','DBD041','DBD050','DBD055','DBD061','DBQ073A','DBQ073B','DBQ073C',
    'DBQ073D','DBQ073E','DBQ073U','DBQ700','DBQ197','DBQ223A','DBQ223B','DBQ223C',
    'DBQ223D','DBQ223E','DBQ223U','DBQ229','DBQ235A','DBQ235B','DBQ235C','DBQ301',
    'DBQ330','DBQ360','DBQ370','DBD381','DBQ390','DBQ400','DBD411','DBQ421','DBQ424',
    'DBD895','DBD900','DBD905','DBD910', 
    #drug use
    'DUQ200','DUQ210','DUQ211','DUQ213','DUQ215Q','DUQ215U','DUQ217','DUQ219','DUQ220Q',
    'DUQ220U','DUQ230','DUQ240','DUQ250','DUQ260','DUQ270Q','DUQ270U','DUQ272','DUQ280',
    'DUQ290','DUQ300','DUQ310Q','DUQ310U','DUQ320','DUQ330','DUQ340','DUQ350Q','DUQ350U',
    'DUQ352','DUQ360','DUQ370','DUQ380A','DUQ380B','DUQ380C','DUQ380D','DUQ380E',
    'DUQ390','DUQ400Q','DUQ400U','DUQ410','DUQ420', 
    
]

len(features)

223

In [5]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [6]:
nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('enc', OneHotEncoder(handle_unknown='ignore', sparse=False)),
                ('red', PCA(n_components=10)), ('clf', GaussianNB())])

In [7]:
nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.7645128084586452


0.762133452070479

## Feature Importance

### Round 1

In [8]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)

In [9]:
feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

feature_importances.sort_values('importance_means', ascending = False)

Unnamed: 0,importance_means,importances_std
DBQ700,0.000129,0.000258
PAD645,0.000129,0.000258
DIQ080,0.000064,0.000193
WHD020,0.000064,0.000193
HSQ520,0.000064,0.000193
...,...,...
HSD010,-0.001870,0.000787
DIQ230,-0.001934,0.000000
HUQ010,-0.002128,0.000709
OCD150,-0.002192,0.000875


In [10]:
remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

192


['DIQ010',
 'DID040',
 'DIQ160',
 'DIQ172',
 'DIQ175A',
 'DIQ175B',
 'DIQ175C',
 'DIQ175D',
 'DIQ175E',
 'DIQ175F',
 'DIQ175G',
 'DIQ175H',
 'DIQ175I',
 'DIQ175J',
 'DIQ175K',
 'DIQ175L',
 'DIQ175M',
 'DIQ175N',
 'DIQ175O',
 'DIQ175P',
 'DIQ175Q',
 'DIQ175R',
 'DIQ175S',
 'DIQ175T',
 'DIQ175U',
 'DIQ175V',
 'DIQ175W',
 'DIQ180',
 'DIQ050',
 'DID060',
 'DIQ060U',
 'DIQ070',
 'DIQ230',
 'DIQ240',
 'DID250',
 'DID260',
 'DIQ260U',
 'DIQ275',
 'DIQ280',
 'DIQ291',
 'DIQ300S',
 'DIQ300D',
 'DID310S',
 'DID310D',
 'DID320',
 'DID330',
 'DID341',
 'DID350',
 'DIQ350U',
 'DIQ360',
 'SLQ050',
 'PAQ610',
 'PAD615',
 'PAD630',
 'PAQ635',
 'PAQ650',
 'PAQ655',
 'PAD660',
 'PAQ665',
 'PAQ670',
 'PAD680',
 'PAQ706',
 'PAQ710',
 'WHQ040',
 'WHQ060',
 'WHQ070',
 'WHD080A',
 'WHD080B',
 'WHD080C',
 'WHD080D',
 'WHD080E',
 'WHD080F',
 'WHD080G',
 'WHD080H',
 'WHD080I',
 'WHD080J',
 'WHD080K',
 'WHD080M',
 'WHD080N',
 'WHD080O',
 'WHD080P',
 'WHD080Q',
 'WHD080R',
 'WHD080S',
 'WHD080T',
 'WHD080L',
 'WH

In [11]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

31


['DIQ170',
 'DIQ080',
 'PAQ605',
 'PAQ620',
 'PAQ625',
 'PAQ640',
 'PAD645',
 'PAD675',
 'PAQ715',
 'WHD010',
 'WHD020',
 'WHQ030',
 'WHD050',
 'WHD110',
 'WHD140',
 'ALQ101',
 'ALQ110',
 'ALQ120Q',
 'HUQ020',
 'HSQ500',
 'HSQ510',
 'HSQ520',
 'HOD050',
 'HOQ065',
 'OCQ260',
 'OCD390G',
 'DBQ700',
 'DBQ197',
 'DBQ235B',
 'DBD895',
 'DBD910']

In [12]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.709253251157018


0.6926540478159331

### Round 2

In [13]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

28


['DIQ170',
 'DIQ080',
 'PAQ605',
 'PAQ620',
 'PAQ625',
 'PAQ640',
 'PAD645',
 'PAD675',
 'PAQ715',
 'WHD010',
 'WHD020',
 'WHQ030',
 'WHD050',
 'WHD110',
 'WHD140',
 'ALQ101',
 'ALQ110',
 'ALQ120Q',
 'HSQ500',
 'HSQ510',
 'HSQ520',
 'HOD050',
 'HOQ065',
 'OCQ260',
 'DBQ197',
 'DBQ235B',
 'DBD895',
 'DBD910']

In [None]:
print(features)

In [14]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

3


['HUQ020', 'OCD390G', 'DBQ700']

In [None]:
# testing python to see if remove works properly
l = ['hello', 'hi', 'howdy']
print(l)
r = ['hello']
print(r)

for i in r:
    l.remove(i)

print(l)

In [15]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.7055753605927085


0.7074409116104938

### Round 3

In [16]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

1


['OCD390G']

In [17]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

2


['HUQ020', 'DBQ700']

In [18]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.6827450551602064


0.6887784310340127

### Round 4

In [19]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

1


['HUQ020']

In [20]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

1


['DBQ700']

In [23]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=5)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.6469248647159435


0.6493566385271748

### Round 5

In [24]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

0


[]

In [25]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

1


['DBQ700']

In [27]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=6)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.6469248647159435


0.6493566385271748

### Round 6

In [28]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

0


[]

In [29]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

1


['DBQ700']

In [31]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=5)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

0.6469248647159435


0.6493566385271748

## Not in Use

### Round 7

In [None]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

In [None]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

In [None]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

### Round 8

In [None]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

In [None]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

In [None]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

### Round 9

In [None]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

In [None]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

In [None]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

### Round 10

In [None]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

In [None]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

In [None]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

### Round 11

In [None]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

In [None]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

In [None]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score

### Round 12

In [None]:
r = permutation_importance(
    nb_pipe, 
    X_val, 
    y_val,
    n_repeats=10,
    n_jobs=-1,
    random_state=42
)


feature_importances = pd.DataFrame.from_dict(
    {
        'importance_means': r['importances_mean'],
        'importances_std': r['importances_std']
    }, orient='columns'
)
feature_importances.index = X_val.columns

#feature_importances.sort_values('importance_means', ascending = False)


remove = []

for feature in feature_importances[
    (feature_importances['importance_means'] <= 0) & 
    (abs(feature_importances['importance_means']) >= feature_importances['importances_std'])
].index:
    # print(feature)
    remove.append(feature)

print(len(remove))
remove

In [None]:
for feature in remove:
    features.remove(feature)

print(len(features))
features

In [None]:
X = all_data_df[features]
y = mental_health_df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


nb_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(sparse=False, handle_unknown='ignore')), 
    ('red', PCA(n_components=10)),
    ('clf', GaussianNB())
])
nb_pipe.fit(X_train, y_train)


nb_training_score = roc_auc_score(y_train.values, nb_pipe.predict_proba(X_train)[:, 1])
nb_validation_score = roc_auc_score(y_val.values, nb_pipe.predict_proba(X_val)[:, 1])
print(nb_training_score)
nb_validation_score