In [41]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
np.random.seed(2017) # set random seed value to get reproducible results
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
from sklearn.utils import resample
import time

In [42]:
# Record the wall-clock start time (seconds since the epoch, as returned by
# time.time()) so the execution time of the entire script can be computed later.
start_time = time.time()

In [43]:
# Name of the dataset to analyse; it doubles as the stem of the CSV file name.
# Uncomment the alternative below to run the same notebook on the other file.
grouping = 'stimulants'
#grouping = 'opioids'

# The first CSV column is used as the row index rather than as a feature.
csv_file = grouping + '.csv'
df = pd.read_csv(csv_file, index_col=[0])

# Quick sanity check: (rows, columns) followed by a preview of the first rows.
print(df.shape)
df.head()

(4603, 32)


Unnamed: 0,engage30,init,female_cd,nonwhite_cd,unemplmt_cd,prsatx_cd,TRIg_0_cd,TMIg_0_cd,SESg_0_cd,gvsg_cd,tsd_0_cd,und15_cd,CWSg_0_cd,srprobg_cd,dldiag_cd,dssg_0_cd,epsg_0_cd,adhdg_0_cd,cdsg_0_cd,suicprbs_0_cd,cjsig_0_cd,lrig_0_cd,srig_0_cd,homeless_0_cd,S6_cd,gcsg_0_cd,ncar_cd,SFSg_0_cd,Raceg4_cd_gr_1,Raceg4_cd_gr_2,Raceg4_cd_gr_3,Raceg4_cd_gr_4
0,0,0,0,0,1,1,0,1,0,2,1,1,0,1,1,2,1,2,1,0,1,1,1,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,1,0,1,2,1,1,0,1,1,1,1,0,0,0
9,1,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0
11,0,0,0,0,0,0,1,1,0,2,0,1,0,1,1,1,2,2,1,0,2,1,1,0,0,0,0,0,1,0,0,0
16,0,0,0,0,0,0,0,1,0,2,1,1,0,1,1,2,1,2,2,0,2,2,2,1,1,2,1,0,1,0,0,0


In [44]:
# Pull the two outcome columns out of the frame as separate target series.
init = df['init']
engage30 = df['engage30']

# Everything that is not an outcome column forms the feature matrix.
data = df.drop(columns=['init', 'engage30'])

In [None]:
%%time

outcome = 'engage30'
X_train, X_test, y_train, y_test = train_test_split(data, engage30, test_size=0.25, random_state=2017)

# combine them back for resampling
train_data = pd.concat([X_train, y_train], axis=1)
# separate minority and majority classes
negative = train_data[train_data[outcome] == 0]
positive = train_data[train_data[outcome] == 1]

# upsample minority
pos_upsampled = resample(positive, 
                           replace=True, # sample with replacement
                           n_samples=len(negative)) # match number in minority class

# combine majority and upsampled minority
upsampled = pd.concat([negative, pos_upsampled])
X_train = upsampled.drop(columns=[outcome])
y_train = upsampled[outcome]

model = RandomForestClassifier(random_state=2017)
sfs = SequentialFeatureSelector(model, k_features=X_train.shape[1], forward=True, cv=5, verbose=2, scoring='roc_auc')
sfs.fit(X_train, y_train)

fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')

plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   33.5s finished

[2020-12-04 10:57:54] Features: 1/30 -- score: 0.5592174865470839[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


In [None]:
# model = RandomForestClassifier(random_state=2017)
# sfs = SequentialFeatureSelector(model, k_features=16, forward=True, cv=5, scoring='roc_auc')
# sfs.fit(X_train, y_train)

# sfs.k_feature_names_ # test to check order!!!

In [None]:
%%time

outcome = 'init'
X_train, X_test, y_train, y_test = train_test_split(data, init, test_size=0.25, random_state=2017)

# combine them back for resampling
train_data = pd.concat([X_train, y_train], axis=1)
# separate minority and majority classes
negative = train_data[train_data[outcome] == 0]
positive = train_data[train_data[outcome] == 1]

# upsample minority
pos_upsampled = resample(positive, 
                           replace=True, # sample with replacement
                           n_samples=len(negative)) # match number in minority class

# combine majority and upsampled minority
upsampled = pd.concat([negative, pos_upsampled])
X_train = upsampled.drop(columns=[outcome])
y_train = upsampled[outcome]

model = RandomForestClassifier(random_state=2017)
sfs = SequentialFeatureSelector(model, k_features=X_train.shape[1], forward=True, cv=5, verbose=2, scoring='roc_auc')
sfs.fit(X_train, y_train)

fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')

plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()

In [None]:
# model = RandomForestClassifier(random_state=2017)
# sfs = SequentialFeatureSelector(model, k_features=12, forward=True, cv=5, scoring='roc_auc')
# sfs.fit(X_train, y_train)

# sfs.k_feature_names_ # test to check order!!!