In [1]:
# Record the wall-clock time at which the notebook starts running. The last
# cell subtracts this value from time.time() to report the total run time.
import time
start_time = time.time()

In [2]:
import numpy as np
# Fix the seed of NumPy's global random generator so that the random steps
# later in the notebook give the same results on every run.
np.random.seed(2017)

In [3]:
import pandas as pd
import csv

# Raise both display limits to 500 so large frames are shown without truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# Read the dataset from disk and preview its first rows.
df = pd.read_csv('../data/data_superset.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,ID,State,City,agyaddr,xobsyr_0,Illicit_Days,Illicit_Cens,female,nonwhite,unemplmt_cd,prsatx_cd,gvsg_cd,CWSg_0_cd,dssg_0_cd,epsg_0_cd,adhdg_0_cd,cdsg_0_cd,cjsig_0_cd,lrig_0_cd,srig_0_cd,SESg_0_cd,r4ag_0_cd,SUDSy_0_cd,homeless_0_cd,ncar_cd,TRIg_0_cd,PYS9Sxg_cd,primsev_other,primsev_alcohol,primsev_amphetamines,primsev_cocaine,primsev_marijuana,primsev_opioids,B2a_0g,Address,lat,lng,state_name,county_FIPS,block_FIPS,murder_numg,%_dropoutg,%_povertyg,%_public_assistanceg,%_unemployedg,closest,gran,point,pop_deng
0,0,4,6,21058,ID,Boise,8620 W. Emerald Street,2011,365,0,0,1,0,1,2,0,2,1,2,1,2,2,2,1,1,3,1,1,0,2,0,0,1,0,0,0,2,"8620 W. Emerald Street , B...",43.611373,-116.288882,Idaho,16001.0,160010000000000.0,0,0.0,0.0,0.0,0.0,"('39.523981', '-119.787921')",2.0,"('43.611373', '-116.288882')",0.0
1,1,5,7,12534,MI,Detroit,"1025 E. Forrest, Rm 408",2009,203,0,0,1,0,0,2,0,0,0,1,2,1,2,2,1,2,3,0,1,2,0,0,0,0,0,1,0,1,"1025 E. Forrest, Rm 408 , D...",42.359198,-83.051939,Michigan,26163.0,261635200000000.0,2,0.0,0.0,0.0,0.0,,0.0,"('42.359198', '-83.051939')",0.0
2,2,7,9,14164,CO,Thornton,8801 Lipan Street,2008,96,1,1,1,1,1,2,0,1,2,1,1,1,2,1,0,2,3,0,1,0,0,0,1,0,0,0,0,1,"8801 Lipan Street , T...",39.858039,-105.001042,Colorado,8001.0,80010090000000.0,0,0.0,0.0,0.0,0.0,,1.0,"('39.858039', '-105.001042')",0.0
3,3,8,10,18612,FL,Clearwater,315 Court Street,2009,51,1,1,0,1,0,1,0,1,1,0,0,0,2,2,0,2,1,0,1,0,0,0,1,0,0,0,0,2,"315 Court Street , C...",27.961606,-82.801244,Florida,12103.0,121030300000000.0,0,0.0,0.0,0.0,0.0,,0.0,"('27.9616065', '-82.80124359999999')",0.0
4,4,10,13,23871,WA,Seattle,401 Fifth Avenue,2012,213,0,1,0,0,1,2,0,2,1,0,0,2,2,2,1,2,3,0,0,1,0,0,0,1,0,0,0,1,"401 Fifth Avenue , S...",47.601714,-122.327658,Washington,53033.0,530330100000000.0,0,0.0,0.0,0.0,0.0,,0.0,"('47.601714', '-122.32765800000001')",0.0


In [4]:
# columns that are not needed for the analysis
cols_to_drop = [
    'Address', 'lat', 'lng', 'xobsyr_0',
    'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1',
    'ID', 'State', 'City', 'agyaddr', 'state_name',
    'gran', 'county_FIPS', 'block_FIPS', 'point', 'closest',
]

# remove those columns, then discard every row that still contains a null value
df = df.drop(columns=cols_to_drop).dropna()

# uncomment to get CONTROL statistics
#cols_to_drop = ['pop_deng','%_dropoutg','%_unemployedg','%_public_assistanceg','%_povertyg','murder_numg']
#df.drop(columns=cols_to_drop, inplace=True)

# cast every column to int, shuffle the rows, and renumber the index from 0
df = df.astype(int).sample(frac=1).reset_index(drop=True)
df.shape

(10683, 35)

In [5]:
from sksurv.util import Surv

predictor_var = 'Illicit_Days'
censoring_var = 'Illicit_Cens'
outcome_cols = [censoring_var, predictor_var]

# Separate the two outcome columns (Y) from the remaining feature columns (X).
Y = df[outcome_cols]
X = df.drop(columns=outcome_cols)

# Surv.from_arrays takes the event indicator first and the time second, and
# packs both into one structured array.
# NOTE(review): this treats Illicit_Cens == 1 as "event observed" -- confirm
# against the data dictionary.
y = Surv.from_arrays(Y[censoring_var], Y[predictor_var])

print(X.shape, y.shape)

(10683, 33) (10683,)


In [6]:
from sksurv.ensemble import RandomSurvivalForest
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
# Draw all matplotlib text in bold at size 14.
plt.rcParams.update({"font.weight": "bold", "font.size": 14})

def forward_feature_selection(rsf, X, y):
    """Greedy forward feature selection scored by 5-fold cross-validation.

    Starting with no features, each round scores every not-yet-selected column
    of ``X`` when it is added to the current selection, and permanently adds
    the column with the highest mean test score. Rounds continue until every
    column has been added. The resulting order is printed, and the winning
    score of each round is plotted against the feature added in that round.

    Parameters
    ----------
    rsf : estimator
        Model passed unchanged to ``cross_validate``; scores come from
        ``cross_validate``'s default scoring for that estimator.
    X : pandas.DataFrame
        Candidate feature columns.
    y : array-like
        Target passed unchanged to ``cross_validate``.

    Returns
    -------
    None
        Results are reported through ``print`` and the current matplotlib
        figure only.
    """
    remaining = list(X.columns)
    selected_features = []
    scores = []

    for _ in tqdm(range(X.shape[1])):
        # Start below any real score so the first valid candidate always wins,
        # even when every score is zero or negative.
        best_score = float('-inf')
        next_feat = None

        for feat in remaining:
            candidate = selected_features + [feat]
            cv_result = cross_validate(rsf, X[candidate], y, cv=5)
            temp_score = cv_result['test_score'].mean()
            if temp_score > best_score:
                # Track the running maximum so the best candidate of the
                # round is kept, not merely the last one that was tried.
                best_score = temp_score
                next_feat = feat

        if next_feat is None:
            # Every comparison failed (e.g. all scores were NaN); keep what
            # has been found so far instead of crashing.
            print('Stopping early: no remaining feature produced a valid score.')
            break

        #print('Added Feature:', next_feat)
        selected_features.append(next_feat)
        remaining.remove(next_feat)
        scores.append(best_score)

    print('Ordering of Features:', selected_features)

    plt.xlabel('Feature')
    plt.ylabel('score')
    plt.plot(selected_features, scores)

  return f(*args, **kwds)


In [None]:
# Run forward selection on all feature columns, using a RandomSurvivalForest
# created with its default settings.
rsf = RandomSurvivalForest()
forward_feature_selection(rsf, X, y)

HBox(children=(IntProgress(value=0, max=33), HTML(value='')))

In [None]:
from sksurv.ensemble import RandomSurvivalForest
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
# Draw all matplotlib text in bold at size 14.
plt.rcParams.update({"font.weight": "bold", "font.size": 14})

def backward_feature_selection(rsf, X, y):
    """Greedy backward feature elimination scored by 5-fold cross-validation.

    Starting with every column of ``X``, each round scores the model obtained
    by leaving out one more column, and permanently removes the column whose
    removal gives the highest mean test score. Rounds stop while one column is
    still left, because a model cannot be fitted on zero features. The removal
    order is printed, and the winning score of each round is plotted against
    the feature removed in that round.

    Parameters
    ----------
    rsf : estimator
        Model passed unchanged to ``cross_validate``; scores come from
        ``cross_validate``'s default scoring for that estimator.
    X : pandas.DataFrame
        Candidate feature columns.
    y : array-like
        Target passed unchanged to ``cross_validate``.

    Returns
    -------
    None
        Results are reported through ``print`` and the current matplotlib
        figure only.
    """
    remaining = list(X.columns)
    removed_features = []
    scores = []

    # One round fewer than the number of columns: the last column is never
    # removed, so every scored model has at least one feature.
    for _ in tqdm(range(max(X.shape[1] - 1, 0))):
        # Start below any real score so the first valid candidate always wins,
        # even when every score is zero or negative.
        best_score = float('-inf')
        next_feat = None

        for feat in remaining:
            temp_X = X.drop(columns=removed_features + [feat])
            cv_result = cross_validate(rsf, temp_X, y, cv=5)
            temp_score = cv_result['test_score'].mean()
            if temp_score > best_score:
                # Track the running maximum so the best candidate of the
                # round is kept, not merely the last one that was tried.
                best_score = temp_score
                next_feat = feat

        if next_feat is None:
            # Every comparison failed (e.g. all scores were NaN); keep what
            # has been found so far instead of crashing.
            print('Stopping early: no remaining feature produced a valid score.')
            break

        #print('Removed Feature:', next_feat)
        removed_features.append(next_feat)
        remaining.remove(next_feat)
        scores.append(best_score)

    print('Ordering of Features:', removed_features)
    print('Features Not Removed:', remaining)

    plt.xlabel('Feature')
    plt.ylabel('score')
    plt.plot(removed_features, scores)

In [None]:
# Run backward selection on all feature columns, using a fresh
# RandomSurvivalForest created with its default settings.
rsf = RandomSurvivalForest()
backward_feature_selection(rsf, X, y)

In [None]:
# report how long the whole notebook took, split into whole minutes and seconds
total_seconds = int(time.time() - start_time)
minutes, seconds = divmod(total_seconds, 60)
print(f"--- {minutes} minutes {seconds} seconds ---")