In [4]:
# Remember when the notebook began running; the last cell subtracts this
# from the current time to report the total execution time.
import time

start_time = time.time()

In [5]:
import numpy as np

# Seed NumPy's global random generator with a fixed value so that results
# are reproducible from run to run.
np.random.seed(2017)

In [6]:
import csv
import pandas as pd

# Let pandas show up to 500 rows and 500 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# Load the full data set; the path is relative to the notebook's working
# directory.
df = pd.read_csv('../data/data_superset.csv')
df.head()

FileNotFoundError: [Errno 2] File b'../data/data_superset.csv' does not exist: b'../data/data_superset.csv'

In [None]:
# Columns that are removed before modelling.
cols_to_drop = [
    'Address', 'lat', 'lng', 'xobsyr_0',
    'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1',
    'ID', 'State', 'City', 'agyaddr', 'state_name', 'gran',
    'county_FIPS', 'block_FIPS', 'point', 'closest',
]

# Remove those columns, then discard every row that still has a null value.
df = df.drop(columns=cols_to_drop).dropna()

# uncomment to get CONTROL statistics
#cols_to_drop = ['pop_deng','%_dropoutg','%_unemployedg','%_public_assistanceg','%_povertyg','murder_numg']
#df.drop(columns=cols_to_drop, inplace=True)

# Cast every remaining column to int, then shuffle the rows and renumber
# the index from zero.
df = df.astype(int).sample(frac=1).reset_index(drop=True)
df.shape

In [None]:
from sksurv.util import Surv

predictor_var = 'Illicit_Days'
censoring_var = 'Illicit_Cens'

# Y holds the two outcome columns; X is every other column of df.
Y = df[[censoring_var, predictor_var]]
X = df.drop(columns=[censoring_var, predictor_var])

# Structured array to ensure correct censoring.
# NOTE(review): Surv.from_arrays takes the event indicator first and the
# time second - confirm that Illicit_Cens marks observed events (not
# censored rows) with 1.
y = Surv.from_arrays(Y[censoring_var], Y[predictor_var])

print(X.shape, y.shape)

In [None]:
from sksurv.ensemble import RandomSurvivalForest
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
# Bold, 14-point text for all figures drawn after this point.
plt.rcParams.update({"font.weight": "bold", "font.size": 14})

def forward_feature_selection(rsf, X, y, cv=5):
    """Greedy forward feature selection scored by cross-validation.

    Starting from an empty feature set, each round evaluates every column
    that has not been selected yet by cross-validating ``rsf`` on the
    already-selected columns plus that candidate. The candidate with the
    highest mean ``test_score`` is added permanently. Rounds continue until
    every column of ``X`` has been selected; the selection order is then
    printed and the best score of each round is plotted.

    Parameters
    ----------
    rsf : estimator
        Passed unchanged to ``cross_validate``.
    X : pandas.DataFrame
        Candidate features, one per column (column names are assumed to be
        unique).
    y : array-like
        Target passed unchanged to ``cross_validate``.
    cv : int, default 5
        Forwarded to ``cross_validate`` as its ``cv`` argument.

    Returns
    -------
    tuple of (list, list)
        Column names in the order they were selected, and the best mean
        test score obtained in each round.

    Raises
    ------
    ValueError
        If no candidate in a round produces a comparable (non-NaN) score.
    """
    remaining = list(X.columns)
    selected_features = []
    scores = []

    for _ in tqdm(range(len(remaining))):
        # Start below any real score so the first valid candidate always
        # becomes the provisional best.
        best_score = float('-inf')
        next_feat = None
        for feat in remaining:
            candidate = selected_features + [feat]
            cv_result = cross_validate(rsf, X[candidate], y, cv=cv)
            temp_score = cv_result['test_score'].mean()
            # Track the running maximum; comparing against a value that is
            # never updated would simply keep the last candidate tried.
            if temp_score > best_score:
                best_score = temp_score
                next_feat = feat
        if next_feat is None:
            raise ValueError(
                'No candidate feature produced a valid cross-validation score'
            )
        #print('Added Feature:', next_feat)
        selected_features.append(next_feat)
        remaining.remove(next_feat)
        scores.append(best_score)

    print('Ordering of Features:', selected_features)

    plt.xlabel('Feature')
    plt.ylabel('score')
    plt.plot(selected_features, scores)

    return selected_features, scores

In [None]:
# Forward selection using a random survival forest with default settings.
rsf = RandomSurvivalForest()
forward_feature_selection(rsf, X, y);

In [None]:
from sksurv.ensemble import RandomSurvivalForest
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
# Bold, 14-point text for all figures drawn after this point.
plt.rcParams.update({"font.weight": "bold", "font.size": 14})

def backward_feature_selection(rsf, X, y, cv=5):
    """Greedy backward feature elimination scored by cross-validation.

    Starting from all columns of ``X``, each round evaluates every column
    still in the model by cross-validating ``rsf`` without it. The column
    whose removal gives the highest mean ``test_score`` is dropped
    permanently. Elimination stops while one column is still left, because
    a model cannot be fitted on zero features; that surviving column is the
    one missing from the returned removal order. The removal order is then
    printed and the best score of each round is plotted.

    Parameters
    ----------
    rsf : estimator
        Passed unchanged to ``cross_validate``.
    X : pandas.DataFrame
        Candidate features, one per column (column names are assumed to be
        unique).
    y : array-like
        Target passed unchanged to ``cross_validate``.
    cv : int, default 5
        Forwarded to ``cross_validate`` as its ``cv`` argument.

    Returns
    -------
    tuple of (list, list)
        Column names in the order they were removed, and the best mean test
        score obtained in each round (the score after that removal).

    Raises
    ------
    ValueError
        If no candidate in a round produces a comparable (non-NaN) score.
    """
    remaining = list(X.columns)
    removed_features = []
    scores = []

    # One round fewer than the number of columns: the last column is never
    # removed, so every cross-validation sees at least one feature.
    for _ in tqdm(range(max(len(remaining) - 1, 0))):
        # Start below any real score so the first valid candidate always
        # becomes the provisional best.
        best_score = float('-inf')
        next_feat = None
        for feat in remaining:
            kept = [col for col in remaining if col != feat]
            cv_result = cross_validate(rsf, X[kept], y, cv=cv)
            temp_score = cv_result['test_score'].mean()
            # Track the running maximum; comparing against a value that is
            # never updated would simply keep the last candidate tried.
            if temp_score > best_score:
                best_score = temp_score
                next_feat = feat
        if next_feat is None:
            raise ValueError(
                'No candidate feature produced a valid cross-validation score'
            )
        #print('Removed Feature:', next_feat)
        removed_features.append(next_feat)
        remaining.remove(next_feat)
        scores.append(best_score)

    print('Ordering of Features:', removed_features)

    plt.xlabel('Feature')
    plt.ylabel('score')
    plt.plot(removed_features, scores)

    return removed_features, scores

In [None]:
# Backward elimination using a fresh random survival forest with default
# settings.
rsf = RandomSurvivalForest()
backward_feature_selection(rsf, X, y);

In [None]:
# Report how long the whole notebook took, measured from start_time
# (set in the first cell), as whole minutes and leftover seconds.
minutes, seconds = divmod(int(time.time() - start_time), 60)
total_seconds = minutes * 60 + seconds
print("--- " + str(minutes) + " minutes " + str(seconds) + " seconds ---")