In [127]:
# import libraries 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import warnings
# NOTE(review): with no category or module argument this silences every
# warning for the rest of the session, including scikit-learn's data-shape
# and convergence warnings. Consider narrowing the filter.
warnings.filterwarnings("ignore")

In [128]:
# Load the train / validation / test splits.
# The same set of columns is removed from every feature matrix, so the list
# is written once and shared.
_DROPPED_COLUMNS = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'diag_1', 'diag_2', 'diag_3']

X_train = pd.read_csv('X_train.csv').drop(columns=_DROPPED_COLUMNS)
X_val = pd.read_csv('X_val.csv').drop(columns=_DROPPED_COLUMNS)
X_test = pd.read_csv('X_test.csv').drop(columns=_DROPPED_COLUMNS)

# Targets are read from CSV and flattened to 1-D arrays.
y_train = np.ravel(pd.read_csv('y_train.csv'))
y_val = np.ravel(pd.read_csv('y_val.csv'))
y_test = np.ravel(pd.read_csv('y_test.csv'))

In [129]:
# Baseline: RandomForest with default hyperparameters, fitted on the training
# split and scored on both the validation and the test split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, balanced_accuracy_score, recall_score, precision_score

# A fixed seed makes the bootstrap samples and feature sub-sampling
# repeatable, so the printed baseline metrics can be reproduced on re-run.
rfc_baseline = RandomForestClassifier(random_state=42)

# fit model to training data
rfc_baseline.fit(X_train, y_train)

# predict on validation and test set
y_pred_val = rfc_baseline.predict(X_val)
y_pred_test = rfc_baseline.predict(X_test)

# evaluate on validation set
print('Validation balanced accuracy:', balanced_accuracy_score(y_val, y_pred_val))
print('Validation recall:', recall_score(y_val, y_pred_val))
print('Validation precision:', precision_score(y_val, y_pred_val))
print('Validation f1:', f1_score(y_val, y_pred_val))

print()

# evaluate on test set
print('Test balanced accuracy:', balanced_accuracy_score(y_test, y_pred_test))
print('Test recall:', recall_score(y_test, y_pred_test))
print('Test precision:', precision_score(y_test, y_pred_test))
print('Test f1:', f1_score(y_test, y_pred_test))

Validation balanced accuracy: 0.6033963190167659
Validation recall: 0.5109275053304904
Validaton precision: 0.5894833948339483
Validation f1: 0.547401484865791

Test balanced accuracy: 0.6014136869351274
Test recall: 0.5073019933908965
Test precision: 0.5875308641975309
Test f1: 0.5444768605914994


In [116]:
# Boruta feature selection wrapped around a RandomForest estimator.
from boruta import BorutaPy

# random_state is fixed so that the confirmed / tentative / rejected sets and
# the rankings (which later drive the column filter) are repeatable.
# NOTE(review): max_iter = 10 is short; features still undecided after the
# last iteration are left as "tentative".
feat_selector = BorutaPy(
    estimator = RandomForestClassifier(),
    n_estimators = 'auto',
    max_iter = 10,
    random_state = 42,
    verbose = 2)

# BorutaPy is given plain arrays here rather than the DataFrame.
feat_selector.fit(np.array(X_train), np.array(y_train))

Iteration: 	1 / 10
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	2 / 10
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	3 / 10
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	4 / 10
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	5 / 10
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	6 / 10
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	7 / 10
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	8 / 10
Confirmed: 	0
Tentative: 	4
Rejected: 	38
Iteration: 	9 / 10
Confirmed: 	3
Tentative: 	1
Rejected: 	38


BorutaPy finished running.

Iteration: 	10 / 10
Confirmed: 	3
Tentative: 	1
Rejected: 	38


BorutaPy(estimator=RandomForestClassifier(n_estimators=28,
                                          random_state=RandomState(MT19937) at 0x104E41440),
         max_iter=10, n_estimators='auto',
         random_state=RandomState(MT19937) at 0x104E41440, verbose=2)

In [117]:
# Report, column by column, whether Boruta confirmed the feature and the
# rank it assigned.
print('-----Feature Selection-----')
for idx, accepted in enumerate(feat_selector.support_):
    verdict = 'Passes the test:' if accepted else 'Does not pass the test:'
    print(verdict, X_train.columns[idx], '- Ranking:', feat_selector.ranking_[idx])

-----Feature Selection-----
Does not pass the test: age - Ranking: 7
Does not pass the test: race - Ranking: 15
Does not pass the test: gender - Ranking: 17
Does not pass the test: admission_type_id - Ranking: 11
Does not pass the test: discharge_disposition_id - Ranking: 5
Does not pass the test: admission_source_id - Ranking: 10
Passes the test: diag_1 - Ranking: 1
Passes the test: diag_2 - Ranking: 1
Passes the test: diag_3 - Ranking: 1
Does not pass the test: max_glu_serum - Ranking: 24
Does not pass the test: A1Cresult - Ranking: 16
Does not pass the test: metformin - Ranking: 18
Does not pass the test: repaglinide - Ranking: 27
Does not pass the test: nateglinide - Ranking: 29
Does not pass the test: chlorpropamide - Ranking: 31
Does not pass the test: glimepiride - Ranking: 26
Does not pass the test: acetohexamide - Ranking: 39
Does not pass the test: glipizide - Ranking: 19
Does not pass the test: glyburide - Ranking: 21
Does not pass the test: tolbutamide - Ranking: 34
Does no

In [120]:
# Keep every column whose Boruta rank is below 15 (a looser cut than the
# confirmed-only set exposed by support_).
cols_to_keep = [
    X_train.columns[position]
    for position, rank in enumerate(feat_selector.ranking_)
    if rank < 15
]

In [121]:
# Re-fit the default RandomForest on the reduced feature set and score it on
# the validation split only.
X_train_filtered = X_train[cols_to_keep]
X_val_filtered = X_val[cols_to_keep]

# Seeded so the comparison against the unfiltered baseline is repeatable.
rfc_baseline = RandomForestClassifier(random_state=42)

# fit model to training data
rfc_baseline.fit(X_train_filtered, y_train)

# predict on validation set
y_pred_val = rfc_baseline.predict(X_val_filtered)

# evaluate on validation set
print('Validation balanced accuracy:', balanced_accuracy_score(y_val, y_pred_val))
print('Validation recall:', recall_score(y_val, y_pred_val))
print('Validation precision:', precision_score(y_val, y_pred_val))
print('Validation f1:', f1_score(y_val, y_pred_val))

Validation balanced accuracy: 0.6254039460080036
Validation recall: 0.5551705756929638
Validaton precision: 0.6092424685580579
Validation f1: 0.580951052851764


In [122]:
# Random search over RandomForest hyperparameters, scored by F1 on the
# validation split. params[k] and val_scores[k] describe the same trial.
import random

# A dedicated seeded generator keeps the sampled configurations repeatable
# without touching the global `random` state.
search_rng = random.Random(42)

val_scores = []
params = []
for i in range(200):

    # lightweight progress indicator
    print(i, end = '->')

    p = {
        'max_depth': search_rng.randint(3, 25),
        'max_features': search_rng.uniform(.4, .9),
        'max_samples': search_rng.uniform(.4, .9),
        'n_estimators': search_rng.randint(10, 150)
    }

    # Seed each forest too, so a given configuration always yields the same
    # validation score.
    model = RandomForestClassifier(random_state = 42, **p)

    model.fit(X_train_filtered, y_train)

    y_pred_val = model.predict(X_val_filtered)

    params.append(p)
    val_scores.append(f1_score(y_val, y_pred_val))

0->1->2->3->4->5->6->7->8->9->10->11->12->13->14->15->16->17->18->19->20->21->22->23->24->25->26->27->28->29->30->31->32->33->34->35->36->37->38->39->40->41->42->43->44->45->46->47->48->49->50->51->52->53->54->55->56->57->58->59->60->

KeyboardInterrupt: 

In [112]:
# Best validation F1 among the random-search trials recorded in val_scores.
max(val_scores)

0.5171397549247713

In [80]:
# Exhaustive search over a small max_depth x max_features grid, scored by
# macro-averaged F1 on the validation split.
# ParameterGrid is imported here because nothing else in the notebook brings
# it into scope; without it this cell fails with NameError on a fresh kernel.
from sklearn.model_selection import ParameterGrid

param_grid = {
    'max_depth': [15, 18, 21, 24],
    'max_features': [0.6, 0.65, 0.7]
}

search_grid = ParameterGrid(param_grid)

# val_scores_new[k] is the score of the k-th grid point, in search_grid order.
val_scores_new = []
for x in search_grid:

    # Seeded so that score differences reflect the hyperparameters rather
    # than run-to-run randomness of the forest.
    model = RandomForestClassifier(max_depth = x['max_depth'], max_features = x['max_features'], random_state = 42)

    model.fit(X_train_filtered, y_train)

    y_pred_val = model.predict(X_val_filtered)

    val_scores_new.append(f1_score(y_val, y_pred_val, average = 'macro'))

In [96]:
# Best macro-F1 found by the grid search. The grid-search loop stores its
# scores in val_scores_new; val_scores belongs to the earlier random search.
max(val_scores_new)

0.38632224967918494

In [82]:
# Look up the grid point with the highest validation macro-F1; relies on
# val_scores_new being in the same order as search_grid.
search_grid[np.argmax(val_scores_new)]

{'max_features': 0.7, 'max_depth': 21}

In [84]:
# Load the encoded development and final test sets and restrict the feature
# matrices to the columns selected earlier.
X_dev = pd.read_csv('X_dev_encoded_final.csv')
X_test_final = pd.read_csv('X_test_encoded_final.csv')

X_dev_filtered = X_dev[cols_to_keep]
X_test_final_filtered = X_test_final[cols_to_keep]

# Flatten the targets to 1-D arrays, as is done for the train / val / test
# targets, so estimators and metrics receive a 1-D y rather than a
# single-column DataFrame.
y_dev = np.ravel(pd.read_csv('y_dev_encoded_final.csv'))
y_test_final = np.ravel(pd.read_csv('y_test_encoded_final.csv'))

In [87]:
# Final model: RandomForest with the selected hyperparameters, fitted on the
# development set and scored once on the final test set.
# Seeded so the reported test metrics can be reproduced.
rfc_final = RandomForestClassifier(max_features = 0.7, max_depth = 21, random_state = 42)

# fit model to development data
rfc_final.fit(X_dev_filtered, y_dev)

# predict on the final test set
y_pred_test = rfc_final.predict(X_test_final_filtered)

# evaluate on the final test set (labels say "Test" because these are
# test-set metrics, not validation metrics)
print('Test balanced accuracy:', balanced_accuracy_score(y_test_final, y_pred_test))
print('Test macro recall:', recall_score(y_test_final, y_pred_test, average = 'macro'))
print('Test macro precision:', precision_score(y_test_final, y_pred_test, average = 'macro'))
print('Test macro f1:', f1_score(y_test_final, y_pred_test, average = 'macro'))

Validation balanced accuracy: 0.40799575740560473
Validation macro recall: 0.40799575740560473
Validaton macro precision: 0.47407378111887083
Validation macro f1: 0.39582281481538445


In [98]:
# Display the feature names retained by the rank threshold.
cols_to_keep

['age',
 'time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_inpatient',
 'number_diagnoses',
 'admission_type_id_1',
 'discharge_disposition_id_1',
 'diag_1_1',
 'diag_2_1',
 'diag_3_1',
 'admission_type_id_2',
 'discharge_disposition_id_2',
 'admission_source_id_2',
 'diag_1_2',
 'diag_2_2',
 'diag_3_2']