In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import glob
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import RandomizedSearchCV
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import sys
from sklearn.metrics import precision_recall_curve, recall_score, log_loss

In [11]:
# Load the processed feature tables for the train and test sets.
train = pd.read_csv('../data/Processed/train.csv')
test = pd.read_csv('../data/Processed/test.csv')

# Only the target column is read from the raw labels file.
# NOTE(review): labels are paired with `train` purely by row position --
# confirm that train_labels.csv and Processed/train.csv share the same row order.
y_train = pd.read_csv('../data/raw/train_labels.csv', usecols=['heart_disease_present'])

# Feature matrices: every column except the patient identifier.
X_train = train.loc[:, train.columns != 'patient_id']
X_test = test.loc[:, test.columns != 'patient_id']

In [12]:
# Preview the first rows of the processed training table.
train.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,...,age,max_heart_rate_achieved,exercise_induced_angina,count0,count1,count2,count3,count4,presure_p_rate,cholesterol_p_age
0,0z64un,0.0,0.204082,0.395349,0.333333,0.0,0.0,1.0,0.415525,0.0,...,0.333333,0.698113,0.0,1.0,0.75,1.0,0.508475,0.333333,0.573669,0.249247
1,ryoo3j,0.5,0.204082,0.186047,0.666667,0.0,0.0,0.0,0.200913,0.258065,...,0.520833,0.584906,0.0,0.537634,0.680556,0.597938,0.864407,0.416667,0.328291,0.176958
2,yt1s1x,0.0,0.204082,0.360465,1.0,1.0,0.0,1.0,0.406393,0.0,...,1.0,0.622642,1.0,1.0,0.75,0.175258,0.186441,0.0,0.489076,0.54882
3,l2xjde,0.0,0.756757,0.674419,1.0,0.0,0.0,0.0,0.221461,0.0,...,0.229167,0.801887,0.0,0.354839,1.0,1.0,1.0,0.166667,0.89591,0.094252
4,oyt4ek,1.0,0.756757,0.976744,0.0,0.0,0.0,1.0,0.328767,0.677419,...,0.625,0.462264,0.0,0.096774,0.138889,0.020619,0.20339,0.0,0.80056,0.314194


In [13]:
# sel_cols = ['count0', 'count1', 'count2', 'count3', 'count4', 'presure_p_rate', 'cholesterol_p_age']
# X_train = X_train[sel_cols]
# X_test = X_test[sel_cols]

# Random Forest

In [14]:
# Identifier interpolated into the names of the stacking output files.
model_name = 'rf_1'

# Random forest hyperparameters; `random_state` is fixed so repeated runs
# build the same forest.
params = dict(
    n_estimators=1000,
    criterion='entropy',
    max_depth=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=1,
    verbose=0,
)

rf = RandomForestClassifier(**params)

In [15]:
# Label binarizer; it is re-fitted on each fold's validation labels in the
# cross-validation loop before the log loss is computed.
lb = LabelBinarizer()

In [16]:
# Stratified 5-fold cross-validation of the random forest.
#
# Results left in the namespace for the following cells:
#   y_preds -- probability of the positive class for every test row,
#              averaged over the fold models
#   stack   -- out-of-fold probability of the positive class for every
#              training patient (rows are in fold order, not file order)
train_ids = X_train.index
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

counter = 1
y_preds = np.zeros(X_test.shape[0])
be = 0              # running sum of the per-fold recall
log_loss_score = 0  # running sum of the per-fold log loss
fold_frames = []    # one out-of-fold frame per fold, concatenated once at the end
for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold {}\n'.format(counter))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    # y_fit is a one-column DataFrame; flatten it to 1-D so that fit() gets
    # the shape it expects instead of emitting a DataConversionWarning.
    rf.fit(X_fit, y_fit.values.ravel())

    # Validation probabilities are computed once and reused below.
    y_probas = rf.predict_proba(X_val)
    y_preds += rf.predict_proba(X_test)[:, 1]

    # Build a fresh frame for every fold. Reusing one frame across folds pins
    # its index to the first fold's length, so a later fold of a different
    # size would fail on column assignment. The split indices are positional,
    # hence .iloc for the patient ids.
    stack2 = pd.DataFrame(
        {
            'patient_id': train['patient_id'].iloc[test_index].tolist(),
            'heart_disease_present': y_probas[:, 1],
        },
        columns=['patient_id', 'heart_disease_present'],
    )
    fold_frames.append(stack2)

    be += recall_score(y_val['heart_disease_present'], rf.predict(X_val))
    log_loss_score += log_loss(lb.fit_transform(y_val), y_probas)

    # Release the fold-specific data before the next iteration.
    del X_fit, X_val, y_fit, y_val, train_index, test_index
    gc.collect()

    # Optional: persist the fold model.
    # joblib.dump(rf, '../saved_models/{}_{}.pkl'.format(model_name, counter))

    counter += 1

# Single concatenation instead of growing `stack` inside the loop.
stack = pd.concat(fold_frames, axis=0, ignore_index=True)

# counter ends one past the number of folds that were run.
n_folds = counter - 1
y_preds = y_preds / n_folds

print('\n\nBEST RECALL SCORE MEAN:', be / n_folds)
print('\n\nBEST LOG_LOSS SCORE MEAN:', log_loss_score / n_folds)

Fold 1



  app.launch_new_instance()


Fold 2



  app.launch_new_instance()


Fold 3



  app.launch_new_instance()


Fold 4



  app.launch_new_instance()


Fold 5



  app.launch_new_instance()




BEST RECALL SCORE MEAN: 0.775


BEST LOG_LOSS SCORE MEAN: 0.4389428634566654


In [17]:
# Preview the out-of-fold predictions collected during cross-validation.
stack.head()

Unnamed: 0,patient_id,heart_disease_present
0,0z64un,0.092259
1,yt1s1x,0.554173
2,3nwy2n,0.807424
3,1r508r,0.062726
4,cvux3j,0.183188


In [18]:
# Write the out-of-fold training predictions for the stacking stage.
oof_path = '../stacking/data/train_{}.csv'.format(model_name)
stack.to_csv(oof_path, index=False)

In [19]:
# Peek at the first five fold-averaged test probabilities.
y_preds[:5]

array([0.45883862, 0.13302086, 0.87010754, 0.23769092, 0.78363569])

In [20]:
# Pair every test patient id with its fold-averaged probability.
sub = pd.DataFrame({
    'patient_id': test['patient_id'],
    'heart_disease_present': y_preds,
})

In [21]:
# Write the test-set predictions for the stacking stage.
test_pred_path = '../stacking/data/test_{}.csv'.format(model_name)
sub.to_csv(test_pred_path, index=False)