In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import glob
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelBinarizer
import lightgbm as lgb
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import sys
from eli5 import show_weights, show_prediction
from eli5.sklearn import PermutationImportance
from eli5.permutation_importance import get_score_importances
from sklearn.metrics import precision_recall_curve, recall_score, log_loss

In [2]:
train = pd.read_csv('../data/Processed/train_2.csv')
X_train = train.loc[:,[c for c in train.columns if c != 'patient_id']]
y_train = pd.read_csv('../data/raw/train_labels.csv', usecols=['heart_disease_present'])
test = pd.read_csv('../data/Processed/test_2.csv')
X_test = test.loc[:,[c for c in test.columns if c != 'patient_id']]

In [3]:
train.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,...,ica0,ica1,ica2,ica3,ica4,svd0,svd1,svd2,svd3,svd4
0,0z64un,0.0,0.204082,0.395349,0.333333,0.0,0.0,1.0,0.415525,0.0,...,-0.029118,-0.028298,0.081133,-0.008523,0.034247,1.998694,-0.349453,0.127289,0.189645,0.120264
1,ryoo3j,0.5,0.204082,0.186047,0.666667,0.0,0.0,0.0,0.200913,0.258065,...,-0.020315,-0.01461,-0.048453,0.070472,0.046844,1.621227,-0.101847,-0.385119,-0.096236,-0.047911
2,yt1s1x,0.0,0.204082,0.360465,1.0,1.0,0.0,1.0,0.406393,0.0,...,-0.149001,0.082324,0.045454,-0.090831,-0.090201,1.703174,0.432918,0.512625,-0.180056,0.376107
3,l2xjde,0.0,0.756757,0.674419,1.0,0.0,0.0,0.0,0.221461,0.0,...,0.21569,0.020249,0.066368,0.059257,0.089806,2.106877,0.082825,-0.398068,0.57339,-0.388193
4,oyt4ek,1.0,0.756757,0.976744,0.0,0.0,0.0,1.0,0.328767,0.677419,...,-0.033449,0.254999,0.03285,-0.074739,0.000597,1.045223,0.909018,0.098206,0.508642,0.461912


# Random Forest

In [4]:
model_name = 'rf_1'

params = {
    'n_estimators': 1000,
    'criterion': 'entropy',
    'max_depth': 10,
    'random_state': 42,
    'n_jobs': 1,
    'verbose': 0,
    'min_samples_leaf': 2,
    
}

rf = RandomForestClassifier(**params)

In [10]:
train_ids = X_train.index
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

In [11]:
counter = 1
y_preds = np.zeros(X_test.shape[0])
be = 0
for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold {}\n'.format(counter))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    rf.fit(X_fit,
                  y_fit,
#                   eval_set=[(X_val, y_val)],
#                   verbose=100,
#                   early_stopping_rounds=20
          )
    
    y_preds += rf.predict_proba(X_test)[:,1]
    be += recall_score(y_val['heart_disease_present'], rf.predict(X_val))

#     be += rf.best_score_['valid_0']['binary_logloss']

#     show_prediction(rf, X_val.iloc[1,:], show_feature_values=True)

#     del X_fit
#     del X_val
#     del y_fit
#     del y_val
#     del train_index
#     del test_index
#     gc.collect()

#     print('Guardamos el modelo')
#     joblib.dump(lgb_model, '../saved_models/{}_{}.pkl'.format(model_name, counter))

#     ft_importances += lgb_model.feature_importances_

    counter += 1

y_preds = y_preds / (counter-1)

print('\n\nBEST SCORE MEAN:', be / (counter-1))


Fold 1



  # This is added back by InteractiveShellApp.init_path()


Fold 2



  # This is added back by InteractiveShellApp.init_path()




BEST SCORE MEAN: 0.8


In [12]:
recall_score(y_val['heart_disease_present'], rf.predict(X_val))

0.825

In [14]:
# %%time
rf_refit = rf.fit(X_fit, y_fit)
perm = PermutationImportance(rf_refit).fit(X_val, y_val)
show_weights(perm, feature_names=X_val.columns.tolist())

  


Weight,Feature
0.0489  ± 0.0752,thal
0.0111  ± 0.0243,count1
0.0089  ± 0.0166,num_major_vessels
0.0022  ± 0.0218,svd0
0  ± 0.0000,exercise_induced_angina
0  ± 0.0000,resting_blood_pressure
0  ± 0.0000,count3
0  ± 0.0000,ica3
0  ± 0.0000,svd3
0  ± 0.0000,ica0


In [22]:
ft_sel = ['thal', 'count1', 'num_major_vessels', 'svd0']

In [23]:
ft_sel

['pca0', 'pca1', 'pca2', 'pca3', 'pca4']

In [24]:
X_train_sel = X_train[ft_sel]
X_test_sel = X_test[ft_sel]

# LightGBM

In [25]:
model_name = 'lgbm_4'

params = {'max_depth':5,
          'metric':'binary_logloss',
          'n_estimators':10000,
          'learning_rate':0.03,
          'colsample_bytree':1,
          'objective':'binary',
          'n_jobs': 1,
          'seed':42,
          'bagging_fraction':1,
          'lambda_l1':0,
          'lambda_l2':0,
         'seed': 42}


lgb_model = lgb.LGBMClassifier(**params)

In [26]:
train_ids = X_train_sel.index
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [27]:
counter = 1
y_preds = np.zeros(X_test_sel.shape[0])
be = 0
recall = 0
for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold {}\n'.format(counter))

    X_fit, X_val = X_train_sel.iloc[train_index, :], X_train_sel.iloc[test_index, :]
    y_fit, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val)],
                  verbose=100,
                  early_stopping_rounds=20)
    
    y_preds += lgb_model.predict_proba(X_test_sel)[:,1]
    
    be += lgb_model.best_score_['valid_0']['binary_logloss']
    
    recall += recall_score(y_val['heart_disease_present'], lgb_model.predict(X_val))


    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()

#     print('Guardamos el modelo')
#     joblib.dump(lgb_model, '../saved_models/{}_{}.pkl'.format(model_name, counter))

#     ft_importances += lgb_model.feature_importances_

    counter += 1

y_preds = y_preds / (counter-1)

print('\n\nBEST SCORE MEAN:', be / (counter-1))

print('\n\nBEST RECALL SCORE MEAN:', recall / (counter-1))


Fold 1

Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[50]	valid_0's binary_logloss: 0.499734
Fold 2

Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[27]	valid_0's binary_logloss: 0.640246
Fold 3

Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.567513
Fold 4

Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[70]	valid_0's binary_logloss: 0.489527
Fold 5



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[19]	valid_0's binary_logloss: 0.633317


BEST SCORE MEAN: 0.5660674345829786


BEST RECALL SCORE MEAN: 0.7


In [46]:
y_preds[:5]

array([0.52832547, 0.15346496, 0.81883359, 0.08180145, 0.78584551])

In [47]:
sub = pd.DataFrame({'patient_id': test['patient_id'], 'heart_disease_present': y_preds})

In [48]:
sub.to_csv('../submissions/lgbm_2.csv', index=False)

In [28]:
show_prediction(lgb_model, X_val.iloc[10,:], show_feature_values=True)

Contribution?,Feature,Value
1.01,thal,0.204
0.345,count4,5.0
0.267,<BIAS>,1.0
0.21,oldpeak_eq_st_depression,-0.919
0.187,count3,32.0
0.187,serum_cholesterol_mg_per_dl,-0.09
0.128,presure_p_rate,0.672
0.083,num_major_vessels,0.0
0.015,age,1.712
0.012,count2,105.0


In [21]:
y_preds[:5]

array([0.53108446, 0.17938519, 0.83820326, 0.13060909, 0.63512043])

In [22]:
for i, j in zip(X_train.columns, lgb_model.feature_importances_):
    print(i, j)

slope_of_peak_exercise_st_segment 68
thal 88
resting_blood_pressure 73
chest_pain_type 64
num_major_vessels 34


In [23]:
vec = DictVectorizer()

In [24]:
show_weights(lgb_model)

Weight,Feature
0.5287,thal
0.1698,count1
0.1428,num_major_vessels
0.1262,chest_pain_type
0.0325,count0
