In [1]:
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
from catboost import CatBoostClassifier
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import sys

import warnings
warnings.filterwarnings("ignore")

In [52]:
# Read the preprocessed feature tables for both splits.
train = pd.read_csv('../data/Processed/train.csv')
test = pd.read_csv('../data/Processed/test.csv')

# Training labels: only the target column is read from the raw labels file,
# so y_train is a one-column DataFrame.
y_train = pd.read_csv('../data/raw/train_labels.csv', usecols=['heart_disease_present'])

# Every column of the training table except the patient identifier is a feature.
sel_cols = train.columns[train.columns != 'patient_id'].tolist()

# Feature matrices share the same columns, in the same order.
X_train = train[sel_cols]
X_test = test[sel_cols]

In [53]:
# Preview the first rows of the training feature matrix.
X_train.head()

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,count0,count1,count2,count3,count4,presure_p_rate,cholesterol_p_age
0,0.0,0.204082,0.395349,0.333333,0.0,0.0,1.0,0.415525,0.0,1.0,0.333333,0.698113,0.0,1.0,0.75,1.0,0.508475,0.333333,0.573669,0.249247
1,0.5,0.204082,0.186047,0.666667,0.0,0.0,0.0,0.200913,0.258065,0.0,0.520833,0.584906,0.0,0.537634,0.680556,0.597938,0.864407,0.416667,0.328291,0.176958
2,0.0,0.204082,0.360465,1.0,1.0,0.0,1.0,0.406393,0.0,1.0,1.0,0.622642,1.0,1.0,0.75,0.175258,0.186441,0.0,0.489076,0.54882
3,0.0,0.756757,0.674419,1.0,0.0,0.0,0.0,0.221461,0.0,1.0,0.229167,0.801887,0.0,0.354839,1.0,1.0,1.0,0.166667,0.89591,0.094252
4,1.0,0.756757,0.976744,0.0,0.0,0.0,1.0,0.328767,0.677419,1.0,0.625,0.462264,0.0,0.096774,0.138889,0.020619,0.20339,0.0,0.80056,0.314194


In [54]:
# Names of the features regarded as categorical.
cat_ft = ['thal', 'slope_of_peak_exercise_st_segment', 'chest_pain_type', 'sex', 'exercise_induced_angina']

# Positions of those features within sel_cols (which is also the column order of X_train).
cat_ft_id = [position for position, column in enumerate(sel_cols) if column in cat_ft]

In [55]:
# Tag for this model; it is interpolated into the output file names written later.
model_name = 'catb_1'

# Keyword arguments forwarded verbatim to CatBoostClassifier.
params = dict(
    depth=3,
    iterations=1000,
    eval_metric='Logloss',
    random_seed=42,
    logging_level='Verbose',
    allow_writing_files=False,
    early_stopping_rounds=5,
    learning_rate=0.02,
    thread_count=1,
    boosting_type='Plain',
    bootstrap_type='Bernoulli',
    rsm=0.2,
)

# A single classifier instance; the cross-validation loop refits it on every fold.
model_cb = CatBoostClassifier(**params)

In [56]:
# Row index of the training features; handed to skf.split() as the sample array.
train_ids = X_train.index
# Stratified 5-fold splitter, shuffled with a fixed seed so the folds are repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [57]:
# Stratified K-fold training loop.
#
# For every split produced by `skf`, the shared `model_cb` is refit on the
# training part with the held-out part as its eval set. Each fold contributes:
#   * test-set probabilities, summed into `y_preds` and averaged after the loop;
#   * out-of-fold probabilities for the held-out rows, gathered into `stack`;
#   * its best validation Logloss, summed into `be` for the mean printed at the end.
y_preds = np.zeros(X_test.shape[0])
be = 0
fold_frames = []  # one out-of-fold prediction frame per fold, concatenated once at the end

for counter, (train_index, test_index) in enumerate(skf.split(train_ids, y_train), start=1):
    print('Fold {}\n'.format(counter))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    # cat_features=cat_ft_id is deliberately left out, as in the original run.
    # NOTE(review): X_train.head() shows these columns as scaled floats, which
    # CatBoost presumably would not accept as categorical - confirm before enabling.
    model_cb.fit(X_fit,
                 y_fit,
                 eval_set=(X_val, y_val),
                 verbose=50)

    y_preds += model_cb.predict_proba(X_test)[:, 1]

    # Build a fresh frame per fold: reusing one frame across folds fixes its
    # length after the first fold and fails as soon as a later fold has a
    # different number of validation rows.
    # test_index holds positions, so the ids are selected with .iloc; this does
    # not rely on `train` carrying a default integer index.
    fold_frames.append(pd.DataFrame({
        'patient_id': train['patient_id'].iloc[test_index].tolist(),
        'heart_disease_present': model_cb.predict_proba(X_val)[:, 1],
    }))

    # Look the eval-set entry up by prefix instead of hard-coding 'validation_0'.
    # NOTE(review): the key is believed to be 'validation' in newer CatBoost
    # releases and 'validation_0' in older ones - confirm against the installed version.
    val_scores = next(scores for name, scores in model_cb.best_score_.items()
                      if name.startswith('validation'))
    be += val_scores['Logloss']

n_folds = len(fold_frames)

# Single concat instead of growing `stack` inside the loop; the index is
# renumbered 0..n-1 rather than repeating each fold's local index.
stack = pd.concat(fold_frames, axis=0, ignore_index=True)

# Turn the summed test probabilities into the per-fold average.
y_preds = y_preds / n_folds

print('\n\nBEST SCORE MEAN:', be / n_folds)


Fold 1

0:	learn: 0.6899016	test: 0.6891135	best: 0.6891135 (0)	total: 1.11ms	remaining: 1.11s
50:	learn: 0.4716132	test: 0.4586665	best: 0.4586665 (50)	total: 40.8ms	remaining: 759ms
100:	learn: 0.3927779	test: 0.3788832	best: 0.3788832 (100)	total: 77.6ms	remaining: 691ms
150:	learn: 0.3421296	test: 0.3464912	best: 0.3464912 (150)	total: 114ms	remaining: 640ms
200:	learn: 0.3071405	test: 0.3232783	best: 0.3232783 (200)	total: 150ms	remaining: 595ms
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.3136628614
bestIteration = 221

Shrink model to first 222 iterations.
Fold 2

0:	learn: 0.6889495	test: 0.6914270	best: 0.6914270 (0)	total: 742us	remaining: 741ms
50:	learn: 0.4164471	test: 0.5702160	best: 0.5702160 (50)	total: 39.9ms	remaining: 743ms
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.56202773
bestIteration = 61

Shrink model to first 62 iterations.
Fold 3

0:	learn: 0.6876545	test: 0.6916667	best: 0.6916667 (0)	total: 740us	remaining: 739m

In [58]:
# Preview the out-of-fold predictions gathered by the cross-validation loop.
stack.head()

Unnamed: 0,patient_id,heart_disease_present
0,0z64un,0.121738
1,yt1s1x,0.675045
2,3nwy2n,0.877981
3,1r508r,0.086977
4,cvux3j,0.12144


In [59]:
# Write the out-of-fold predictions to a CSV named after the model, without the index column.
stack.to_csv('../stacking/data/train_{}.csv'.format(model_name), index=False)

In [60]:
# Show the first five fold-averaged test-set probabilities.
y_preds[:5]

array([0.47158   , 0.14317648, 0.80628633, 0.19114973, 0.7906737 ])

In [61]:
# Pair each test patient id with its fold-averaged predicted probability.
sub = pd.DataFrame({'patient_id': test['patient_id'], 'heart_disease_present': y_preds})

In [62]:
# Write the test-set predictions to a CSV named after the model, without the index column.
sub.to_csv('../stacking/data/test_{}.csv'.format(model_name), index=False)

In [51]:
# Display the (rows, columns) shape of the out-of-fold frame and the test prediction frame.
stack.shape, sub.shape

((180, 2), (90, 2))