In [1]:
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
from catboost import CatBoostClassifier
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import sys

import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('../data/Processed/train.csv')
sel_cols = [c for c in train.columns if c != 'patient_id']
X_train = train.loc[:,sel_cols]
y_train = pd.read_csv('../data/raw/train_labels.csv', usecols=['heart_disease_present'])
test = pd.read_csv('../data/Processed/test.csv')
X_test = test.loc[:,sel_cols]

In [3]:
X_train.head()

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,1,1,128,2,0,0,2,308,0.0,1,45,170,0
1,2,1,110,3,0,0,0,214,1.6,0,54,158,0
2,1,1,125,4,3,0,2,304,0.0,1,77,162,1
3,1,2,152,4,0,0,0,223,0.0,1,40,181,0
4,3,2,178,1,0,0,2,270,4.2,1,59,145,0


In [4]:
cat_ft_id = list()
cat_ft = ['thal', 'slope_of_peak_exercise_st_segment', 'chest_pain_type', 'sex', 'exercise_induced_angina']

n = 0
for c in sel_cols:
    if c in cat_ft:
        cat_ft_id.append(n)
    n += 1

In [5]:
model_name = 'catb_1'

params={'depth':3,
        'iterations':1000,
        'eval_metric':'Logloss',
        'random_seed':42,
        'logging_level':'Verbose',
        'allow_writing_files':False,
        'early_stopping_rounds':5,
        'learning_rate':0.02,
        'thread_count':1,
        'boosting_type':'Plain',
        'bootstrap_type':'Bernoulli',
        'rsm':0.2}

model_cb = CatBoostClassifier(**params)

In [14]:
train_ids = X_train.index
k1 = 2
k2 = 5
skf1 = StratifiedKFold(n_splits=k1, shuffle=True, random_state=42)
skf2 = StratifiedKFold(n_splits=k2, shuffle=True, random_state=42)

In [17]:
counter1 = 1
y_preds = np.zeros(X_test.shape[0])
be = 0
stack = pd.DataFrame()
stack2 = pd.DataFrame()
for train_index, test_index in skf1.split(train_ids, y_train):
    counter2 = 1
    print('Fold k1 {}\n'.format(counter1))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
    
    X_fit_ids = X_fit.index
    
    y_preds_stack = np.zeros(X_val.shape[0])
    
    for train_index2, test_index2 in skf2.split(X_fit_ids, y_fit):
        
        print('Fold k2 {}\n'.format(counter2))
        
        X_fit2, X_val2 = X_fit.iloc[train_index2, :], X_fit.iloc[test_index2, :]
        y_fit2, y_val2 = y_fit.iloc[train_index2], y_fit.iloc[test_index2]

        model_cb.fit(X_fit,
                     y_fit,
#                      cat_features=cat_ft_id,
                     eval_set=(X_val, y_val),
                     verbose=50)

        y_preds += model_cb.predict_proba(X_test)[:,1]
        y_preds_stack += model_cb.predict_proba(X_val)[:,1]

        be += model_cb.best_score_['validation_0']['Logloss']
        
        del X_fit2
        del X_val2
        del y_fit2
        del y_val2
        del train_index2
        del test_index2
        gc.collect()

    #     print('Guardamos el modelo')
    #     joblib.dump(lgb_model, '../saved_models/{}_{}.pkl'.format(model_name, counter))

    #     ft_importances += lgb_model.feature_importances_

        counter2 += 1
        
    stack2['patient_id'] = train['patient_id'][test_index].tolist()
    stack2['heart_disease_present'] = y_preds_stack/k2
    stack = pd.concat([stack, stack2], axis = 0)
    
    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()

    counter1 += 1
    
y_preds = y_preds / (k1*k2)

print('\n\nBEST SCORE MEAN:', be / (k1*k2))


Fold k1 1

Fold k2 1

0:	learn: 0.6929017	test: 0.6929017	best: 0.6929017 (0)	total: 863us	remaining: 862ms
50:	learn: 0.4515853	test: 0.5163558	best: 0.5163558 (50)	total: 46ms	remaining: 856ms
100:	learn: 0.3421630	test: 0.4579556	best: 0.4579556 (100)	total: 86.9ms	remaining: 773ms
150:	learn: 0.2767076	test: 0.4396471	best: 0.4396471 (150)	total: 127ms	remaining: 712ms
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.4360510827
bestIteration = 165

Shrink model to first 166 iterations.
Fold k2 2

0:	learn: 0.6929017	test: 0.6929017	best: 0.6929017 (0)	total: 795us	remaining: 795ms
50:	learn: 0.4515853	test: 0.5163558	best: 0.5163558 (50)	total: 48ms	remaining: 893ms
100:	learn: 0.3421630	test: 0.4579556	best: 0.4579556 (100)	total: 89.4ms	remaining: 796ms
150:	learn: 0.2767076	test: 0.4396471	best: 0.4396471 (150)	total: 131ms	remaining: 736ms
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.4360510827
bestIteration = 165

Shrink model to first 1

In [18]:
stack.head()

Unnamed: 0,patient_id,heart_disease_present
0,0z64un,0.093389
1,yt1s1x,0.79319
2,3nwy2n,0.875708
3,1r508r,0.092922
4,ldg4b9,0.386278


In [59]:
stack.to_csv('../stacking/data/train_{}.csv'.format(model_name), index=False)

In [19]:
y_preds[:5]

array([0.47096535, 0.16064737, 0.86989361, 0.14377949, 0.79674464])

In [61]:
sub = pd.DataFrame({'patient_id': test['patient_id'], 'heart_disease_present': y_preds})

In [62]:
sub.to_csv('../stacking/data/test_{}.csv'.format(model_name), index=False)

In [51]:
stack.shape, sub.shape

((180, 2), (90, 2))