# Librerías

In [8]:
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.externals import joblib
from catboost import CatBoostClassifier
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import sys

import warnings
warnings.filterwarnings("ignore")

# Datos

In [9]:
# Load the processed feature tables and the raw training labels.
train = pd.read_csv('../data/Processed/train.csv')
test = pd.read_csv('../data/Processed/test.csv')
# NOTE(review): only the label column is read, so labels are matched to the
# features purely by row order — confirm both files share the same ordering.
y_train = pd.read_csv('../data/raw/train_labels.csv', usecols=['heart_disease_present'])

# Every column except the patient identifier is used as a model feature.
sel_cols = train.columns[train.columns != 'patient_id'].tolist()
X_train = train[sel_cols]
X_test = test[sel_cols]

In [10]:
# Preview the first rows of the feature matrix (patient_id already excluded).
X_train.head()

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,1,1,128,2,0,0,2,308,0.0,1,45,170,0
1,2,1,110,3,0,0,0,214,1.6,0,54,158,0
2,1,1,125,4,3,0,2,304,0.0,1,77,162,1
3,1,2,152,4,0,0,0,223,0.0,1,40,181,0
4,3,2,178,1,0,0,2,270,4.2,1,59,145,0


In [11]:
# Names of the features treated as categorical.
cat_ft = ['thal', 'chest_pain_type', 'sex']

# Positional indices of those features within sel_cols (the column order of
# X_train / X_test), i.e. the form expected by a `cat_features` argument.
cat_ft_id = [position for position, column in enumerate(sel_cols) if column in cat_ft]

# Random Search

Usamos una búsqueda aleatoria (`RandomizedSearchCV`) para encontrar la mejor combinación de hiperparámetros de CatBoost para este problema.

In [92]:
# Search space for the randomized hyper-parameter search. Each entry is a
# sequence of candidate values; single-element lists pin that setting for
# every sampled candidate.
# NOTE(review): 'max_delta_step' is part of this space but does not appear in
# the recorded best_params_ output — confirm that CatBoost accepts this option.
# NOTE(review): the search is fitted without an eval_set, so
# 'early_stopping_rounds' presumably has no effect here — verify.
params = dict(
    depth=np.arange(3, 7, 1),
    iterations=[500],
    eval_metric=['Logloss'],
    random_seed=[42],
    verbose=[0],
    allow_writing_files=[False],
    early_stopping_rounds=[5],
    learning_rate=np.arange(0.01, 0.2, 0.01),
    thread_count=[1],
    boosting_type=['Plain'],
    bootstrap_type=['Bernoulli'],
    colsample_bylevel=np.arange(0.1, 1.1, 0.1),
    l2_leaf_reg=np.arange(0, 10, 1),
    border_count=np.arange(1, 255, 5),
    max_delta_step=np.arange(1, 11, 1),
)

# Base estimator; the search sets its parameters for every candidate.
model_cb = CatBoostClassifier()

# 60 sampled candidates, each scored by ROC AUC with 5-fold cross-validation,
# using all available cores.
rscv = RandomizedSearchCV(
    estimator=model_cb,
    param_distributions=params,
    n_iter=60,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

In [93]:
# Run the randomized search on the full training set
# (60 candidates x 5 folds = 300 fits, as reported in the log below).
rscv.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   38.8s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=<catboost.core.CatBoostClassifier object at 0x7fd5e9ea5dd8>,
          fit_params=None, iid='warn', n_iter=60, n_jobs=-1,
          param_distributions={'depth': array([3, 4, 5, 6]), 'iterations': [500], 'eval_metric': ['Logloss'], 'random_seed': [42], 'verbose': [0], 'allow_writing_files': [False], 'early_stopping_rounds': [5], 'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12..., 166, 171, 176, 181, 186, 191,
       196, 201, 206, 211, 216, 221, 226, 231, 236, 241, 246, 251])},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=1)

## Mejores parámetros y mejor score

In [94]:
# Report the winning parameter combination, a blank line, and then its mean
# cross-validated score (ROC AUC, the `scoring` configured for the search).
print('{} \n'.format(rscv.best_params_))
print(rscv.best_score_)

{'verbose': 0, 'thread_count': 1, 'random_seed': 42, 'learning_rate': 0.01, 'l2_leaf_reg': 5, 'iterations': 500, 'eval_metric': 'Logloss', 'early_stopping_rounds': 5, 'depth': 4, 'colsample_bylevel': 0.4, 'border_count': 216, 'bootstrap_type': 'Bernoulli', 'boosting_type': 'Plain', 'allow_writing_files': False} 

0.901875


In [95]:
# Build the final CatBoost model from the best combination found by the search.
# (An earlier hand-written configuration — learning_rate=0.04, depth=5,
# iterations=1000, colsample_bylevel=0.1, border_count=231 — was kept here as
# commented-out code; the search result replaces it.)
params = rscv.best_params_
model_cb = CatBoostClassifier(**params)

# Identifier used in the names of the stacking files written later on.
model_name = 'catb_1'

# Entrenamos el modelo final de CatBoost

Guardamos las predicciones out-of-fold sobre train y las predicciones promediadas sobre test para usarlas después en el stacking.

In [96]:
# Number of outer (k1) and inner (k2) folds of the nested cross-validation.
k1 = k2 = 5

# Row labels of the training set; only used as the `X` argument of split().
train_ids = X_train.index

# One shuffled, seeded stratified splitter per nesting level.
skf1, skf2 = (
    StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    for n_folds in (k1, k2)
)

In [107]:
# Nested stratified cross-validation that produces the stacking inputs:
#   * `stack`   - out-of-fold predictions for every training row, obtained by
#                 averaging the k2 inner models of the outer fold that holds
#                 the row out;
#   * `y_preds` - test-set predictions averaged over all k1*k2 fitted models.
y_preds = np.zeros(X_test.shape[0])  # running sum of test probabilities
be = 0                               # running sum of each fit's best validation Logloss
fold_stacks = []                     # one out-of-fold frame per outer fold

for counter1, (train_index, test_index) in enumerate(skf1.split(train_ids, y_train), start=1):
    print('Fold k1 {}\n'.format(counter1))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit = y_train.iloc[train_index]

    # Sum of the inner models' positive-class probabilities for the rows held
    # out by this outer fold (none of the inner models is trained on them).
    y_preds_stack = np.zeros(X_val.shape[0])

    for train_index2, test_index2 in skf2.split(X_fit.index, y_fit):
        X_fit2, X_val2 = X_fit.iloc[train_index2, :], X_fit.iloc[test_index2, :]
        y_fit2, y_val2 = y_fit.iloc[train_index2], y_fit.iloc[test_index2]

        # The inner validation split is passed as the eval set, which is what
        # best_score_ below is computed on.
        model_cb.fit(X_fit2,
                     y_fit2,
                     eval_set=(X_val2, y_val2),
                     verbose=0)

        y_preds += model_cb.predict_proba(X_test)[:, 1]
        y_preds_stack += model_cb.predict_proba(X_val)[:, 1]

        # NOTE(review): the eval-set key is assumed to be 'validation_0' on
        # older CatBoost releases and 'validation' on newer ones — confirm
        # against the installed version.
        best_scores = model_cb.best_score_
        eval_key = 'validation_0' if 'validation_0' in best_scores else 'validation'
        be += best_scores[eval_key]['Logloss']

    # Build a fresh frame for every outer fold (re-assigning columns of a
    # shared frame fails when fold sizes differ) and select the ids
    # positionally, because test_index holds positions, not index labels.
    fold_stacks.append(pd.DataFrame({
        'patient_id': train['patient_id'].iloc[test_index].tolist(),
        'heart_disease_present': y_preds_stack / k2,
    }))

# Concatenate once instead of growing a DataFrame inside the loop.
stack = pd.concat(fold_stacks, axis=0, ignore_index=True)
y_preds = y_preds / (k1*k2)

print('\n\nBEST SCORE MEAN:', be / (k1*k2))

# Persist the stacking inputs: out-of-fold train predictions and test predictions.
stack.to_csv('../stacking/data/train_{}.csv'.format(model_name), index=False)
sub = pd.DataFrame({'patient_id': test['patient_id'], 'heart_disease_present': y_preds})
sub.to_csv('../stacking/data/test_{}.csv'.format(model_name), index=False)

Fold k1 1

Fold k1 2

Fold k1 3

Fold k1 4

Fold k1 5



BEST SCORE MEAN: 0.427263915761494


In [106]:
# Inspect the first out-of-fold predictions (patient_id + predicted probability).
stack.head(10)

Unnamed: 0,patient_id,heart_disease_present
0,0z64un,0.165663
1,yt1s1x,0.677301
2,3nwy2n,0.834165
3,1r508r,0.141079
4,cvux3j,0.186239
5,jhdvtb,0.292091
6,lek9q9,0.243899
7,8265rl,0.227086
8,k7ef7h,0.755893
9,xkdz7j,0.412519


In [99]:
# Write the out-of-fold train predictions used as stacking input.
# NOTE(review): the cross-validation cell already performs this same write;
# this cell looks redundant — confirm before removing.
stack.to_csv('../stacking/data/train_{}.csv'.format(model_name), index=False)

In [100]:
# Peek at the first averaged test-set probabilities.
y_preds[:5]

array([0.44378202, 0.21548323, 0.84147298, 0.17008381, 0.74211812])

In [101]:
# Pair every test patient_id with its averaged predicted probability.
sub = test[['patient_id']].assign(heart_disease_present=y_preds)

In [62]:
# Write the test-set predictions used as stacking input.
# NOTE(review): the cross-validation cell already performs this same write;
# this cell looks redundant — confirm before removing.
sub.to_csv('../stacking/data/test_{}.csv'.format(model_name), index=False)

In [51]:
# Sanity check: one out-of-fold row per training sample and one prediction
# row per test sample (180 and 90 rows in the recorded output).
stack.shape, sub.shape

((180, 2), (90, 2))