In [2]:
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
import lightgbm as lgb
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import sys

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the processed feature tables and the raw training labels.
train = pd.read_csv('../data/Processed/train.csv')
test = pd.read_csv('../data/Processed/test.csv')
y_train = pd.read_csv('../data/raw/train_labels.csv', usecols=['heart_disease_present'])

# Model inputs: every column except the patient identifier.
X_train = train.loc[:, train.columns != 'patient_id']
X_test = test.loc[:, test.columns != 'patient_id']

In [4]:
# Preview the first rows of the processed training frame.
train.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,0z64un,1,1,128,2,0,0,2,308,0.0,1,45,170,0
1,ryoo3j,2,1,110,3,0,0,0,214,1.6,0,54,158,0
2,yt1s1x,1,1,125,4,3,0,2,304,0.0,1,77,162,1
3,l2xjde,1,2,152,4,0,0,0,223,0.0,1,40,181,0
4,oyt4ek,3,2,178,1,0,0,2,270,4.2,1,59,145,0


In [5]:
# Identifier embedded in the names of the stacking output files.
model_name = 'lgbm_3'

# LightGBM hyper-parameters for a shallow binary classifier.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 2,
    'max_bin': 10,
    'max_delta_step': 0.2,
    # Upper bound on boosting rounds; the fit calls in this notebook pass
    # early_stopping_rounds, so training normally ends well before this.
    'n_estimators': 10000,
    'learning_rate': 0.05,
    'colsample_bytree': 1,
    # NOTE(review): per the LightGBM parameter docs, bagging only takes effect
    # when bagging_freq is non-zero; bagging_freq is not set here, so this
    # value is presumably inert -- confirm whether bagging was intended.
    'bagging_fraction': 0.8,
    'lambda_l1': 0,
    'lambda_l2': 0,
    # Thread count is set exactly once. `n_jobs` is an alias of the same
    # LightGBM parameter as `num_threads`, so supplying both with different
    # values is contradictory; only the primary name is kept.
    'num_threads': 1,
    'seed': 42,
}

# One estimator instance, refit from scratch on every cross-validation fold.
lgb_model = lgb.LGBMClassifier(**params)

# Stacking with nested folds (5 outer x 5 inner)

In [6]:
# Outer (k1) and inner (k2) fold counts for the nested cross-validation.
k1, k2 = 5, 5

# Row labels of the training frame, used as the "X" argument when splitting.
train_ids = X_train.index

# Two identically-seeded stratified splitters: skf1 for the outer loop,
# skf2 for the inner loop.
skf1 = StratifiedKFold(random_state=42, shuffle=True, n_splits=k1)
skf2 = StratifiedKFold(random_state=42, shuffle=True, n_splits=k2)

In [10]:
# Nested stratified cross-validation (k1 outer x k2 inner folds) that builds
# the inputs for a stacking ensemble and writes them to disk:
#   * stack   -- out-of-fold predictions for every training row: each outer
#                validation fold is scored by the k2 models trained on the
#                inner folds of the remaining data, and the scores are averaged.
#   * y_preds -- test-set probabilities averaged over all k1 * k2 fitted models.
#   * be      -- sum of each model's best inner-validation log-loss (reported
#                as a mean at the end).
y_preds = np.zeros(X_test.shape[0])
be = 0
fold_frames = []  # one out-of-fold frame per outer fold, concatenated once below

for counter1, (train_index, test_index) in enumerate(skf1.split(train_ids, y_train), start=1):
    print('Fold k1 {}\n'.format(counter1))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit = y_train.iloc[train_index]

    # Accumulates the k2 inner models' probabilities for the outer validation rows.
    y_preds_stack = np.zeros(X_val.shape[0])

    for train_index2, test_index2 in skf2.split(X_fit.index, y_fit):
        X_fit2, X_val2 = X_fit.iloc[train_index2, :], X_fit.iloc[test_index2, :]
        y_fit2, y_val2 = y_fit.iloc[train_index2], y_fit.iloc[test_index2]

        # The inner validation fold is used only for early stopping.
        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords
        # belong to older LightGBM releases; newer releases expect callbacks
        # instead -- confirm against the installed version before upgrading.
        lgb_model.fit(X_fit2,
                      y_fit2,
                      eval_set=[(X_val2, y_val2)],
                      verbose=400,
                      early_stopping_rounds=5)

        y_preds += lgb_model.predict_proba(X_test)[:, 1]
        y_preds_stack += lgb_model.predict_proba(X_val)[:, 1]

        be += lgb_model.best_score_['valid_0']['binary_logloss']

    # A fresh frame per outer fold: reusing one frame across folds raises a
    # length-mismatch error as soon as two folds differ in size.
    # `.iloc` is used because test_index holds row positions, not index labels.
    stack2 = pd.DataFrame(
        {'patient_id': train['patient_id'].iloc[test_index].tolist(),
         'heart_disease_present': y_preds_stack / k2},
        columns=['patient_id', 'heart_disease_present'],
    )
    fold_frames.append(stack2)

# Single concatenation instead of growing the frame inside the loop.
stack = pd.concat(fold_frames, axis=0)

y_preds = y_preds / (k1 * k2)

print('\n\nBEST SCORE MEAN:', be / (k1*k2))

# Persist the level-1 training features and the averaged test predictions.
stack.to_csv('../stacking/data/train_{}.csv'.format(model_name), index=False)
sub = pd.DataFrame({'patient_id': test['patient_id'], 'heart_disease_present': y_preds})
sub.to_csv('../stacking/data/test_{}.csv'.format(model_name), index=False)

Fold k1 1

Training until validation scores don't improve for 5 rounds.
Early stopping, best iteration is:
[285]	valid_0's binary_logloss: 0.400308
Training until validation scores don't improve for 5 rounds.
Early stopping, best iteration is:
[201]	valid_0's binary_logloss: 0.540521
Training until validation scores don't improve for 5 rounds.
Early stopping, best iteration is:
[197]	valid_0's binary_logloss: 0.434706
Training until validation scores don't improve for 5 rounds.
Early stopping, best iteration is:
[234]	valid_0's binary_logloss: 0.409284
Training until validation scores don't improve for 5 rounds.
Early stopping, best iteration is:
[213]	valid_0's binary_logloss: 0.490528
Fold k1 2

Training until validation scores don't improve for 5 rounds.
[400]	valid_0's binary_logloss: 0.267415
Early stopping, best iteration is:
[438]	valid_0's binary_logloss: 0.253199
Training until validation scores don't improve for 5 rounds.
Early stopping, best iteration is:
[255]	valid_0's bin

In [8]:
# Nested stratified cross-validation (k1 outer x k2 inner folds) producing
# stacking inputs in memory:
#   * stack   -- out-of-fold predictions for every training row (mean of the
#                k2 inner models' probabilities on each outer validation fold).
#   * y_preds -- test-set probabilities averaged over all k1 * k2 fitted models.
#   * be      -- sum of each model's best inner-validation log-loss (reported
#                as a mean at the end).
y_preds = np.zeros(X_test.shape[0])
be = 0
fold_frames = []  # one out-of-fold frame per outer fold, concatenated once below

for counter1, (train_index, test_index) in enumerate(skf1.split(train_ids, y_train), start=1):
    print('Fold k1 {}\n'.format(counter1))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit = y_train.iloc[train_index]

    # Accumulates the k2 inner models' probabilities for the outer validation rows.
    y_preds_stack = np.zeros(X_val.shape[0])

    inner_splits = skf2.split(X_fit.index, y_fit)
    for counter2, (train_index2, test_index2) in enumerate(inner_splits, start=1):
        print('Fold k2 {}\n'.format(counter2))

        X_fit2, X_val2 = X_fit.iloc[train_index2, :], X_fit.iloc[test_index2, :]
        y_fit2, y_val2 = y_fit.iloc[train_index2], y_fit.iloc[test_index2]

        # The inner validation fold is used only for early stopping.
        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords
        # belong to older LightGBM releases; newer releases expect callbacks
        # instead -- confirm against the installed version before upgrading.
        lgb_model.fit(X_fit2,
                      y_fit2,
                      eval_set=[(X_val2, y_val2)],
                      verbose=400,
                      early_stopping_rounds=5)

        y_preds += lgb_model.predict_proba(X_test)[:, 1]
        y_preds_stack += lgb_model.predict_proba(X_val)[:, 1]

        be += lgb_model.best_score_['valid_0']['binary_logloss']

    # `.iloc` is used because test_index holds row positions, not index labels;
    # plain `[]` lookup only works while `train` keeps its default integer index.
    stack2 = pd.DataFrame(
        {'patient_id': train['patient_id'].iloc[test_index].tolist(),
         'heart_disease_present': y_preds_stack / k2},
        columns=['patient_id', 'heart_disease_present'],
    )
    fold_frames.append(stack2)

# Single concatenation instead of growing the frame inside the loop.
stack = pd.concat(fold_frames, axis=0)

y_preds = y_preds / (k1 * k2)

print('\n\nBEST SCORE MEAN:', be / (k1*k2))


Fold k1 1

Fold k2 1

Training until validation scores don't improve for 5 rounds.
Early stopping, best iteration is:
[285]	valid_0's binary_logloss: 0.400308
Fold k2 2

Training until validation scores don't improve for 5 rounds.
Early stopping, best iteration is:
[201]	valid_0's binary_logloss: 0.540521
Fold k2 3

Training until validation scores don't improve for 5 rounds.
Early stopping, best iteration is:
[197]	valid_0's binary_logloss: 0.434706
Fold k2 4

Training until validation scores don't improve for 5 rounds.
Early stopping, best iteration is:
[234]	valid_0's binary_logloss: 0.409284
Fold k2 5

Training until validation scores don't improve for 5 rounds.
Early stopping, best iteration is:
[213]	valid_0's binary_logloss: 0.490528
Fold k1 2

Fold k2 1

Training until validation scores don't improve for 5 rounds.
[400]	valid_0's binary_logloss: 0.267415
Early stopping, best iteration is:
[438]	valid_0's binary_logloss: 0.253199
Fold k2 2

Training until validation scores don't

In [117]:
# Sanity check: (rows, columns) of the out-of-fold prediction frame.
stack.shape

(180, 2)

In [118]:
# Preview the first out-of-fold predictions.
stack.head()

Unnamed: 0,patient_id,heart_disease_present
0,0z64un,0.164153
1,yt1s1x,0.620082
2,3nwy2n,0.810374
3,cvux3j,0.308505
4,lek9q9,0.202178


In [9]:
# Write the out-of-fold training predictions for the stacking stage.
stack.to_csv(
    path_or_buf='../stacking/data/train_{}.csv'.format(model_name),
    index=False,
)

In [119]:
# Preview the first five averaged test-set probabilities.
y_preds[:5]

array([0.4990003 , 0.25349427, 0.81547985, 0.27348816, 0.76755211])

In [11]:
# Pair each test patient with its averaged predicted probability.
sub = pd.DataFrame({
    'patient_id': test['patient_id'],
    'heart_disease_present': y_preds,
})

In [12]:
# Write the averaged test-set predictions for the stacking stage.
sub.to_csv(
    path_or_buf='../stacking/data/test_{}.csv'.format(model_name),
    index=False,
)