In [42]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelBinarizer
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import xgboost as xgb

In [2]:
train = pd.read_csv('../data/Processed/train.csv')
X_train = train.loc[:,[c for c in train.columns if c != 'patient_id']]
y_train = pd.read_csv('../data/raw/train_labels.csv', usecols=['heart_disease_present'])
test = pd.read_csv('../data/Processed/test.csv')
X_test = test.loc[:,[c for c in test.columns if c != 'patient_id']]

In [3]:
X_train.head()

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,1,1,128,2,0,0,2,308,0.0,1,45,170,0
1,2,1,110,3,0,0,0,214,1.6,0,54,158,0
2,1,1,125,4,3,0,2,304,0.0,1,77,162,1
3,1,2,152,4,0,0,0,223,0.0,1,40,181,0
4,3,2,178,1,0,0,2,270,4.2,1,59,145,0


In [4]:
train_stack_list = [
    'train_catb_1.csv',
    'train_keras.csv',
    'train_lgbm_3.csv',
    'train_rf_1.csv',
    'train_xgb_2.csv',
]

test_stack_list = [
    'test_catb_1.csv',
    'test_keras.csv',
    'test_lgbm_3.csv',
    'test_rf_1.csv',
    'test_xgb_2.csv',
]

In [13]:
main0 = pd.read_csv('../stacking/data/{}'.format(train_stack_list[0]))

In [14]:
main1 = pd.read_csv('../stacking/data/{}'.format(train_stack_list[1]))

In [16]:
main0.join(main1.set_index('patient_id'), on='patient_id', rsuffix='2').head()

Unnamed: 0,patient_id,heart_disease_present,heart_disease_present2
0,0z64un,0.165663,0.048076
1,yt1s1x,0.677301,0.598187
2,3nwy2n,0.834165,0.876046
3,1r508r,0.141079,0.090756
4,cvux3j,0.186239,0.037099


In [19]:
main_train = pd.read_csv('../stacking/data/{}'.format(train_stack_list[0]))
for i in range(1, len(train_stack_list)):
    main0 = pd.read_csv('../stacking/data/{}'.format(train_stack_list[i]))
    main_train = main_train.join(main0.set_index('patient_id'), on='patient_id', rsuffix=f'_{i}')

In [21]:
main_train.head()

Unnamed: 0,patient_id,heart_disease_present,heart_disease_present_1,heart_disease_present_2,heart_disease_present_3,heart_disease_present_4
0,0z64un,0.165663,0.048076,0.134946,0.092259,0.180913
1,yt1s1x,0.677301,0.598187,0.606524,0.554173,0.609941
2,3nwy2n,0.834165,0.876046,0.855298,0.807424,0.861866
3,1r508r,0.141079,0.090756,0.108107,0.062726,0.155135
4,cvux3j,0.186239,0.037099,0.307902,0.183188,0.203478


In [22]:
main_test = pd.read_csv('../stacking/data/{}'.format(test_stack_list[0]))
for i in range(1, len(train_stack_list)):
    main0 = pd.read_csv('../stacking/data/{}'.format(test_stack_list[i]))
    main_test = main_test.join(main0.set_index('patient_id'), on='patient_id', rsuffix=f'_{i}')

In [23]:
main_test.head()

Unnamed: 0,patient_id,heart_disease_present,heart_disease_present_1,heart_disease_present_2,heart_disease_present_3,heart_disease_present_4
0,olalu7,0.452141,0.494247,0.466422,0.458839,0.5048
1,z9n6mx,0.16904,0.199757,0.252154,0.133021,0.215033
2,5k4413,0.84345,0.857324,0.833522,0.870108,0.787661
3,mrg7q5,0.173172,0.101676,0.242351,0.237691,0.272594
4,uki4do,0.800751,0.847335,0.784443,0.783636,0.870435


In [31]:
train_stack = train.join(main_train.set_index('patient_id'), on='patient_id')
test_stack = test.join(main_test.set_index('patient_id'), on='patient_id')
X_train = train_stack.loc[:,[c for c in train_stack.columns if c != 'patient_id']]
X_test = test_stack.loc[:,[c for c in test_stack.columns if c != 'patient_id']]

In [32]:
X_train.head()

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,heart_disease_present,heart_disease_present_1,heart_disease_present_2,heart_disease_present_3,heart_disease_present_4
0,1,1,128,2,0,0,2,308,0.0,1,45,170,0,0.165663,0.048076,0.134946,0.092259,0.180913
1,2,1,110,3,0,0,0,214,1.6,0,54,158,0,0.156811,0.101864,0.128856,0.127684,0.147777
2,1,1,125,4,3,0,2,304,0.0,1,77,162,1,0.677301,0.598187,0.606524,0.554173,0.609941
3,1,2,152,4,0,0,0,223,0.0,1,40,181,0,0.39382,0.31256,0.419402,0.31311,0.344652
4,3,2,178,1,0,0,2,270,4.2,1,59,145,0,0.727382,0.80928,0.785368,0.766442,0.814253


In [43]:
params = {'max_depth': [5],
          'metric': ['binary_logloss'],
          'n_estimators': [500],
          'eta': np.arange(0.01, 0.1, 0.01),
          'colsample_bytree':np.arange(0.1, 1.1, 0.1),
          'n_jobs': [1],
          'seed': [42],
          'gamma': np.arange(0, 0.9, 0.1),
          'max_bin': np.arange(2, 128, 2),
          'objective': ['binary:logistic'],
          'gamma':np.arange(0, 10, 0.1),
          'alpha':np.arange(0, 1.1, 0.1),
          'lambda':np.arange(0,1.1, 0.1),
          'max_delta_step':np.arange(0, 11, 1)
         }

xgb_model = xgb.XGBClassifier()

rscv = RandomizedSearchCV(xgb_model,
                          params,
                          verbose=1,
                          random_state=42,
                          scoring='roc_auc',
                          n_iter=60,
                          cv = 5,
                          n_jobs=-1)

rscv.fit(X_train, y_train)

print(rscv.best_params_,'\n')
print(rscv.best_score_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    5.3s


{'seed': 42, 'objective': 'binary:logistic', 'n_jobs': 1, 'n_estimators': 500, 'metric': 'binary_logloss', 'max_depth': 5, 'max_delta_step': 9, 'max_bin': 56, 'lambda': 0.4, 'gamma': 2.0, 'eta': 0.01, 'colsample_bytree': 0.1, 'alpha': 0.7000000000000001} 

0.8884375


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    7.6s finished


In [44]:
params = rscv.best_params_
xgb_model = xgb.XGBClassifier(**params)

train_ids = X_train.index
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [50]:
counter = 1
y_preds = np.zeros(X_test.shape[0])
be = 0
log_loss_score = 0
for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold {}\n'.format(counter))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    xgb_model.fit(X_fit,
                     y_fit,
                     eval_set=[(X_val, y_val)],
                     verbose=100,
                  early_stopping_rounds=5,
                 eval_metric='logloss')

    y_preds += xgb_model.predict_proba(X_test)[:,1]
    
    
#     y_val = lb.fit_transform(y_val)
#     log_loss_score += log_loss(y_val, lr_model.predict_proba(X_val)[:,1])

    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()

#     print('Guardamos el modelo')
#     joblib.dump(lgb_model, '../saved_models/{}_{}.pkl'.format(model_name, counter))

#     ft_importances += lgb_model.feature_importances_

    counter += 1

y_preds = y_preds / (counter-1)

# print('\n\nBEST SCORE MEAN:', be / (counter-1))
print('\n\nBEST LOG_LOSS SCORE MEAN:', log_loss_score / (counter-1))

Fold 1

[0]	validation_0-logloss:0.646542
Will train until validation_0-logloss hasn't improved in 5 rounds.
Stopping. Best iteration:
[47]	validation_0-logloss:0.378452

Fold 2

[0]	validation_0-logloss:0.656488
Will train until validation_0-logloss hasn't improved in 5 rounds.
Stopping. Best iteration:
[12]	validation_0-logloss:0.554954

Fold 3

[0]	validation_0-logloss:0.655671
Will train until validation_0-logloss hasn't improved in 5 rounds.
Stopping. Best iteration:
[49]	validation_0-logloss:0.395557

Fold 4

[0]	validation_0-logloss:0.656535
Will train until validation_0-logloss hasn't improved in 5 rounds.
Stopping. Best iteration:
[55]	validation_0-logloss:0.365735

Fold 5

[0]	validation_0-logloss:0.655872
Will train until validation_0-logloss hasn't improved in 5 rounds.
Stopping. Best iteration:
[20]	validation_0-logloss:0.517781



BEST LOG_LOSS SCORE MEAN: 0.0


In [33]:
model_name = 'lr_1'

params = {'tol':0.001,
          'C':1,
          'max_iter':10000,
          'n_jobs':1,
          'verbose':5,
          'random_state': 42,
          'penalty': 'l2'}


lr_model = LogisticRegression(**params)

In [34]:
train_ids = X_train.index
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [35]:
lb = LabelBinarizer()

In [36]:
counter = 1
y_preds = np.zeros(X_test.shape[0])
be = 0
log_loss_score = 0
for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold {}\n'.format(counter))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    lr_model.fit(X_fit,
                  y_fit)
    
    
    y_preds += lr_model.predict_proba(X_test)[:,1]
    
    y_val = lb.fit_transform(y_val)
    log_loss_score += log_loss(y_val, lr_model.predict_proba(X_val)[:,1])

    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()

#     print('Guardamos el modelo')
#     joblib.dump(lgb_model, '../saved_models/{}_{}.pkl'.format(model_name, counter))

#     ft_importances += lgb_model.feature_importances_

    counter += 1

y_preds = y_preds / (counter-1)

# print('\n\nBEST SCORE MEAN:', be / (counter-1))
print('\n\nBEST LOG_LOSS SCORE MEAN:', log_loss_score / (counter-1))

Fold 1

[LibLinear]Fold 2

[LibLinear]Fold 3

[LibLinear]

BEST LOG_LOSS SCORE MEAN: 0.46795401084693244


In [51]:
y_preds[:5]

array([0.47717623, 0.21988214, 0.82216154, 0.22024838, 0.85706072])

In [52]:
sub = pd.DataFrame({'patient_id': test['patient_id'], 'heart_disease_present': y_preds})

In [53]:
sub.to_csv('../submissions/stacking_4.csv', index=False)