In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelBinarizer
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import sys

In [43]:
# Load the processed feature tables and the training labels.
# `usecols` keeps y_train as a one-column DataFrame.
train = pd.read_csv('../data/Processed/train.csv')
test = pd.read_csv('../data/Processed/test.csv')
y_train = pd.read_csv('../data/raw/train_labels.csv', usecols=['heart_disease_present'])

# Model inputs are every column except the 'patient_id' identifier.
X_train = train.loc[:, train.columns != 'patient_id']
X_test = test.loc[:, test.columns != 'patient_id']

In [44]:
# Preview the first five rows of the base feature matrix.
X_train.head(n=5)

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,count0,count1,count2,count3,count4,presure_p_rate,cholesterol_p_age
0,0.0,0.204082,0.395349,0.333333,0.0,0.0,1.0,0.415525,0.0,1.0,0.333333,0.698113,0.0,1.0,0.75,1.0,0.508475,0.333333,0.573669,0.249247
1,0.5,0.204082,0.186047,0.666667,0.0,0.0,0.0,0.200913,0.258065,0.0,0.520833,0.584906,0.0,0.537634,0.680556,0.597938,0.864407,0.416667,0.328291,0.176958
2,0.0,0.204082,0.360465,1.0,1.0,0.0,1.0,0.406393,0.0,1.0,1.0,0.622642,1.0,1.0,0.75,0.175258,0.186441,0.0,0.489076,0.54882
3,0.0,0.756757,0.674419,1.0,0.0,0.0,0.0,0.221461,0.0,1.0,0.229167,0.801887,0.0,0.354839,1.0,1.0,1.0,0.166667,0.89591,0.094252
4,1.0,0.756757,0.976744,0.0,0.0,0.0,1.0,0.328767,0.677419,1.0,0.625,0.462264,0.0,0.096774,0.138889,0.020619,0.20339,0.0,0.80056,0.314194


In [45]:
# Tags of the models whose prediction files are stacked. The train and test
# file lists are derived from this single list so they stay in the same order.
stack_models = ['catb_1', 'keras', 'lgbm_3', 'rf_1', 'xgb_2']

train_stack_list = ['train_{}.csv'.format(tag) for tag in stack_models]
test_stack_list = ['test_{}.csv'.format(tag) for tag in stack_models]

In [46]:
# Assemble the train-side stack frame from the files in train_stack_list.
# The first file is read in full (all of its columns); from every other file
# only the 'heart_disease_present' column is read.
stack_path = '../stacking/data/{}'
first_train_file, other_train_files = train_stack_list[0], train_stack_list[1:]

train_frames = [pd.read_csv(stack_path.format(first_train_file))]
train_frames.extend(
    pd.read_csv(stack_path.format(name), usecols=['heart_disease_present'])
    for name in other_train_files
)

# Frames are joined side by side; ignore_index=True renumbers the columns 0..N-1.
X_train_stack = pd.concat(train_frames, axis=1, ignore_index=True)

In [47]:
# Preview the first five rows of the train-side stack frame.
X_train_stack.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,0z64un,0.121738,0.100189,0.168071,0.092259,0.021424
1,yt1s1x,0.675045,0.889463,0.553864,0.554173,0.717458
2,3nwy2n,0.877981,0.910203,0.842428,0.807424,0.956777
3,1r508r,0.086977,0.154261,0.085671,0.062726,0.05291
4,cvux3j,0.12144,0.134719,0.339267,0.183188,0.12786


In [48]:
# Assemble the test-side stack frame from the files in test_stack_list.
# The first file is read in full (all of its columns); from every other file
# only the 'heart_disease_present' column is read.
stack_path = '../stacking/data/{}'
first_test_file, other_test_files = test_stack_list[0], test_stack_list[1:]

test_frames = [pd.read_csv(stack_path.format(first_test_file))]
test_frames.extend(
    pd.read_csv(stack_path.format(name), usecols=['heart_disease_present'])
    for name in other_test_files
)

# Frames are joined side by side; ignore_index=True renumbers the columns 0..N-1.
X_test_stack = pd.concat(test_frames, axis=1, ignore_index=True)

In [49]:
# Preview the first five rows of the test-side stack frame.
X_test_stack.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,olalu7,0.47158,0.326165,0.495893,0.458839,0.392954
1,z9n6mx,0.143176,0.065831,0.268892,0.133021,0.157838
2,5k4413,0.806286,0.815695,0.813633,0.870108,0.923806
3,mrg7q5,0.19115,0.148431,0.258783,0.237691,0.337304
4,uki4do,0.790674,0.844193,0.714119,0.783636,0.856722


In [50]:
# Append the stacked model predictions to the base features. The first column
# of each stack frame holds the row identifier and is skipped via iloc[:, 1:].
#
# The base features are re-derived from `train` / `test` instead of reading
# the current `X_train` / `X_test`, so re-running this cell does not append
# the stack columns a second time.
#
# ignore_index=True renumbers the resulting columns 0..N-1.
X_train = pd.concat(
    [train.loc[:, train.columns != 'patient_id'], X_train_stack.iloc[:, 1:]],
    axis=1,
    ignore_index=True,
)
X_test = pd.concat(
    [test.loc[:, test.columns != 'patient_id'], X_test_stack.iloc[:, 1:]],
    axis=1,
    ignore_index=True,
)

In [64]:
# Preview the first five rows of the feature matrix.
X_train.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.0,0.204082,0.395349,0.333333,0.0,0.0,1.0,0.415525,0.0,1.0,...,1.0,0.508475,0.333333,0.573669,0.249247,0.121738,0.100189,0.168071,0.092259,0.021424
1,0.5,0.204082,0.186047,0.666667,0.0,0.0,0.0,0.200913,0.258065,0.0,...,0.597938,0.864407,0.416667,0.328291,0.176958,0.675045,0.889463,0.553864,0.554173,0.717458
2,0.0,0.204082,0.360465,1.0,1.0,0.0,1.0,0.406393,0.0,1.0,...,0.175258,0.186441,0.0,0.489076,0.54882,0.877981,0.910203,0.842428,0.807424,0.956777
3,0.0,0.756757,0.674419,1.0,0.0,0.0,0.0,0.221461,0.0,1.0,...,1.0,1.0,0.166667,0.89591,0.094252,0.086977,0.154261,0.085671,0.062726,0.05291
4,1.0,0.756757,0.976744,0.0,0.0,0.0,1.0,0.328767,0.677419,1.0,...,0.020619,0.20339,0.0,0.80056,0.314194,0.12144,0.134719,0.339267,0.183188,0.12786


In [59]:
model_name = 'lr_1'

# Hyper-parameters of the logistic regression fitted on the base features plus
# the stacked model predictions.
params = {
    # Pinned explicitly: the recorded run printed "[LibLinear]", i.e. it used
    # the liblinear solver, but newer scikit-learn releases default to lbfgs.
    # Without this the notebook gives different results depending on the
    # installed version.
    'solver': 'liblinear',
    'penalty': 'l2',
    'C': 1,
    'tol': 0.001,
    'max_iter': 10000,
    'n_jobs': 1,
    'verbose': 5,
    'random_state': 42,
}

lr_model = LogisticRegression(**params)

In [60]:
# 3-fold stratified split, shuffled with a fixed seed so folds are repeatable.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Row labels of the training matrix; handed to skf.split() as the sample axis.
train_ids = X_train.index

In [61]:
# Label encoder applied to the validation labels of each fold
# (library defaults spelled out: negative class -> 0, positive class -> 1).
lb = LabelBinarizer(neg_label=0, pos_label=1)

In [62]:
# Cross-validated training of `lr_model`.
#
# For every fold produced by `skf` the model is re-fitted on the fold's
# training rows, its positive-class probabilities for X_test are accumulated
# into `y_preds`, and its log loss on the fold's validation rows is accumulated
# into `log_loss_score`. Both accumulators are averaged over the folds.
n_folds = skf.get_n_splits()
y_preds = np.zeros(X_test.shape[0])
log_loss_score = 0

for fold, (train_index, test_index) in enumerate(skf.split(train_ids, y_train), start=1):
    print('Fold {}\n'.format(fold))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]

    # y_train is a one-column DataFrame; scikit-learn expects a 1-D target and
    # otherwise emits a DataConversionWarning on every fit, so flatten it.
    y_fit = y_train.iloc[train_index].values.ravel()
    y_val = lb.fit_transform(y_train.iloc[test_index].values.ravel())

    lr_model.fit(X_fit, y_fit)

    # Column 1 of predict_proba is the probability of the positive class.
    y_preds += lr_model.predict_proba(X_test)[:, 1]
    log_loss_score += log_loss(y_val, lr_model.predict_proba(X_val)[:, 1])

# Average the per-fold test predictions.
y_preds = y_preds / n_folds

print('\n\nBEST LOG_LOSS SCORE MEAN:', log_loss_score / n_folds)

Fold 1

[LibLinear]Fold 2

[LibLinear]Fold 3

[LibLinear]

BEST LOG_LOSS SCORE MEAN: 0.4487058914449003


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [63]:
# Peek at the first five averaged test predictions.
y_preds[0:5]

array([0.30375904, 0.08114187, 0.82467503, 0.14627415, 0.82615762])

In [65]:
# Submission table: one row per test patient with its averaged prediction.
patient_ids = test['patient_id']
sub = pd.DataFrame({
    'patient_id': patient_ids,
    'heart_disease_present': y_preds,
})

In [66]:
# Write the submission file without the DataFrame index column.
submission_path = '../submissions/stacking_1.csv'
sub.to_csv(submission_path, index=False)