## Findings

* 7 Models Finished Rank 2
* 6 were XG Boost w/ Weather Data
* 1 was an ensemble ((y_hat_xgb^1 + y_hat_xgb^2 + y_hat_xgb^3 +y_hat_xgb^4 )/4 ) * .75 + events * .25
* Most submissions were rank 4, including a plain Logistic Regression and the top public leaderboard submission
* Trust your Cross Validation

A blog post on Medium describes how you can use powers to improve ROC in ensembles:
[Reaching the depths of (power/geometric) ensembling when targeting the AUC metric](https://medium.com/data-design/reaching-the-depths-of-power-geometric-ensembling-when-targeting-the-auc-metric-2f356ea3250e)

In [0]:
# Display the uploaded weather figure.
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# `eider` is provided by the notebook environment; presumably this copies the
# uploaded file to the given local path -- TODO confirm.
eider.env.getUploaded("weather.png", "/tmp/weather.png")
img=mpimg.imread('/tmp/weather.png')

# Large canvas so the image stays legible.
plt.figure(figsize=(20, 15))
imgplot = plt.imshow(img)
plt.show()

In [0]:
# Display the uploaded xgboost figure (relies on `plt` / `mpimg` imported in
# the previous cell; no explicit figure size is set here).
eider.env.getUploaded("xgboost.jpg", "/tmp/xgboost.jpg")
img=mpimg.imread('/tmp/xgboost.jpg')
imgplot = plt.imshow(img)
plt.show()

In [0]:
############################################################
## Download Data from S3
############################################################

# Each call asks the notebook environment for one uploaded file and names the
# local /tmp path it should be available at afterwards.

## Training Data
eider.env.getUploaded("WFS_Training.csv", "/tmp/WFS_Training.csv")

## Test Data
eider.env.getUploaded("WFS_TestFeatures.csv", "/tmp/WFS_TestFeatures.csv")

## Weather Data
eider.env.getUploaded("weather.csv", "/tmp/weather.csv")

###############################################################################
## Imports
###############################################################################
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

## Models
#from catboost import CatBoostClassifier ## Not in Eider =(
#import lightgbm as lgb ## Not in Eider =(
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

## Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

## processing
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.decomposition import PCA

## Other
from datetime import datetime
import os
import pickle
from sklearn.model_selection import KFold

###############################################################################
## settings, functions and globals
###############################################################################
# Show every column when a DataFrame is printed.
pd.set_option('display.max_columns', None)

# Directory that holds the downloaded inputs and receives all generated
# outputs.  It must be absolute: the download step writes to "/tmp/...", and
# the former relative value 'tmp/' only resolved to the same place when the
# working directory happened to be '/'.
PATH = '/tmp/'

# Name of the label column that the models predict.
target = 'nhenoshow_flag'

def plot_category_percent_of_target(data, col):
    """Plot the mean of the global ``target`` per value of ``col``.

    Only the 20 most frequent values of ``col`` are drawn, most frequent
    first, as horizontal bars.  Every bar is annotated with the number of
    rows in ``data`` that carry that value.

    Args:
        data: DataFrame containing ``col`` and the global ``target`` column.
        col: name of the categorical column to summarise.

    Returns:
        None. The figure is shown and then closed.
    """
    fig, ax = plt.subplots(1, 1, figsize=(15, 10))

    # Mean target for each category value.
    summary = data[[col, target]].groupby(col, as_index=False).mean()

    # Row count for each category value, as a frame with columns [col, 'count'].
    frequency = data[col].value_counts().reset_index(drop=False)
    frequency.columns = [col, 'count']

    summary = summary.merge(frequency, on=col, how='left')
    summary[target] = summary[target].fillna(0)

    # Keep the 20 most frequent categories.
    summary = summary.sort_values(by='count', ascending=False).head(20)

    sns.barplot(ax=ax, x=target, y=col, data=summary, order=summary[col])

    # Write the row count at the end of each bar.
    row_counts = summary['count'].values
    for idx, bar in enumerate(ax.patches):
        anchor = (bar.get_width(), bar.get_y()+0.5)
        ax.annotate('{}'.format(row_counts[idx]), anchor, fontsize=20)

    plt.xlabel('% of ' + target + '(target)')
    plt.ylabel(col)
    #plt.savefig(PATH + 'category_percent.' + str(f) + '.jpg')
    plt.show()
    plt.close()

def fix_shift_days_week(df):
    """Normalise the free-text ``shift_days_of_week`` column of ``df`` in place.

    After the call every value is a lower-case string without spaces in which
    full day names are abbreviated to three letters, slashes are turned into
    commas and the known day ranges (e.g. ``mon-fri``) are expanded into an
    explicit comma-separated list of days.  Missing values become ``''``.

    Args:
        df: DataFrame with a ``shift_days_of_week`` column; modified in place.

    Returns:
        None.
    """
    col = 'shift_days_of_week'

    # Missing -> empty string, then drop spaces and lower-case everything.
    df.loc[df[col].isnull(), col] = ''
    df[col] = df[col].str.replace(' ', '')
    df[col] = df[col].str.lower()

    # Long day names -> three-letter codes.  'tues' is handled last so that
    # a left-over 'tues' (not part of 'tuesday') is shortened as well.
    abbreviations = (
        ('thursday',  'thu'),
        ('friday',    'fri'),
        ('saturday',  'sat'),
        ('sunday',    'sun'),
        ('monday',    'mon'),
        ('tuesday',   'tue'),
        ('wednesday', 'wed'),
        ('tues',      'tue'),
    )
    for long_form, short_form in abbreviations:
        df[col] = df[col].str.replace(long_form, short_form)

    ## remove slashes
    # Whole-value shorthands that need an explicit translation first.
    shorthands = (
        ('m/tu/th/f',    'mon,tue,thu,fri'),
        ('w/th/f/su',    'wed,thu,fri,sun'),
        ('m/t/th/f',     'mon,tue,thu,fri'),
        ('m/tu/th/f/sa', 'mon,tue,thu,fri,sat'),
        ('mon-wed/f/sa', 'mon,tue,wed,fri,sat'),
    )
    for shorthand, spelled_out in shorthands:
        df.loc[df[col] == shorthand, col] = spelled_out
    df[col] = df[col].str.replace('/', ',')

    ## remove dashes
    # Day ranges and their expansions, applied in this order.  (The original
    # listed 'mon-wed' twice; the second pass could never match anything
    # because no expansion contains a dash.)
    day_ranges = (
        ('mon-wed', 'mon,tue,wed'),
        ('mon-fri', 'mon,tue,wed,thu,fri'),
        ('mon-thu', 'mon,tue,wed,thu'),
        ('tue-sat', 'tue,wed,thu,fri,sat'),
        ('tue-fri', 'tue,wed,thu,fri'),
        ('tue-thu', 'tue,wed,thu'),
        ('tue-wed', 'tue,wed'),
        ('wed-sun', 'wed,thu,fri,sat,sun'),
        ('wed-sat', 'wed,thu,fri,sat'),
        ('thu-mon', 'thu,fri,sat,sun,mon'),
        ('thu-sun', 'thu,fri,sat,sun'),
        ('thu-sat', 'thu,fri,sat'),
        ('fri-tue', 'fri,sat,sun,mon,tue'),
        ('fri-mon', 'fri,sat,sun,mon'),
        ('fri-sun', 'fri,sat,sun'),
        ('sat-wed', 'sat,sun,mon,tue,wed'),
        ('sat-tue', 'sat,sun,mon,tue'),
        ('sat-mon', 'sat,sun,mon'),
        ('sun-thu', 'sun,mon,tue,wed,thu'),
        ('sun-wed', 'sun,mon,tue,wed'),
        ('sun-tue', 'sun,mon,tue'),
    )
    for day_range, expansion in day_ranges:
        hits = df[col].str.contains(day_range)
        df.loc[hits, col] = df[hits][col].str.replace(day_range, expansion)

###############################################################################
## Read Data
###############################################################################
# Load the training and test CSVs from PATH.
print('Reading Data...')
df_train = pd.read_csv(PATH + 'WFS_Training.csv')
# The test file is decoded as ISO-8859-1 instead of pandas' default encoding.
df_test  = pd.read_csv(PATH + 'WFS_TestFeatures.csv', encoding = "ISO-8859-1")

## Shuffle
# sample(frac=1) returns all rows in random order; the fixed seed makes the
# order reproducible.  The index is then renumbered 0..len-1.
df_train = df_train.sample(frac=1, random_state=2019)
df_train.reset_index(drop=True, inplace=True)

print(' Mean target:', np.mean(df_train[target]))
print(" Done", datetime.now())

###############################################################################
## Combine Data & FE
###############################################################################
# Stack train and test into one frame `df` so feature engineering is applied
# to both.  The `train` column tells the rows apart: -1 = test, 0/1 = train.
print('FE...')

# Test rows get a placeholder target of -1.
df_test[target]   = -1
df_test['train']  = -1
df_train['train'] = 0
## For doing Single Fold Experimentation
n = int(df_train.shape[0] * .8)
# NOTE: .loc slicing is label-based and inclusive, so rows 0..n (n + 1 rows)
# are flagged with 1.
df_train.loc[0:n, 'train'] = 1

print(' Combine...')
# Align the test columns (and their order) with the training frame.
df_test = df_test[df_train.columns]
df_test.reset_index(drop=True, inplace=True)

# Training rows come first, test rows follow.
df = pd.concat([df_train, df_test], axis=0)
df.reset_index(drop=True, inplace=True)

# Join the daily weather table onto every row via the appointment date.
print(' Add weather...')
weather = pd.read_csv(PATH +'weather.csv')

# Both measurement columns can contain the marker 'T' (presumably "trace
# amount" -- TODO confirm); it is treated as 0 so the column can be numeric.
weather.loc[weather.prec=='T','prec'] = 0
weather['prec'] = weather.prec.astype(float)

# Bug fix: the snow clean-up used to write its 0 into `prec` and then
# overwrote `snow` with a copy of `prec`, so the snow feature was lost.
weather.loc[weather.snow=='T','snow'] = 0
weather['snow'] = weather.snow.astype(float)

# The first 10 characters of appt_1_date are used as the join key against
# weather.date (assumes both are formatted identically, e.g. YYYY-MM-DD --
# TODO confirm).
df['date'] = df.appt_1_date.str[0:10]
df = pd.merge( df, weather, how='left', on='date')
df.reset_index(drop=True, inplace=True)

# The temporary join key is no longer needed.
df.drop('date', inplace=True, axis=1)

def _clock_to_decimal_hours(times):
    """Convert clock strings such as '7:30 AM' to decimal hours (24h clock).

    The first ``H:MM`` / ``HH:MM`` group in each string supplies hour and
    minute.  When the string contains 'AM' or 'PM' the hour is read on a
    12-hour clock (12 AM -> 0, 12 PM -> 12); otherwise it is used as is.

    Args:
        times: pandas Series of strings; may contain missing values.

    Returns:
        Float Series aligned with ``times``.  Missing values and strings
        without an ``H:MM`` group map to -1.
    """
    clock = times.str.extract(r'(\d{1,2}):(\d{2})')
    hours = clock[0].astype(float)
    minutes = clock[1].astype(float)
    meridiem = times.str.extract(r'(AM|PM)', expand=False)
    # Fold the hour into 0..11 only when an AM/PM marker is present.
    hours = hours.where(meridiem.isnull(), hours % 12)
    hours = hours + 12 * (meridiem == 'PM')
    return (hours + minutes / 60).fillna(-1)


# Bug fix: the hour used to be found by testing for the substrings '0:' ..
# '12:' and summing every hit.  '11:' also contains '1:' and '12:' also
# contains '2:', so 11 o'clock became 12 and 12 o'clock became 14; in
# addition 12 PM received a further +12.  Start and end time now share one
# parser.

# Shift Start Time
df['sst'] = _clock_to_decimal_hours(df.shift_start_time)

## Shift End Time
df['set'] = _clock_to_decimal_hours(df.shift_end_time)

## Shift Total Time
# Adding 24 before subtracting keeps overnight shifts positive; anything
# above 24 did not cross midnight and is brought back into range.
# NOTE(review): rows where a time is missing carry the -1 sentinel into this
# arithmetic (both missing -> 24).
df['stt']  = df['set'] + 24  - df['sst']
df.loc[df.stt>24,'stt'] = df[df.stt>24].stt - 24

## Date Fields
# Turn ISO-style strings ('...T...Z') into 'date time' text and parse them.
df['appt_1_date'] = df.appt_1_date.str.replace('Z', '')
df['appt_1_date'] = df.appt_1_date.str.replace('T', ' ')
df['appt_1_date'] = pd.to_datetime(df.appt_1_date,infer_datetime_format=True)
df['app_created_date'] = df.app_created_date.str.replace('Z', '')
df['app_created_date'] = df.app_created_date.str.replace('T', ' ')
df['app_created_date'] = pd.to_datetime(df.app_created_date,infer_datetime_format=True)

## Dang Eider and its out of date Pandas! >=o
# Newer pandas exposes day_name(); older releases only have weekday_name.
# Only the missing-attribute case is caught so that any other failure is
# still reported instead of being silently retried.
try:
    df['appt_1_day_of_week']    = df['appt_1_date'].dt.day_name()
except AttributeError:
    df['appt_1_day_of_week']    = df['appt_1_date'].dt.weekday_name

df['appt_1_hour']           = df['appt_1_date'].dt.hour
# NOTE(review): `.dt.week` is the old-pandas accessor used by this notebook;
# recent pandas versions no longer provide it.
df['appt_1_week']           = df['appt_1_date'].dt.week

# (created - appointment) divided by a -1 day offset, i.e. the number of
# days from application creation to the appointment.
df['app_create_to_app_1']   = (df['app_created_date']-df['appt_1_date'])/ pd.offsets.Day(-1)

# Replace shift codes the model cannot learn from by the sentinel -1:
#   narf1 - codes that occur only in train,
#   narf2 - codes that occur only in test,
#   narf3 - codes that occur at most 3 times in train.
print(' Removing shift codes...')
# Computed once instead of once per list element.
train_codes = df_train.shift_code.unique()
test_codes  = df_test.shift_code.unique()
narf1 = [f for f in train_codes if f not in test_codes]
narf2 = [f for f in test_codes if f not in train_codes]
# Filter the value_counts Series directly.  The previous
# `value_counts().reset_index()` form depended on the column names 'index' /
# 'shift_code' that only older pandas versions produce.
shift_code_counts = df_train.shift_code.value_counts()
narf3 = list(shift_code_counts[shift_code_counts <= 3].index)
df.loc[df.shift_code.isin( list(set( narf1 + narf2 + narf3 )) ), 'shift_code'] = -1

print(' Fixing shift_days_of_week...')
fix_shift_days_week(df)

## Add number of workdays
# Number of comma-separated entries; an empty string still counts as 1.
df['work_days'] = df.apply(lambda x: len(str(x.shift_days_of_week).split(',')),axis=1)

# Two mis-decoded spellings of the same education level are mapped onto one
# canonical label (with a proper apostrophe).
f='cand_education'
df.loc[df.cand_education=='AssociateÄôs / Trade School / Vocational',f] = 'Associate’s / Trade School / Vocational'
df.loc[df.cand_education=='Associateâs / Trade School / Vocational',f] = 'Associate’s / Trade School / Vocational'

# Map the textual assessment score onto an ordered numeric scale; missing
# values share the -1 of 'Ineligible'.
print(' Turn cand_assess_overall_score into ordinal...')
f = 'cand_assess_overall_score'
df.loc[df[f]=='Highest'   , f] = 1
df.loc[df[f]=='High'      , f] = .75
df.loc[df[f]=='Moderate'  , f] = .5
df.loc[df[f]=='Low'       , f] = 0
df.loc[df[f]=='Ineligible', f] =-1
df.loc[df[f].isnull()     , f] =-1
# Bug fix: this used to be astype(int), which truncates .75 and .5 to 0 and
# therefore merged 'High', 'Moderate' and 'Low' into a single level.
df[f] = df[f].astype(float)

# Day-of-year for the application creation date and the appointment date.
df['app_created_day']      = df['app_created_date'].dt.dayofyear
df['appt_1_day'     ]      = df['appt_1_date'     ].dt.dayofyear

# appt_1_day_apts: number of training rows (train != -1) with a non-null
# target that share the same appointment day-of-year.
# NOTE(review): unlike appt_1_date_apts below, missing values are not filled
# here; days absent from train stay NaN until np.nan_to_num is applied later.
narf = pd.DataFrame(df[df.train!=-1].groupby(by=['appt_1_day'])[target].count().reset_index())
narf.rename( {'nhenoshow_flag':'appt_1_day_apts'} , axis=1, inplace=True)
df = pd.merge( df, narf, how='left', on='appt_1_day')
df.reset_index(drop=True, inplace=True)

# appt_1_date_apts: same count, keyed on the exact appointment timestamp;
# timestamps that never occur in train get 0.
narf = pd.DataFrame(df[df.train!=-1].groupby(by=['appt_1_date'])[target].count().reset_index())
narf.rename( {'nhenoshow_flag':'appt_1_date_apts'} , axis=1, inplace=True)
df = pd.merge( df, narf, how='left', on='appt_1_date')
df.reset_index(drop=True, inplace=True)
df.appt_1_date_apts.fillna(0, inplace=True)

# Replace app_esl_status by indicator columns (including one for NaN).
print(' One hot encoding app_esl_status...')
df_esl = pd.get_dummies(df.app_esl_status,dummy_na=True)
# NOTE(review): the names are hard-coded, which assumes get_dummies returns
# exactly two categories plus the NaN column, in this order -- TODO confirm
# against the data.
df_esl.columns= ['app_esl_status_ESL', 'app_esl_status_NonESL', 'app_esl_status_ESLNAN']
df = pd.concat( [df, df_esl], axis=1)
df.drop('app_esl_status', inplace=True, axis=1)
df.reset_index(drop=True, inplace=True)

# Shorten the schedule-type labels, send missing values to 'Other', then
# replace the column by indicator columns named shift_schedule_type_<label>.
print(' One hot encoding shift_schedule_type...')
f = 'shift_schedule_type'
df.loc[df[f]=='Flex Time (<19 hours)'     , f] = 'flex'
df.loc[df[f]=='Full-Time'                 , f] = 'full'
df.loc[df[f]=='Part-Time (20-29 hours)'   , f] = 'part'
df.loc[df[f]=='Reduced Time (30-39 hours)', f] = 'reduced'
df.loc[df[f].isnull()     , f] = 'Other'
df_sctype = pd.get_dummies(df.shift_schedule_type,dummy_na=False)
df_sctype.columns= ['shift_schedule_type_' + str(f) for f in list(df_sctype.columns)]
df = pd.concat( [df, df_sctype], axis=1)
df.drop('shift_schedule_type', inplace=True, axis=1)
df.reset_index(drop=True, inplace=True)

# Strip the 'App-' prefix and expose single characters of the remaining id
# as integer features app_id_1 .. app_id_7.
print(' Adding App-ID prefix...')
df['app_id'] = df.app_id.str.replace('App-','')
# NOTE(review): the range starts at 1, so the character at position 0 is not
# used; the cast assumes every id has at least 8 digit characters -- TODO
# confirm both are intended.
for i in range(1,8):
    df['app_id_' + str(i)] = df.app_id.str[i].astype(int)

# Replace shift_startday by indicator columns named shift_startday_<value>;
# missing values get no column of their own.
print(' One hot encoding shift_startday...')
df_shift_startday = pd.get_dummies(df.shift_startday,dummy_na=False)
df_shift_startday.columns= ['shift_startday_' + str(f) for f in list(df_shift_startday.columns)]
df = pd.concat( [df, df_shift_startday], axis=1)
df.drop('shift_startday', inplace=True, axis=1)
df.reset_index(drop=True, inplace=True)

# Names of all columns that are object, datetime or timedelta typed at this
# point; they are excluded from the numeric feature lists later on.
non_number_columns = df.dtypes[(df.dtypes == object) | (df.dtypes=='datetime64[ns]') | (df.dtypes=='timedelta64[ns]') ].index.values

# Print the number of distinct values per non-numeric column.
for f in non_number_columns:
    print(f, df[f].value_counts().shape)

# events = 1 / (number of rows, train and test together, that share the same
# candidate, creation timestamp and appointment timestamp); 1 means the
# combination is unique.
print(' Adding Events...')
dtemp = df.groupby(['cand_id','app_created_date','appt_1_date']).app_id.count().reset_index()
dtemp.rename(  {'app_id':'events'}, axis=1, inplace=True)
dtemp['events'] = 1/dtemp.events
df = pd.merge(df, dtemp, how='left', on=['cand_id','app_created_date','appt_1_date'])
df.reset_index(drop=True, inplace=True)

# appt_1_date_mean = mean target of all training rows with the same
# appointment timestamp.
# NOTE(review): the mean is taken over every training row, including the
# rows that later serve as validation folds, so cross-validation scores that
# use this feature can be optimistic (target leakage).
print(' Adding appt_1_date mean ...')
dtemp2 = df[df.train.isin([0,1])].groupby(['appt_1_date'])[target].mean().reset_index()
dtemp2.rename(  {'nhenoshow_flag':'appt_1_date_mean'}, axis=1, inplace=True)
df = pd.merge(df, dtemp2, how='left', on=['appt_1_date'])
df.reset_index(drop=True, inplace=True)


print(" Done", datetime.now())

# Numeric Feature Findings
* d_dep looks like it should be helpful
* events should be super helpful
* appt_1_date_mean also looks super helpful

# Categorical Feature Findings
* the events feature above was not discovered on its own, it was discovered by looking at the categorical feature plots
* cand_id: notice the plot. It is sorted with the high values at the top, and these all have very low target averages, yet the overall average is .78 and the values are mostly '1's
* Mean(target) with events == 1 is .996 while the Mean(target) events < 1 = .41!

In [0]:
###############################################################################
## Some Analysis
###############################################################################
# Candidate model features: every column except identifiers, the split flag,
# the target, the raw time strings and the day-of-year helpers.
features = [f for f in df.columns if f not in 
            ['ID', 'train',target,'app_id','cand_id','shift_end_time','shift_start_time'
            ,'appt_1_day', 'app_created_day'
            ,'app_created_hour']]

# Numeric-only feature lists (features_lr and features_lr_pca are built by
# the same expression); features_lr_no_w additionally drops every column
# that comes from the weather table.
features_lr =     [f for f in features if f not in non_number_columns if f not in ['app_create_to_app_1']]
features_lr_pca = [f for f in features if f not in non_number_columns if f not in ['app_create_to_app_1']]
features_lr_no_w = [f for f in features_lr if f not in weather.columns]
number_columns = [f for f in features_lr if f not in non_number_columns]

# For every numeric feature with more than 3 distinct values, overlay the
# distribution for target == 1 (blue) and target == 0 (red) on training rows.
for f in number_columns:
    if df[df.train>=0][f].value_counts(dropna=False).shape[0]>3:
        df_temp = df[df.train>=0][[f,target]].fillna(0)
        sns.distplot(df_temp[df_temp[target]==1][f],color='blue')
        sns.distplot(df_temp[df_temp[target]==0][f],color='red')
        #plt.savefig(PATH + 'histogram.' + str(f) + '.jpg')
        plt.show()
        plt.close()

# Mean-target bar plot for every non-numeric column (training rows only).
for f in non_number_columns:
    plot_category_percent_of_target(df[df.train>=0], f)

# Compare the mean target of unique events against repeated ones.
print('Mean Target Events = 1: ', np.mean(df[  (df.train>=0) &(df.events==1) ][target]))
print('Mean Target Events < 1: ', np.mean(df[  (df.train>=0) &(df.events<1) ][target]))

In [0]:
###############################################################################
## Finish UP FE
###############################################################################

# Replace every non-numeric column by int32 codes.  Values are converted to
# strings first, so missing values and timestamps are encoded as well; the
# encoder sees train and test rows together.
print(' Label Encoding non number columns...')
for column in non_number_columns:
    print('  ' + column)
    encoder = LabelEncoder().fit(df[column].astype(str))
    df[column] = encoder.transform(df[column].astype(str)).astype(np.int32)

print('FE done.')

###############################################################################
## Parameters
###############################################################################

# PCA is fitted on the NaN-filled numeric features of train and test
# together.  NOTE(review): no use of `pca` is visible in this part of the
# notebook.
n_components = 32
pca = PCA(n_components=n_components)
pca.fit(  np.nan_to_num(  df[ features_lr_pca ].values  )  ) 

# CatBoost settings (the catboost import is commented out above).
params_cat = {}
params_cat['loss_function'] = 'MultiClass'
params_cat['random_seed'] =   2019
params_cat['classes_count'] = 2
params_cat['l2_leaf_reg']   = 3
params_cat['depth']         = 8
params_cat['learning_rate'] = 0.05
params_cat['iterations']    = 250
params_cat['verbose'] = False

# LightGBM settings (the lightgbm import is commented out above).
params_lgb = {}
params_lgb['objective']        = 'multiclass'
params_lgb['max_depth']        = 7
params_lgb['num_leaves']       = 32
params_lgb['feature_fraction'] = 0.95
params_lgb['bagging_fraction'] = 0.8
params_lgb['bagging_freq']     = 1
params_lgb['learning_rate']    = 0.05
params_lgb['verbosity']        = 2
params_lgb['verbose']          = 2
params_lgb['num_class']        = 2
params_lgb['lambda']          = 0.1
params_lgb['alpha']           = 0.1
params_lgb['random_state']    = 2019

# Model matrices: NaN -> 0, then each column standardised.
# NOTE(review): train and test are standardised independently of each other,
# so the same raw value can map to different scaled values in the two sets.
X_train = preprocessing.scale(  np.nan_to_num( df[df.train >= 0][ features_lr ].values )  )
Y_train = df[df.train >= 0][ target   ].values
X_test  = preprocessing.scale(  np.nan_to_num( df[df.train ==-1][ features_lr ].values )  )


# Used below as the number of cross-validation folds.
EPOCHS  = 5

# Same matrices without the weather columns.
X_train2 = preprocessing.scale(  np.nan_to_num( df[df.train >= 0][ features_lr_no_w ].values )  )
X_test2  = preprocessing.scale(  np.nan_to_num( df[df.train ==-1][ features_lr_no_w ].values )  )

In [0]:
###############################################################################
## Base Model
###############################################################################
# Baseline: use the raw `events` value as the score, with missing values
# replaced by the mean of `events`, and report AUC, accuracy at a 0.5
# threshold and the confusion matrix on the training rows.
baseline = df[df.train>=0][['events',target]].copy()
baseline.fillna(np.mean(baseline.events), inplace=True)
print('\n  Baseline: ', 'AUC:', roc_auc_score( Y_train, baseline.events), 'ACC:', accuracy_score(Y_train, ( baseline.events > 0.5  ).astype(int) ))
print( confusion_matrix(Y_train, (  baseline.events > 0.5  ).astype(int)) )

In [0]:
###############################################################################
## Logistic Regression
###############################################################################
# K-fold cross-validated logistic regression.  Produces out-of-fold
# predictions for the training rows (y_oof_lr) and fold-averaged predictions
# for the test rows (y_hat_lr), then refits on all training rows.
print('\n Logistic Regression...')
y_hat_lr = np.zeros(X_test.shape[0])
y_oof_lr = np.zeros(X_train.shape[0])
fold      = 1
kf      = KFold(n_splits = EPOCHS, shuffle = True, random_state=2019)

for tr_idx, val_idx in kf.split(X_train, Y_train):
    X_tr, X_vl = X_train[tr_idx], X_train[val_idx, :]
    y_tr, y_vl = Y_train[tr_idx], Y_train[val_idx]
    model_lr = LogisticRegression(random_state = 2019, C=1,tol=.0001).fit(X_tr, y_tr)
 
    # Probability of class 1 for the held-out rows of this fold.
    y_pred_train = model_lr.predict_proba(X_vl)[:,1]
    y_oof_lr[val_idx] = y_pred_train
    
    # y_zero is the accuracy of always predicting the majority class;
    # LIFT is the accuracy gain over that, in percentage points.
    y_zero = max(np.mean(y_vl), 1-np.mean(y_vl))
    ACC    = accuracy_score(y_vl, (y_pred_train > 0.5  ).astype(int) )
    AUC    = roc_auc_score( y_vl, y_pred_train)
    LIFT   = ( ACC - y_zero )*100
    print('  LR: ', 'AUC:', AUC, 'ACC:', ACC, 'LIFT:', LIFT)

    # Each fold contributes 1/EPOCHS of the final test prediction.
    y_hat_lr+= model_lr.predict_proba(X_test)[:,1] / EPOCHS
    fold+=1

# Metrics over all out-of-fold predictions.
print('  LR AVG: ', 'AUC:', roc_auc_score( Y_train, y_oof_lr), 'ACC:', accuracy_score(Y_train, (y_oof_lr > 0.5  ).astype(int) ))
print( confusion_matrix(Y_train, (  y_oof_lr > 0.5  ).astype(int)) )

# Submission file from the fold-averaged predictions.
df_test[target]= y_hat_lr
df_test[['ID',target]].to_csv(PATH + 'sub.lr.' + 'folds' + str(EPOCHS) + '.csv', index = False, float_format = '%.4f')

print('  ', np.mean(y_hat_lr))

# Second submission from a single model fitted on all training rows.
model_lr_full = LogisticRegression(random_state = 2019, C=1,tol=.0001).fit(X_train, Y_train)
y_hat_lr_full = model_lr_full.predict_proba(X_test)[:,1]

df_test[target]= y_hat_lr_full
df_test[['ID',target]].to_csv(PATH + 'sub.lr.' + 'full' + '.csv', index = False, float_format = '%.4f')

print('  ', np.mean(y_hat_lr_full))

# Coefficients of the full model, plotted largest first.
feature_imp_lr = pd.DataFrame(sorted(zip(model_lr_full.coef_[0],features_lr)), columns=['Value','Feature'])
plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data= feature_imp_lr.sort_values(by="Value", ascending=False))
plt.title('Logistic Regression Feature Importance')
plt.tight_layout()
plt.show()

In [0]:
# K-fold cross-validated XGBoost.  Produces out-of-fold predictions for the
# training rows (y_oof_xgb) and fold-averaged predictions for the test rows
# (y_hat_xgb).  Trained fold models are cached on disk as pickles.
print('----------------------------------------------------------------------')
from sklearn.model_selection import KFold

print('Final Models...')
print(' xgbboost...')
X_train = preprocessing.scale(  np.nan_to_num( df[df.train >= 0][ features_lr ].values )  )
Y_train = df[df.train >= 0][ target   ].values
X_test  = preprocessing.scale(  np.nan_to_num( df[df.train ==-1][ features_lr ].values )  )
EPOCHS  = 5

params_xgb = {}
params_xgb['eta']         = .037
params_xgb['seed']        = 2019
params_xgb['objective']   = 'multi:softprob'
params_xgb['num_class']   = 2
params_xgb['max_depth']   = 6
params_xgb['eval_metric'] = 'mlogloss'

# Number of boosting rounds.
nbr = 90

y_hat_xgb = np.zeros(X_test.shape[0])
y_oof_xgb = np.zeros(X_train.shape[0])
fold      = 1
kf      = KFold(n_splits = EPOCHS, shuffle = True, random_state=2019)

# Hyper-parameters flattened into a string for the cache file name
# (':' is removed because 'multi:softprob' would put it into the name).
kvs = '.'.join([str(k) + '=' + str(v).replace(':','') for k, v in params_xgb.items()])
kvs += 'nbr=' + str(nbr)
for tr_idx, val_idx in kf.split(X_train, Y_train):
    postfix = 'epochs=' + str(EPOCHS) + 'fold=' + str(fold) + kvs
    filename = PATH + '/model_xgb.' + postfix + '.pkl'
    X_tr, X_vl = X_train[tr_idx], X_train[val_idx, :]
    y_tr, y_vl = Y_train[tr_idx], Y_train[val_idx]
    if os.path.isfile(filename):
        # NOTE(review): the cache key contains only the hyper-parameters and
        # the fold number, not the feature set, so a model trained on older
        # features is reused silently.  Unpickling also executes code from
        # the file -- only load caches written by this notebook.
        with open(filename, 'rb') as model_file:
            model_xgb = pickle.load(model_file)
    else:
        model_xgb = xgb.train(params_xgb, xgb.DMatrix(X_tr, label=y_tr), num_boost_round=nbr)
        # `with` closes the file handle, which the bare open() calls used
        # before never did.
        with open(filename, 'wb') as model_file:
            pickle.dump(model_xgb, model_file)
    # softprob returns one column per class; column 1 is the probability of
    # class 1.
    y_pred_train = model_xgb.predict( xgb.DMatrix(X_vl, label=y_vl) )[:,1]
    y_oof_xgb[val_idx] = y_pred_train

    # y_zero is the accuracy of always predicting the majority class;
    # LIFT is the accuracy gain over that, in percentage points.
    y_zero = max(np.mean(y_vl), 1-np.mean(y_vl))
    ACC    = accuracy_score(y_vl, (y_pred_train > 0.5  ).astype(int) )
    AUC    = roc_auc_score( y_vl, y_pred_train)
    LIFT   = ( ACC - y_zero )*100
    print('  xgb: ', 'AUC:', AUC, 'ACC:', ACC, 'LIFT:', LIFT)

    # Each fold contributes 1/EPOCHS of the final test prediction.
    y_hat_xgb+= model_xgb.predict( xgb.DMatrix(X_test ))[:,1] / EPOCHS
    fold+=1

print('  xgb AVG: ', 'AUC:', roc_auc_score( Y_train, y_oof_xgb), 'ACC:', accuracy_score(Y_train, (y_oof_xgb > 0.5  ).astype(int) ))

In [0]:
# Feature scores of `model_xgb`.  get_fscore() keys features as 'f0', 'f1',
# ... by column position and omits some features; those get a score of 0.
# NOTE(review): if the cells are run in order, `model_xgb` is the model of
# the last CV fold only, so the values are not averaged over folds despite
# the plot title.
d = model_xgb.get_fscore()
l = [  d['f' + str(i)] if 'f' + str(i) in d else 0  for i in range( len(features_lr)) ]
feature_imp_xgb = pd.DataFrame(sorted(zip(l,features_lr)), columns=['Value','Feature'])

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp_xgb.sort_values(by="Value", ascending=False))
plt.title('XGB Features (avg over folds)')
plt.tight_layout()
plt.show()

In [0]:
# Persist the out-of-fold predictions for the training rows.
# NOTE(review): the file is written without an extension and with the
# DataFrame index as an extra column.
df_train['y_oof_xgb'] = y_oof_xgb
df_train[['ID','y_oof_xgb']].to_csv(PATH + 'y_hat_xgbw_train')

# Submission from the fold-averaged test predictions.  `postfix` still holds
# the value set in the last CV iteration, so the file name carries that
# fold's number.
df_test[target]= y_hat_xgb
df_test[['ID',target]].to_csv(PATH + 'sub.xgbw.' + postfix + '.csv', index = False, float_format = '%.4f')
print('  ', np.mean(y_hat_xgb))

# Refit on all training rows; this overwrites model_xgb and y_hat_xgb.
model_xgb = xgb.train(params_xgb, xgb.DMatrix(X_train, label=Y_train), num_boost_round=nbr)
y_hat_xgb = model_xgb.predict( xgb.DMatrix(X_test))[:,1]

# Submission from the single full-data model.
df_test[target]= y_hat_xgb
df_test[['ID',target]].to_csv(PATH + 'sub.xgbw.' + 'full' + postfix + '.csv', index = False, float_format = '%.4f')
print('  ', np.mean(y_hat_xgb), params_xgb)

In [0]:
# Feature scores of the current `model_xgb` (the full-data model when the
# cells are run in order).  Features missing from get_fscore() get 0.
d = model_xgb.get_fscore()
l = [  d['f' + str(i)] if 'f' + str(i) in d else 0  for i in range( len(features_lr)) ]
feature_imp_xgb = pd.DataFrame(sorted(zip(l,features_lr)), columns=['Value','Feature'])

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp_xgb.sort_values(by="Value", ascending=False))
plt.title('XGB Feature Importance')
plt.tight_layout()
plt.show()

In [0]:
from sklearn.metrics import roc_curve

# Overlay the out-of-fold ROC curves of XGBoost and Logistic Regression,
# with each model's OOF AUC (rounded to 4 decimals) in the legend.
auc_xgb = round(roc_auc_score(Y_train, y_oof_xgb), 4)
fpr_xgb, tpr_xgb, _ = roc_curve(Y_train, y_oof_xgb)
plt.plot(fpr_xgb, tpr_xgb, label="XG Boost                , auc=" + str(auc_xgb))

auc_lr = round(roc_auc_score(Y_train, y_oof_lr), 4)
fpr_lr, tpr_lr, _ = roc_curve(Y_train, y_oof_lr)
plt.plot(fpr_lr, tpr_lr, label="Logistic Regression, auc=" + str(auc_lr))

# loc=4 places the legend in the lower-right corner.
plt.legend(loc=4)
plt.show()

In [0]:
# Columns fed to the scale-sensitive models (Naive Bayes and the MLPs).
features_mpa = [
    'cand_assess_overall_score', 'commute_distance', 'shift_duration', 'shift_start_time_group',
    'sst', 'set', 'stt', 'appt_1_hour', 'appt_1_week', 'app_create_to_app_1',
    'work_days', 'appt_1_day_apts', 'appt_1_date_apts',
    'app_esl_status_ESL', 'app_esl_status_NonESL', 'app_esl_status_ESLNAN',
    'shift_schedule_type_Other', 'shift_schedule_type_flex', 'shift_schedule_type_full',
    'shift_schedule_type_part', 'shift_schedule_type_reduced',
    'app_id_1', 'app_id_2', 'app_id_3', 'app_id_4', 'app_id_5', 'app_id_6', 'app_id_7',
    'shift_startday_Fri', 'shift_startday_Mon', 'shift_startday_Sat', 'shift_startday_Sun',
    'shift_startday_Thu', 'shift_startday_Tue', 'shift_startday_Wed', 'shift_startday_You',
    'events',
]

# Rows with train >= 0 are the training set, rows with train == -1 the test
# set. NaNs are replaced by 0 before scaling.
train_raw3 = np.nan_to_num(df[df.train >= 0][features_mpa].values)
test_raw3 = np.nan_to_num(df[df.train == -1][features_mpa].values)

# Standardise both matrices with the mean/std learned from the TRAINING rows.
# Scaling train and test independently (as preprocessing.scale on each did)
# maps the same raw value to different inputs at fit and predict time.
scaler_mpa = preprocessing.StandardScaler().fit(train_raw3)
X_train3 = scaler_mpa.transform(train_raw3)
X_test3 = scaler_mpa.transform(test_raw3)

In [0]:
###############################################################################
## NB
###############################################################################
# Gaussian Naive Bayes on the scaled matrix X_train3 / X_test3:
#   1. K-fold CV (EPOCHS splits) -> out-of-fold predictions + per-fold metrics,
#      with the test predictions averaged over the fold models;
#   2. a second model fit on all training rows for the "full" submission.
print('\n Naive Bayes...')
n_test_rows, n_train_rows = X_test3.shape[0], X_train3.shape[0]
y_hat_nb = np.zeros(n_test_rows)
y_oof_nb = np.zeros(n_train_rows)
kf   = KFold(n_splits=EPOCHS, shuffle=True, random_state=2019)
fold = 1
for fit_rows, held_rows in kf.split(X_train3, Y_train):
    model_nb = GaussianNB().fit(X_train3[fit_rows], Y_train[fit_rows])

    # Positive-class probability for the held-out rows of this fold.
    held_truth = Y_train[held_rows]
    held_proba = model_nb.predict_proba(X_train3[held_rows, :])[:, 1]
    y_oof_nb[held_rows] = held_proba

    # LIFT = accuracy gain (in points) over always predicting the majority class.
    pos_rate  = np.mean(held_truth)
    base_acc  = max(pos_rate, 1 - pos_rate)
    fold_acc  = accuracy_score(held_truth, (held_proba > 0.5).astype(int))
    fold_auc  = roc_auc_score(held_truth, held_proba)
    fold_lift = (fold_acc - base_acc) * 100
    print('  NB: ', 'AUC:', fold_auc, 'ACC:', fold_acc, 'LIFT:', fold_lift)

    # Each fold model contributes 1/EPOCHS of the averaged test prediction.
    y_hat_nb += model_nb.predict_proba(X_test3)[:, 1] / EPOCHS
    fold += 1

# Metrics over the complete out-of-fold vector.
oof_labels_nb = (y_oof_nb > 0.5).astype(int)
print('  NB AVG: ', 'AUC:', roc_auc_score(Y_train, y_oof_nb), 'ACC:', accuracy_score(Y_train, oof_labels_nb))
print(confusion_matrix(Y_train, oof_labels_nb))

# Submission from the fold-averaged predictions.
df_test[target] = y_hat_nb
nb_folds_csv = PATH + 'sub.nb.' + 'folds' + str(EPOCHS) + '.csv'
df_test[['ID', target]].to_csv(nb_folds_csv, index=False, float_format='%.4f')

print('  ', np.mean(y_hat_nb))

# Submission from a single model trained on every training row.
model_nb_full = GaussianNB().fit(X_train3, Y_train)
y_hat_nb_full = model_nb_full.predict_proba(X_test3)[:, 1]

df_test[target] = y_hat_nb_full
nb_full_csv = PATH + 'sub.nb.' + 'full' + '.csv'
df_test[['ID', target]].to_csv(nb_full_csv, index=False, float_format='%.4f')

print('  ', np.mean(y_hat_nb_full))

In [0]:
###############################################################################
## MLP
###############################################################################
# Small multi-layer perceptron (L-BFGS solver, hidden layers of 5 and 2 units)
# on the scaled matrix: K-fold OOF predictions with fold-averaged test
# predictions, followed by a refit on all training rows.

print('\n MLP...')
# One settings dict shared by the fold models and the full refit.
mlp_lbfgs_settings = dict(solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(5, 2), random_state=2019)

y_hat_mlp = np.zeros(X_test3.shape[0])
y_oof_mlp = np.zeros(X_train3.shape[0])
kf = KFold(n_splits=EPOCHS, shuffle=True, random_state=2019)

fold = 1
for fit_rows, held_rows in kf.split(X_train3, Y_train):
    fit_X, fit_y = X_train3[fit_rows], Y_train[fit_rows]
    held_X, held_y = X_train3[held_rows, :], Y_train[held_rows]

    model_mlp = MLPClassifier(**mlp_lbfgs_settings).fit(fit_X, fit_y)
    held_proba = model_mlp.predict_proba(held_X)[:, 1]
    y_oof_mlp[held_rows] = held_proba

    # LIFT = accuracy gain (in points) over always predicting the majority class.
    pos_rate  = np.mean(held_y)
    base_acc  = max(pos_rate, 1 - pos_rate)
    fold_acc  = accuracy_score(held_y, (held_proba > 0.5).astype(int))
    fold_auc  = roc_auc_score(held_y, held_proba)
    fold_lift = (fold_acc - base_acc) * 100
    print('  MLP: ', 'AUC:', fold_auc, 'ACC:', fold_acc, 'LIFT:', fold_lift)

    # Each fold model contributes 1/EPOCHS of the averaged test prediction.
    y_hat_mlp += model_mlp.predict_proba(X_test3)[:, 1] / EPOCHS
    fold += 1

# Metrics over the complete out-of-fold vector.
oof_labels_mlp = (y_oof_mlp > 0.5).astype(int)
print('  MLP AVG: ', 'AUC:', roc_auc_score(Y_train, y_oof_mlp), 'ACC:', accuracy_score(Y_train, oof_labels_mlp))
print(confusion_matrix(Y_train, oof_labels_mlp))

# Submission from the fold-averaged predictions.
df_test[target] = y_hat_mlp
mlp_folds_csv = PATH + 'sub.mlp.' + 'folds' + str(EPOCHS) + '.csv'
df_test[['ID', target]].to_csv(mlp_folds_csv, index=False, float_format='%.4f')

print('  ', np.mean(y_hat_mlp))

# Submission from a single model trained on every training row.
model_mlp_full = MLPClassifier(**mlp_lbfgs_settings).fit(X_train3, Y_train)
y_hat_mlp_full = model_mlp_full.predict_proba(X_test3)[:, 1]

df_test[target] = y_hat_mlp_full
mlp_full_csv = PATH + 'sub.mlp.' + 'full' + '.csv'
df_test[['ID', target]].to_csv(mlp_full_csv, index=False, float_format='%.4f')

print('  ', np.mean(y_hat_mlp_full))

In [0]:
###############################################################################
## MLP ADAM
###############################################################################
# Same network shape as the L-BFGS MLP, trained with the Adam solver instead.
# Produces K-fold OOF predictions, fold-averaged test predictions, and a
# full-data refit, each written out as a submission file.

params_mlp = {
    'random_state':       2019,
    'max_iter':           200,
    'hidden_layer_sizes': (5, 2),
    'alpha':              0.00025,
    'solver':             'adam',
    'epsilon':            2e-4,
    'activation':         'relu',
    'beta_1':             .91,
    'beta_2':             .99,
}


print('\n  MLP ADAM...')
y_hat_mlpa = np.zeros(X_test3.shape[0])
y_oof_mlpa = np.zeros(X_train3.shape[0])
kf   = KFold(n_splits=EPOCHS, shuffle=True, random_state=2019)
fold = 1
for fit_rows, held_rows in kf.split(X_train3, Y_train):
    fit_X, fit_y = X_train3[fit_rows], Y_train[fit_rows]
    held_X, held_y = X_train3[held_rows, :], Y_train[held_rows]

    model_mlpa = MLPClassifier(**params_mlp).fit(fit_X, fit_y)
    held_proba = model_mlpa.predict_proba(held_X)[:, 1]
    y_oof_mlpa[held_rows] = held_proba

    # LIFT = accuracy gain (in points) over always predicting the majority class.
    pos_rate  = np.mean(held_y)
    base_acc  = max(pos_rate, 1 - pos_rate)
    fold_acc  = accuracy_score(held_y, (held_proba > 0.5).astype(int))
    fold_auc  = roc_auc_score(held_y, held_proba)
    fold_lift = (fold_acc - base_acc) * 100
    print('  MLP ADAM: ', 'AUC:', fold_auc, 'ACC:', fold_acc, 'LIFT:', fold_lift)

    # Each fold model contributes 1/EPOCHS of the averaged test prediction.
    y_hat_mlpa += model_mlpa.predict_proba(X_test3)[:, 1] / EPOCHS
    fold += 1

# Metrics over the complete out-of-fold vector.
oof_labels_mlpa = (y_oof_mlpa > 0.5).astype(int)
print('  MLP AVG ADAM: ', 'AUC:', roc_auc_score(Y_train, y_oof_mlpa), 'ACC:', accuracy_score(Y_train, oof_labels_mlpa))
print(confusion_matrix(Y_train, oof_labels_mlpa))

# Submission from the fold-averaged predictions.
df_test[target] = y_hat_mlpa
mlpa_folds_csv = PATH + 'sub.mlpa.' + 'folds' + str(EPOCHS) + '.csv'
df_test[['ID', target]].to_csv(mlpa_folds_csv, index=False, float_format='%.4f')

print('  ', np.mean(y_hat_mlpa))

# Submission from a single model trained on every training row.
model_mlpa_full = MLPClassifier(**params_mlp).fit(X_train3, Y_train)
y_hat_mlpa_full = model_mlpa_full.predict_proba(X_test3)[:, 1]

df_test[target] = y_hat_mlpa_full
mlpa_full_csv = PATH + 'sub.mlpa.' + 'full' + '.csv'
df_test[['ID', target]].to_csv(mlpa_full_csv, index=False, float_format='%.4f')

print('  ', np.mean(y_hat_mlpa_full))

In [0]:
###############################################################################
## LGB
###############################################################################
# LightGBM with K-fold CV. Each fold model is cached as a pickle keyed by the
# fold number and the hyper-parameters, so re-running the cell with unchanged
# settings reloads the models instead of retraining them.
# `params_lgb` must already exist; only these two entries are overridden here.
params_lgb['lambda']          = 0.10
params_lgb['max_depth']       = 3

print('\n lgbboost...')
y_hat_lgb = np.zeros(X_test.shape[0])
y_oof_lgb = np.zeros(X_train.shape[0])
fold      = 1
kf      = KFold(n_splits = EPOCHS, shuffle = True, random_state=2019)
# "key=value" pairs joined by '.', with ':' stripped from the values; used as
# part of the cache and submission file names.
kvs = '.'.join(str(k) + '=' + str(v).replace(':', '') for k, v in params_lgb.items())
nbr_lgb = 300

for tr_idx, val_idx in kf.split(X_train, Y_train):
    postfix = 'epochs=' + str(EPOCHS) + 'fold=' + str(fold) + kvs
    filename = PATH + '/pickles/model_lgb.' + postfix + '.pkl'
    X_tr, X_vl = X_train[tr_idx], X_train[val_idx, :]
    y_tr, y_vl = Y_train[tr_idx], Y_train[val_idx]
    if os.path.isfile(filename):
        # Cache hit. These pickles are written by this notebook itself; do not
        # point this at files from an untrusted source.
        with open(filename, 'rb') as cache_file:
            model_lgb = pickle.load(cache_file)
    else:
        model_lgb = lgb.train(params_lgb, lgb.Dataset(X_tr, label = y_tr), num_boost_round=nbr_lgb)
        # Context manager guarantees the cache file is flushed and closed.
        with open(filename, 'wb') as cache_file:
            pickle.dump(model_lgb, cache_file)
    # Column 1 of the prediction matrix is taken as the positive-class score.
    y_pred_train = model_lgb.predict(X_vl)[:,1]
    y_oof_lgb[val_idx] = y_pred_train

    # LIFT = accuracy gain (in points) over always predicting the majority class.
    y_zero = max(np.mean(y_vl), 1-np.mean(y_vl))
    ACC    = accuracy_score(y_vl, (y_pred_train > 0.5  ).astype(int) )
    AUC    = roc_auc_score( y_vl, y_pred_train)
    LIFT   = ( ACC - y_zero )*100
    print('  lgb: ', 'AUC:', AUC, 'ACC:', ACC, 'LIFT:', LIFT)

    # Each fold model contributes 1/EPOCHS of the averaged test prediction.
    y_hat_lgb+= model_lgb.predict(X_test)[:,1] / EPOCHS
    fold+=1

print('  lgb AVG: ', 'AUC:', roc_auc_score( Y_train, y_oof_lgb), 'ACC:', accuracy_score(Y_train, (y_oof_lgb > 0.5  ).astype(int) ))
print( confusion_matrix(Y_train, (  y_oof_lgb > 0.5  ).astype(int)) )

# Submission from the fold-averaged predictions. `postfix` still holds the
# value from the last loop iteration, so the file name carries that fold number.
df_test[target]= y_hat_lgb
df_test[['ID',target]].to_csv(PATH + 'sub.lgb.' + postfix + '.csv', index = False, float_format = '%.4f')

print('  ', np.mean(y_hat_lgb))

# Submission from a single model trained on every training row.
model_lgb_full = lgb.train(params_lgb, lgb.Dataset(X_train, label = Y_train), num_boost_round=nbr_lgb)
y_hat_lgb_full = model_lgb_full.predict(X_test)[:,1]


df_test[target]= y_hat_lgb_full
df_test[['ID',target]].to_csv(PATH + 'sub.lgb.' + 'full' + '.csv', index = False, float_format = '%.4f')


print('  ', np.mean(y_hat_lgb_full))

# `model_lgb` is the model from the final CV fold, so the chart shows that one
# model's importances rather than an average; the title says so.
feature_imp = pd.DataFrame(sorted(zip(model_lgb.feature_importance(),features_lr)), columns=['Value','Feature'])
plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (last fold model)')
plt.tight_layout()
plt.show()

In [0]:
###############################################################################
## CatBoost
###############################################################################
# CatBoost with K-fold CV. Each fold model is cached as a pickle keyed by the
# fold number and the hyper-parameters, so re-running the cell with unchanged
# settings reloads the models instead of retraining them.
print(' Catboost...')
params_cat = {}
params_cat['loss_function'] = 'MultiClass'
params_cat['random_seed'] =   2019
params_cat['classes_count'] = 2
params_cat['l2_leaf_reg']   = 3
params_cat['depth']         = 6
params_cat['learning_rate'] = 0.043
params_cat['iterations']    = 250
params_cat['verbose'] = False

y_hat_cat = np.zeros(X_test.shape[0])
y_oof_cat = np.zeros(X_train.shape[0])
fold      = 1
kf      = KFold(n_splits = EPOCHS, shuffle = True, random_state=2019)

# "key=value" pairs joined by '.', with ':' stripped from the values; used as
# part of the cache and submission file names.
kvs = '.'.join(str(k) + '=' + str(v).replace(':', '') for k, v in params_cat.items())

for tr_idx, val_idx in kf.split(X_train, Y_train):
    postfix = 'epochs=' + str(EPOCHS) + 'fold=' + str(fold) + kvs
    filename = PATH + '/pickles/model_cat.' + postfix + '.pkl'
    X_tr, X_vl = X_train[tr_idx], X_train[val_idx, :]
    y_tr, y_vl = Y_train[tr_idx], Y_train[val_idx]
    if os.path.isfile(filename):
        # Cache hit. These pickles are written by this notebook itself; do not
        # point this at files from an untrusted source.
        with open(filename, 'rb') as cache_file:
            model_cat = pickle.load(cache_file)
    else:
        model_cat = CatBoostClassifier(**params_cat).fit(X_tr, y_tr)
        # Context manager guarantees the cache file is flushed and closed.
        with open(filename, 'wb') as cache_file:
            pickle.dump(model_cat, cache_file)
    y_pred_train = model_cat.predict_proba(X_vl)[:,1]
    y_oof_cat[val_idx] = y_pred_train

    # LIFT = accuracy gain (in points) over always predicting the majority class.
    y_zero = max(np.mean(y_vl), 1-np.mean(y_vl))
    ACC    = accuracy_score(y_vl, (y_pred_train > 0.5  ).astype(int) )
    AUC    = roc_auc_score( y_vl, y_pred_train)
    LIFT   = ( ACC - y_zero )*100
    print('  CAT: ', 'AUC:', AUC, 'ACC:', ACC, 'LIFT:', LIFT)

    # Each fold model contributes 1/EPOCHS of the averaged test prediction.
    y_hat_cat+= model_cat.predict_proba(X_test)[:,1] / EPOCHS
    fold+=1

print('  CAT AVG: ', 'AUC:', roc_auc_score( Y_train, y_oof_cat), 'ACC:', accuracy_score(Y_train, (y_oof_cat > 0.5  ).astype(int) ))
print( confusion_matrix(Y_train, (  y_oof_cat > 0.5  ).astype(int)) )

# Submission from the fold-averaged predictions. `postfix` still holds the
# value from the last loop iteration, so the file name carries that fold number.
df_test[target]= y_hat_cat
df_test[['ID',target]].to_csv(PATH + 'sub.cat.' + postfix + '.csv', index = False, float_format = '%.4f')

print('  ', np.mean(y_hat_cat))

# Submission from a single model trained on every training row.
model_cat_full = CatBoostClassifier(**params_cat).fit(X_train, Y_train)
y_hat_cat_full = model_cat_full.predict_proba(X_test)[:,1]

df_test[target]= y_hat_cat_full
df_test[['ID',target]].to_csv(PATH + 'sub.cat.' + 'full' + '.csv', index = False, float_format = '%.4f')


print('  ', np.mean(y_hat_cat_full))

# `model_cat` is the model from the final CV fold. The title previously read
# "LightGBM ... (avg over folds)", a copy-paste slip from the LightGBM cell.
feature_imp = pd.DataFrame(sorted(zip(model_cat.get_feature_importance(),features_lr)), columns=['Value','Feature'])
plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('CatBoost Features (last fold model)')
plt.tight_layout()
plt.show()


In [0]:
# Power-mean ensemble: CatBoost enters squared, the other three models are
# raised to the 4th power, and the four terms are averaged.
# The OOF version is scored against Y_train; the test version is the blend
# used for the ensemble submissions.
oof_power_sum = y_oof_cat**2 + y_oof_xgb**4 + y_oof_mlp**4 + y_oof_mlpa**4
y_hat_ens_oof = oof_power_sum / 4

test_power_sum = y_hat_cat_full**2 + y_hat_xgb**4 + y_hat_mlp_full**4 + y_hat_mlpa_full**4
y_hat_ens = test_power_sum / 4

ens_auc = roc_auc_score(Y_train, y_hat_ens_oof)
ens_acc = accuracy_score(Y_train, (y_hat_ens_oof > 0.5).astype(int))
print('  ENS AVG: ', 'AUC:', ens_auc, 'ACC:', ens_acc)
# The confusion matrix uses a 0.2 cut-off while the accuracy above uses 0.5.
print(confusion_matrix(Y_train, (y_hat_ens_oof > 0.2).astype(int)))
print('ENS: ', np.mean(y_hat_ens))

# Candidate-frequency prior ("narf").
# v1 / v2 = mean target among training rows whose cand_id occurs exactly once /
# more than once in the TRAINING data.
narf_train = df_train.cand_id.value_counts().reset_index()
narf_train.columns = ['cand_id', 'events']
once_ids_train   = narf_train[narf_train.events == 1].cand_id
repeat_ids_train = narf_train[narf_train.events > 1].cand_id
v1 = np.mean(df_train.loc[df_train.cand_id.isin(once_ids_train), target])
v2 = np.mean(df_train.loc[df_train.cand_id.isin(repeat_ids_train), target])

# Occurrence counts are then recomputed over train and test rows together.
all_ids = pd.concat([df_train[['ID', 'cand_id']], df_test[['ID', 'cand_id']]], axis=0)
narf = all_ids.cand_id.value_counts().reset_index()
narf.columns = ['cand_id', 'events']

# Test rows get v2 when their cand_id occurs more than once in the combined
# data, and v1 otherwise.
is_repeat_candidate = df_test.cand_id.isin(narf[narf.events > 1].cand_id)
df_test['narf'] = np.where(is_repeat_candidate, v2, v1)

###############################################################################
## Stacking
###############################################################################
# Level-1 features for the stacker. Training rows use out-of-fold predictions;
# test rows use the corresponding test predictions. Every model's output is
# squared except CatBoost's, which is passed through unchanged.
level1_train = [
    ('y_hat_lr',   y_oof_lr**2),
    ('y_hat_nb',   y_oof_nb**2),
    ('y_hat_mlp',  y_oof_mlp**2),
    ('y_hat_mlpa', y_oof_mlpa**2),
    ('y_hat_lgb',  y_oof_lgb**2),
    ('y_hat_cat',  y_oof_cat),
    ('y_hat_xgb',  y_oof_xgb**2),
]
for column, values in level1_train:
    df_train[column] = values

level1_test = [
    ('y_hat_lr',   y_hat_lr_full**2),
    ('y_hat_nb',   y_hat_nb_full**2),
    ('y_hat_mlp',  y_hat_mlp_full**2),
    ('y_hat_mlpa', y_hat_mlpa_full**2),
    ('y_hat_lgb',  y_hat_lgb_full**2),
    ('y_hat_cat',  y_hat_cat),
    ('y_hat_xgb',  y_hat_xgb**2),
]
for column, values in level1_test:
    df_test[column] = values

# Only this subset of the level-1 columns is fed to the meta-model
# (the NB and LGB columns are created above but not used).
cols = ['y_hat_lr', 'y_hat_mlp', 'y_hat_mlpa', 'y_hat_cat', 'y_hat_xgb']
x_train = df_train[cols]
x_test  = df_test[cols]

# Shallow random forest as the meta-model.
model_rf = RandomForestClassifier(n_estimators=300, max_depth=4, random_state=2019)
model_rf.fit(x_train, Y_train)
y_hat_rf = model_rf.predict_proba(x_test)[:, 1]

###############################################################################
## Write Outputs
###############################################################################
# Three timestamped submission files, each holding ID + target to 4 decimals:
#   stack - random-forest stacker
#   ens   - power-mean ensemble
#   narf  - 90% ensemble blended with 10% candidate-frequency prior
stamp_pattern = '%Y%m%d_%H%M%S'

df_test[target] = y_hat_rf
stack_csv = PATH + 'submission.stack.{}.csv'.format(datetime.now().strftime(stamp_pattern))
df_test[['ID', target]].to_csv(stack_csv, index=False, float_format='%.4f')


df_test[target] = y_hat_ens
ens_csv = PATH + 'submission.ens.{}.csv'.format(datetime.now().strftime(stamp_pattern))
df_test[['ID', target]].to_csv(ens_csv, index=False, float_format='%.4f')

df_test[target] = df_test['narf'] * 0.1 + y_hat_ens * 0.9
print(np.mean(df_test[target]))
narf_csv = PATH + 'submission.narf.{}.csv'.format(datetime.now().strftime(stamp_pattern))
df_test[['ID', target]].to_csv(narf_csv, index=False, float_format='%.4f')

## Save Python To File
# Keep a timestamped copy of the script next to the original so each run's
# code can be traced back from its outputs. Uses shutil instead of shelling
# out to the Windows-only `copy` command, so it also works where the rest of
# this notebook uses POSIX paths, and avoids building a shell string.
import shutil

script_src = PATH + 'wfs_fin.py'
script_dst = script_src + '.{}'.format(datetime.now().strftime('%Y%m%d_%H%M%S'))
if os.path.isfile(script_src):
    shutil.copyfile(script_src, script_dst)
else:
    # Best-effort, as before: a missing script must not abort the run.
    print('  script snapshot skipped, not found:', script_src)


In [0]:
# ROC curves of the XGB + MLP + MLP-Adam OOF ensemble for a range of exponents.
# Each model's OOF prediction is raised to the power i before averaging.
for i in [1,2,4,8,16,32,64]:
    # Average of the three powered predictions. The divisor used to be 34,
    # which does not match the three terms; ROC/AUC are unaffected by a
    # constant scale, so the plotted curves and scores are unchanged, but the
    # values are now a true mean.
    y_hat_ens_pow = (y_oof_xgb**i + y_oof_mlp**i + y_oof_mlpa**i)/3
    fpr, tpr, _ = roc_curve(Y_train,  y_hat_ens_pow)
    auc = roc_auc_score(Y_train, y_hat_ens_pow)
    plt.plot(fpr,tpr,label='Power ' + str(i) + ', auc='+str(round( auc, 4)))
    plt.legend(loc=4)