In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

from multiprocessing import *
import warnings
warnings.filterwarnings("ignore")

import time
import gc


from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score

# Regularized Greedy Forest
from rgf.sklearn import RGFClassifier     # https://github.com/fukatani/rgf_python

In [46]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))


def target_encode(trn_series=None,    # Revised to encode validation series
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_val_series = pd.merge(
        val_series.to_frame(val_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=val_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_val_series.index = val_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_val_series, noise_level), add_noise(ft_tst_series, noise_level)


In [47]:
#### Load Data
train = pd.read_csv('/Users/siero5335/Desktop/Safe Driver Prediction/train.csv')
test = pd.read_csv('/Users/siero5335/Desktop/Safe Driver Prediction/test.csv')


id_test = test['id'].values
target_train = train['target'].values


train.drop(['id','target'],axis=1,inplace=True)
test.drop(['id'],axis=1,inplace=True)

train['ps_ind_0609_bin'] = train.apply(lambda x: 1 if x['ps_ind_06_bin'] == 1 else (2 if x['ps_ind_07_bin'] == 1 else 
(
3 if x['ps_ind_08_bin'] == 1 else (4 if x['ps_ind_09_bin'] == 1 else 5)

)), axis = 1)

test['ps_ind_0609_bin'] = test.apply(lambda x: 1 if x['ps_ind_06_bin'] == 1 else (2 if x['ps_ind_07_bin'] == 1 else 
(
3 if x['ps_ind_08_bin'] == 1 else (4 if x['ps_ind_09_bin'] == 1 else 5)

)), axis = 1)

train.drop(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin'], axis = 1, inplace = True)

test.drop(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin'], axis = 1, inplace = True)

train['ps_car_13'] = (train['ps_car_13']*train['ps_car_13']* 48400).round(0)

test['ps_car_13'] = (test['ps_car_13']*test['ps_car_13']* 48400).round(0)

train['ps_car_12'] = (train['ps_car_12']*train['ps_car_12']).round(4) * 10000

test['ps_car_12'] = (test['ps_car_12']*test['ps_car_12']).round(4) * 10000

for c in train[[c for c in train.columns if 'bin' in c]].columns:
    for cc in train[[c for c in train.columns if 'bin' in c]].columns:
            if train[train[cc] * train[c] == 0].shape[0] == train.shape[0]:
                print(c, cc)

train['ps_ind_161718_bin'] = train.apply(lambda x: 1 if x['ps_ind_16_bin'] == 1 else
                                        (2 if x['ps_ind_17_bin'] == 1 else 3), axis = 1
                                        )

test['ps_ind_161718_bin'] = test.apply(lambda x: 1 if x['ps_ind_16_bin'] == 1 else
                                        (2 if x['ps_ind_17_bin'] == 1 else 3), axis = 1
                                        )

train.drop(['ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin'], axis = 1, inplace = True)

test.drop(['ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin'], axis = 1, inplace = True)

ps_ind_16_bin ps_ind_17_bin
ps_ind_16_bin ps_ind_18_bin
ps_ind_17_bin ps_ind_16_bin
ps_ind_17_bin ps_ind_18_bin
ps_ind_18_bin ps_ind_16_bin
ps_ind_18_bin ps_ind_17_bin


In [48]:
### Great Recovery from Pascal's materpiece

def recon(reg):
    integer = int(np.round((40*reg)**2)) 
    for a in range(32):
        if (integer - a) % 31 == 0:
            A = a
    M = (integer - A)//31
    return A, M
train['ps_reg_A'] = train['ps_reg_03'].apply(lambda x: recon(x)[0])
train['ps_reg_M'] = train['ps_reg_03'].apply(lambda x: recon(x)[1])
train['ps_reg_A'].replace(19,-1, inplace=True)
train['ps_reg_M'].replace(51,-1, inplace=True)
test['ps_reg_A'] = test['ps_reg_03'].apply(lambda x: recon(x)[0])
test['ps_reg_M'] = test['ps_reg_03'].apply(lambda x: recon(x)[1])
test['ps_reg_A'].replace(19,-1, inplace=True)
test['ps_reg_M'].replace(51,-1, inplace=True)

train['ps_car_13_x_ps_reg_03'] = train['ps_car_13'] * train['ps_reg_03']
test['ps_car_13_x_ps_reg_03'] = test['ps_car_13'] * test['ps_reg_03']

# Columns -> binary decoded.

tmp  = train['ps_calc_15_bin'] * 32 + train['ps_calc_16_bin'] * 16 + train['ps_calc_17_bin'] * 8
tmp += train['ps_calc_18_bin'] * 4 + train['ps_calc_19_bin'] * 2 + train['ps_calc_20_bin'] * 1

tmp2 = [5, 22, 9, 32, 13, 38, 20, 47, 2, 19, 8, 30, 10, 35, 17, 45, 1,
        15, 4, 24, 7, 29, 14, 40, 0, 12, 3, 21, 6, 26, 11, 36, 27, 52,
        37, 57, 42, 60, 51, 63, 23, 49, 34, 56, 39, 59, 48, 62, 18, 46,
        28, 53, 33, 55, 44, 61, 16, 43, 25, 50, 31, 54, 41, 58]
tmp2 = pd.Series(tmp2)

train['ps_calc_15_16_17_18_19_20'] = tmp.map(tmp2)


tmp3  = test['ps_calc_15_bin'] * 32 + test['ps_calc_16_bin'] * 16 + test['ps_calc_17_bin'] * 8
tmp3 += test['ps_calc_18_bin'] * 4 + test['ps_calc_19_bin'] * 2 + test['ps_calc_20_bin'] * 1

tmp4 = [5, 22, 9, 32, 13, 38, 20, 47, 2, 19, 8, 30, 10, 35, 17, 45, 1,
        15, 4, 24, 7, 29, 14, 40, 0, 12, 3, 21, 6, 26, 11, 36, 27, 52,
        37, 57, 42, 60, 51, 63, 23, 49, 34, 56, 39, 59, 48, 62, 18, 46,
        28, 53, 33, 55, 44, 61, 16, 43, 25, 50, 31, 54, 41, 58]
tmp4 = pd.Series(tmp4)

test['ps_calc_15_16_17_18_19_20'] = tmp3.map(tmp4)

In [49]:
# from olivier
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
	"ps_reg_03",  #            : 1408.42 / shadow  511.15
	"ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
	"ps_ind_03",  #            : 1219.47 / shadow  230.55
	"ps_ind_15",  #            :  922.18 / shadow  242.00
	"ps_reg_02",  #            :  920.65 / shadow  267.50
	"ps_car_14",  #            :  798.48 / shadow  549.58
	"ps_car_12",  #            :  731.93 / shadow  293.62
	"ps_car_01_cat",  #        :  698.07 / shadow  178.72
	"ps_car_07_cat",  #        :  694.53 / shadow   36.35
	"ps_car_03_cat",  #        :  611.73 / shadow   50.67
	"ps_reg_01",  #            :  598.60 / shadow  178.57
	"ps_car_15",  #            :  593.35 / shadow  226.43
	"ps_ind_01",  #            :  547.32 / shadow  154.58
	"ps_ind_161718_bin",  #        :  475.37 / shadow   34.17
	"ps_car_06_cat",  #        :  398.02 / shadow  212.43
	"ps_car_04_cat",  #        :  376.87 / shadow   76.98
	"ps_ind_0609_bin",  #        :  370.97 / shadow   36.13
	"ps_car_09_cat",  #        :  214.12 / shadow   81.38
	"ps_car_02_cat",  #        :  203.03 / shadow   26.67
	"ps_ind_02_cat",  #        :  189.47 / shadow   65.68
	"ps_car_11",  #            :  173.28 / shadow   76.45
	"ps_car_05_cat",  #        :  172.75 / shadow   62.92
	"ps_calc_09",  #           :  169.13 / shadow  129.72
	"ps_calc_05",  #           :  148.83 / shadow  120.68
	"ps_car_08_cat",  #        :  120.87 / shadow   28.82
	"ps_ind_04_cat",  #        :  107.27 / shadow   37.43
	"ps_ind_18_bin",  #        :   77.42 / shadow   25.97
	"ps_ind_12_bin",  #        :   39.67 / shadow   15.52
	"ps_ind_14",  #            :   37.37 / shadow   16.65
	"ps_reg_M",  #
	"ps_car_13_x_ps_reg_03",  
    "ps_calc_15_16_17_18_19_20" #  
]
# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

In [50]:
# Process data
start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    train[name1] = train[f1].apply(lambda x: str(x)) + "_" + train[f2].apply(lambda x: str(x))
    test[name1] = test[f1].apply(lambda x: str(x)) + "_" + test[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(train[name1].values) + list(test[name1].values))
    train[name1] = lbl.transform(list(train[name1].values))
    test[name1] = lbl.transform(list(test[name1].values))

    train_features.append(name1)
    
train = train[train_features]
test = test[train_features]

cat_features = [a for a in train.columns if a.endswith('cat')]

for column in cat_features:
	temp = pd.get_dummies(pd.Series(train[column]))
	train = pd.concat([train,temp],axis=1)
	train = train.drop([column],axis=1)
    
for column in cat_features:
	temp = pd.get_dummies(pd.Series(test[column]))
	test = pd.concat([test,temp],axis=1)
	test = test.drop([column],axis=1)


print(train.values.shape, test.values.shape)

current feature                                 ps_reg_01_plus_ps_car_02_cat    1 in   0.0current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.0

KeyError: "['ps_ind_18_bin'] not in index"

In [51]:
class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=2017).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]

                print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X_train, y_train)
                cross_score = cross_val_score(clf, X_train, y_train, cv=self.n_splits, scoring='roc_auc')
                print("    cross_score: %.5f" % (cross_score.mean()))
                y_pred = clf.predict_proba(X_holdout)[:,1]                

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict_proba(T)[:,1]
            S_test[:, i] = S_test_i.mean(axis=1)

            print("     Model score: %.5f\n" % roc_auc_score(y, S_train[:,i]))

        self.base_preds = S_test
        
        # Log odds transformation
        almost_zero = 1e-12
        almost_one = 1 - almost_zero  # To avoid division by zero
        S_train[S_train>almost_one] = almost_one
        S_train[S_train<almost_zero] = almost_zero
        S_train = np.log(S_train/(1-S_train))
        S_test[S_test>almost_one] = almost_one
        S_test[S_test<almost_zero] = almost_zero
        S_test = np.log(S_test/(1-S_test))
        
        results = cross_val_score(self.stacker, S_train, y, cv=self.n_splits, scoring='roc_auc')
        print("Stacker score: %.5f" % (results.mean()))

        self.stacker.fit(S_train, y)
        print( 'Coefficients:', self.stacker.coef_ )

        res = self.stacker.predict_proba(S_test)[:,1]
        return res

In [52]:
# LightGBM params
lgb_params = {}
lgb_params['learning_rate'] = 0.02
lgb_params['n_estimators'] = 650
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8   
lgb_params['min_child_samples'] = 500
lgb_params['random_state'] = 99


lgb_params2 = {}
lgb_params2['n_estimators'] = 1090
lgb_params2['learning_rate'] = 0.02
lgb_params2['colsample_bytree'] = 0.3   
lgb_params2['subsample'] = 0.7
lgb_params2['subsample_freq'] = 2
lgb_params2['num_leaves'] = 16
lgb_params2['random_state'] = 114514


lgb_params3 = {}
lgb_params3['n_estimators'] = 1100
lgb_params3['max_depth'] = 4
lgb_params3['learning_rate'] = 0.02
lgb_params3['random_state'] = 1145141919


# XGBoost params
xgb_params = {}
xgb_params['objective'] = 'binary:logistic'
xgb_params['learning_rate'] = 0.04
xgb_params['n_estimators'] = 490
xgb_params['max_depth'] = 4
xgb_params['subsample'] = 0.8
xgb_params['colsample_bytree'] = 0.8 
xgb_params['min_child_weight'] = 0.77
xgb_params['reg_alpha'] = 8
xgb_params['seed'] = 71
xgb_params['gamma'] = 10
xgb_params['reg_lambda'] = 1.3

# Regularized Greedy Forest params
rgf_params = {}
rgf_params['max_leaf'] = 2000
rgf_params['learning_rate'] = 0.5
rgf_params['algorithm'] = "RGF_Sib"
rgf_params['test_interval'] = 100
rgf_params['min_samples_leaf'] = 3 
rgf_params['reg_depth'] = 1.0
rgf_params['l2'] = 0.5  
rgf_params['sl2'] = 0.005

In [53]:
lgb_model = LGBMClassifier(**lgb_params)

lgb_model2 = LGBMClassifier(**lgb_params2)

lgb_model3 = LGBMClassifier(**lgb_params3)

lgb_model4 = LGBMClassifier(**lgb_params4)

xgb_model = XGBClassifier(**xgb_params)

rgf_model = RGFClassifier(**rgf_params)

log_model = LogisticRegression(fit_intercept=False)

In [54]:
log_model = LogisticRegression(fit_intercept=False)

np.random.seed(42)

stack = Ensemble(n_splits=3,
        stacker = log_model,
        base_models = (lgb_model, lgb_model2, lgb_model3, lgb_model4, xgb_model, rgf_model))        
        
y_pred = stack.fit_predict(train, target_train, test)        


sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = y_pred
sub.to_csv('stacked_6.csv', index=False)

Fit LGBMClassifier fold 1
    cross_score: 0.63752
Fit LGBMClassifier fold 2
    cross_score: 0.63490
Fit LGBMClassifier fold 3
    cross_score: 0.63714
     Model score: 0.63968

Fit LGBMClassifier fold 1
    cross_score: 0.63717
Fit LGBMClassifier fold 2
    cross_score: 0.63712
Fit LGBMClassifier fold 3
    cross_score: 0.63681
     Model score: 0.63929

Fit LGBMClassifier fold 1
    cross_score: 0.63604
Fit LGBMClassifier fold 2
    cross_score: 0.63452
Fit LGBMClassifier fold 3
    cross_score: 0.63481
     Model score: 0.63893

Fit LGBMClassifier fold 1
    cross_score: 0.55933
Fit LGBMClassifier fold 2
    cross_score: 0.56602
Fit LGBMClassifier fold 3
    cross_score: 0.56145
     Model score: 0.57152

Fit XGBClassifier fold 1
    cross_score: 0.63720
Fit XGBClassifier fold 2
    cross_score: 0.63598
Fit XGBClassifier fold 3
    cross_score: 0.63588
     Model score: 0.63834

Fit RGFClassifier fold 1
    cross_score: 0.63224
Fit RGFClassifier fold 2
    cross_score: 0.63139
Fit