## Porto Seguro’s Safe Driver Prediction : LightGBM with stacking


From : https://www.kaggle.com/yekenot/simple-stacker-lb-0-284   ---  very helpful!



In [None]:
import time
import gc
import os
import numpy as np
import pandas as pd

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit

In [None]:
# Number of cross-validation folds; passed to model_builder(n_splits=...) when the stacker is built.
N_SPLITS = 4

## Preprocessing

Load Data

In [None]:
# Directory that holds the competition CSV files.
src_data_path = "../input/porto-seguro-safe-driver-prediction"

# Read train.csv first, then test.csv, from that directory.
df_train, df_test = (
    pd.read_csv(os.path.join(src_data_path, file_name))
    for file_name in ("train.csv", "test.csv")
)

In [None]:
# Keep the training labels and the test ids as arrays before the columns are removed.
target_train = df_train['target'].values
id_test = df_test['id'].values

# Remove 'id' from both frames, and 'target' from the training frame.
df_train = df_train.drop(columns=['target', 'id'])
df_test = df_test.drop(columns=['id'])

Check for missing data (-1 means the value is missing).

In [None]:
import missingno as msno

# replace() returns a new frame, so df_train itself is left untouched;
# the -1 placeholder becomes NaN in the inspection copy.
df_copy = df_train.replace(-1, np.nan)

# count() is the number of non-NaN entries per column, so an ascending sort
# lists the columns with the most NaN first; show the first five.
non_null_counts = df_copy.count().sort_values()
print(non_null_counts.head(5))

# Missing-value matrix for the columns at positions 2..38.
msno.matrix(df=df_copy.iloc[:, 2:39], figsize=(20, 14), color=(0.42, 0.1, 0.05))

Remove columns that have too many missing values.

In [None]:
# Columns removed from both the train and the test frame.
incomplete_columns = ['ps_car_03_cat', 'ps_car_05_cat']

df_train = df_train.drop(columns=incomplete_columns)
df_test = df_test.drop(columns=incomplete_columns)

In [None]:
# Turn the -1 placeholder into NaN in both frames.
df_train, df_test = (frame.replace(-1, np.nan) for frame in (df_train, df_test))

One-hot encode each categorical feature (features whose name ends with 'cat').

In [None]:
# Categorical columns are selected by name: every training column ending in 'cat'.
cat_features = [name for name in df_train.columns if name.endswith('cat')]

def preprocess_cat_feature(df, cat_feats):
    """Replace each listed column with its one-hot (dummy) columns.

    Args:
        df: DataFrame that contains the columns named in ``cat_feats``.
        cat_feats: Names of the columns to encode.

    Returns:
        A DataFrame in which the listed columns are removed and, after the
        remaining columns, the ``<column>_<value>`` indicator columns produced
        by ``pd.get_dummies`` are appended in the order the columns are
        listed. ``df`` itself is returned when ``cat_feats`` is empty.
    """
    columns_to_encode = list(cat_feats)
    if not columns_to_encode:
        return df

    # Build every indicator block first, then assemble the result once.
    indicator_blocks = [
        pd.get_dummies(df[name], prefix=name) for name in columns_to_encode
    ]
    remaining = df.drop(columns=columns_to_encode)
    return pd.concat([remaining, *indicator_blocks], axis=1)

# One-hot encode the categorical columns of both frames.
df_train = preprocess_cat_feature(df_train, cat_features)
df_test = preprocess_cat_feature(df_test, cat_features)

# The two frames are encoded independently, so a category value that occurs
# in only one of them would leave train and test with different columns.
# Put the test frame on exactly the training columns: indicators the test
# data never produced are filled with 0, and indicators unknown to the
# training data are discarded. When the columns already match this is a no-op.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

Calculate feature importance and drop less-important features.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Training inputs for the importance model (-1 is mapped to NaN).
X = df_train.replace(-1, np.nan)
y = target_train

# Fit a single LightGBM classifier on all training rows to read its feature importances.
model = lgb.LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.1,
    max_depth=-1,
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=1.0,
)
model.fit(X, y)

# (importance, column name) pairs in ascending order, as a two-column table.
importance_pairs = sorted(zip(model.feature_importances_, X.columns))
features_imp = pd.DataFrame(importance_pairs, columns=["Value", "Feature"])

# Bar plot of importance per feature, ordered from highest to lowest importance.
plt.figure(figsize=(16, 50))
ranked_features = features_imp.sort_values(by="Value", ascending=False)
sns.barplot(x="Value", y="Feature", data=ranked_features)

In [None]:
# (importance, column name) pairs of the fitted model, ascending by importance.
sorted_columns_by_fi = sorted(zip(model.feature_importances_, X.columns))

# Every column whose importance is below 100 is removed from both frames.
col_to_drop = [
    name for importance, name in sorted_columns_by_fi if importance < 100
]

df_train = df_train.drop(columns=col_to_drop)
df_test = df_test.drop(columns=col_to_drop)

Most other notebooks drop the columns whose names start with 'ps_calc_' and get good results.<br>
I don't know why dropping the 'ps_calc_XXX' columns gives a better result.<br>
Maybe there is a problem with how the feature importances are calculated.<br>

In [None]:
# col_to_drop = df_train.columns[df_train.columns.str.startswith('ps_calc_')]
# df_train = df_train.drop(col_to_drop, axis=1)
# df_test = df_test.drop(col_to_drop, axis=1)

## Train + Predict

Create models with different sets of hyperparameters and use K-Fold cross validation for each model. The predictions on each validation fold are used to train the ensemble model.<br>
For the submission data, the predictions of each model (averaged over its folds) are stacked in the same way and passed to the ensemble model.<br>
<br>
Overall process can be seen in the image below.


<img src='http://drive.google.com/uc?export=view&id=1EI7Nt-TjbeL0hYvD-wugl0AyJPRgUU1m' /><br>
<br>

shape of train data for ensemble model:<br>

    (len(train_data) , len(models))

shape of submit data for ensemble model:<br>
    
    (len(submit_data) , len(models))

In [None]:
import warnings
from sklearn.linear_model import LogisticRegression


class model_builder(object):
    """Two-level stacker: K-fold LightGBM base models combined by logistic regression."""

    def __init__(self, n_splits):
        """Store the fold count and create the (unfitted) second-level model.

        Args:
            n_splits: Number of K-fold splits used for every base model.
        """
        # Number of folds handed to KFold in fit_predict.
        self.n_splits = n_splits
        # Second-level model, trained on the base models' out-of-fold predictions.
        self.stacker = LogisticRegression()

    def fit_predict(self, X, y, T, params_list):
        """Train the base models and the stacker, then predict for ``T``.

        For every parameter set in ``params_list`` an ``lgb.LGBMClassifier`` is
        fitted once per fold. Its positive-class probabilities on the held-out
        fold fill one column of the stacker's training matrix; its
        probabilities on ``T`` are averaged over the folds to fill the matching
        column of the stacker's prediction matrix.

        Args:
            X: Training features (array-like, rows are samples).
            y: Training labels, one per row of ``X``.
            T: Features to predict for, with the same columns as ``X``.
            params_list: Sequence of keyword-argument dicts, one per base model.

        Returns:
            1-D array with the stacker's positive-class probability for each
            row of ``T``.

        Raises:
            ValueError: If ``params_list`` is empty.
        """
        n_models = len(params_list)
        if n_models == 0:
            raise ValueError("params_list must contain at least one parameter set")

        # Force a float matrix: a DataFrame that mixes bool indicator columns
        # with float columns would otherwise convert to an object-dtype array.
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        T = np.asarray(T, dtype=np.float64)

        # One column per base model.
        stack_train = np.zeros((X.shape[0], n_models))
        stack_test = np.zeros((T.shape[0], n_models))

        # The split is seeded, so it is the same for every model; build it once.
        kfold = KFold(n_splits=self.n_splits, random_state=1, shuffle=True)
        folds = list(kfold.split(X))

        for param_index, params in enumerate(params_list):
            model = lgb.LGBMClassifier(**params)

            # Predictions on T, one column per fold model.
            stack_test_i = np.zeros((T.shape[0], self.n_splits))

            for fold_index, (train_index, val_index) in enumerate(folds):
                print("model ", param_index, " fold ", fold_index)

                # No `verbose` argument: recent LightGBM releases reject it in
                # fit(), and without an eval_set it had nothing to report.
                model.fit(X[train_index], y[train_index])

                # Out-of-fold predictions become training data for the stacker.
                stack_train[val_index, param_index] = model.predict_proba(X[val_index])[:, 1]

                # Predictions on T from this fold's model.
                stack_test_i[:, fold_index] = model.predict_proba(T)[:, 1]

            # Store the mean over the fold models.
            stack_test[:, param_index] = stack_test_i.mean(axis=1)

        # Train the ensemble model on the stacked out-of-fold predictions.
        self.stacker.fit(stack_train, y)

        # Predict from the stacked (fold-averaged) predictions on T.
        return self.stacker.predict_proba(stack_test)[:, 1]


# Hyperparameter sets for the base models; model_builder.fit_predict unpacks
# each dict into lgb.LGBMClassifier(**params).
params_list = [
    # Base model 0
    dict(
        learning_rate=0.02,
        n_estimators=650,
        max_bin=10,
        subsample=0.8,
        subsample_freq=10,
        colsample_bytree=0.8,
        min_child_samples=500,
        seed=99,
    ),
    # Base model 1
    dict(
        learning_rate=0.02,
        n_estimators=1100,
        subsample=0.7,
        subsample_freq=2,
        colsample_bytree=0.3,
        num_leaves=16,
        seed=99,
    ),
    # Base model 2
    dict(
        n_estimators=1100,
        max_depth=4,
        learning_rate=0.02,
        seed=99,
    ),
]

# Train the base models and the stacker, then predict for the test frame.
builder = model_builder(n_splits=N_SPLITS)
y_test_pred = builder.fit_predict(
    X=df_train,
    y=target_train,
    T=df_test,
    params_list=params_list,
)

In [None]:
# Submission table: the test ids next to the predicted probabilities,
# written with six decimals and without the index column.
sub = pd.DataFrame({'id': id_test, 'target': y_test_pred})
sub.to_csv('lgb_submit.csv', float_format='%.6f', index=False)