In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np 
import pandas as pd 
import optuna
import lightgbm as lgb
from path import Path
from sklearn.model_selection import StratifiedKFold

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/porto-seguro-safe-driver-prediction/sample_submission.csv
/kaggle/input/porto-seguro-safe-driver-prediction/train.csv
/kaggle/input/porto-seguro-safe-driver-prediction/test.csv


In [2]:
class Config:
    """Notebook-wide settings: data location, CV setup and default LightGBM parameters."""

    # Directory holding train.csv, test.csv and sample_submission.csv.
    input_path = Path('../input/porto-seguro-safe-driver-prediction')

    # When True the Optuna search cell runs; otherwise `params` below is used as-is.
    optuna_lgb = False

    # Maximum number of boosting rounds and early-stopping patience.
    n_estimators = 1500
    early_stopping_round = 150

    # Cross-validation layout and seed.
    cv_folds = 5
    random_state = 0

    # Default LightGBM parameters used when the Optuna search is skipped.
    params = dict(
        objective='binary',
        boosting_type='gbdt',
        learning_rate=0.01,
        max_bins=25,
        num_leaves=31,
        min_child_samples=1500,
        colsample_bytree=0.7,
        subsample_freq=1,
        subsample=0.7,
        reg_alpha=1.0,
        reg_lambda=1.0,
        verbosity=0,
        random_state=0,
    )


config = Config()

In [3]:
# Load the three competition files, each indexed by its `id` column.
train, test, submission = (
    pd.read_csv(config.input_path / csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [4]:
# Column groups are identified by a substring of the column name.
calc_features = list(filter(lambda name: '_calc' in name, train.columns))
cat_features = list(filter(lambda name: '_cat' in name, train.columns))

# Separate the label from the feature matrix.
target = train['target']
train = train.drop(columns=['target'])

In [5]:
### It was argued in the discussion that the calc features can be dropped. Since they are
### engineered features, they do not contain any new information with respect to their original
### features; they just add noise to any model trained on them

# Remove the `_calc` columns from both frames.
train = train.drop(columns=calc_features)
test = test.drop(columns=calc_features)

In [7]:
!pip install BorutaShap

Collecting BorutaShap
  Downloading BorutaShap-1.0.16-py3-none-any.whl (13 kB)
Installing collected packages: BorutaShap
Successfully installed BorutaShap-1.0.16
[0m

# **FEATURE SELECTION**
### Using Boruta-SHAP

In [10]:
## See the Boruta feature-elimination notebook ##

# Importing core libraries
import numpy as np
import pandas as pd
from time import time
import pprint
import joblib
from functools import partial

# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")

# Classifier/Regressor
from xgboost import XGBRegressor

# Feature selection
#from BorutaShap import BorutaShap

# Data processing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.mixture import GaussianMixture

# Validation
from sklearn.model_selection import StratifiedKFold

In [11]:
# One-hot encode the categorical columns of each frame.
train = pd.get_dummies(train, columns=cat_features)
test = pd.get_dummies(test, columns=cat_features)

# The two frames are encoded independently, so a category level that occurs in
# only one of them would produce mismatched dummy columns. Give `test` exactly
# the training columns, in the same order: levels missing from test become
# all-zero columns and test-only levels (unknown to the model) are dropped.
test = test.reindex(columns=train.columns, fill_value=0)

# Index.equals returns False on any mismatch (including differing lengths),
# whereas an element-wise `==` raises ValueError when the lengths differ.
assert train.columns.equals(test.columns), "train/test feature columns differ"

In [12]:
from numba import jit

@jit
def eval_gini(y_true,y_pred):
    """Return the normalized Gini coefficient of `y_pred` against `y_true`.

    The labels are reordered by ascending prediction and then walked from the
    highest-scored item down. For every label the running count of
    "1 - label" seen so far is accumulated, weighted by the label itself, and
    the total is rescaled by ntrue * (n - ntrue).

    NOTE(review): the arithmetic assumes 0/1 labels - confirm with callers.
    The final division is by zero when all labels are equal.
    """
    labels = np.asarray(y_true)
    labels = labels[np.argsort(y_pred)]
    n_obs = len(labels)

    positives = 0        # sum of labels visited so far
    zeros_above = 0      # sum of (1 - label) over items visited so far
    weighted = 0         # sum over items of label * zeros_above
    for pos in range(n_obs - 1, -1, -1):
        label = labels[pos]
        positives += label
        weighted += label * zeros_above
        zeros_above += 1 - label

    return 1 - 2 * weighted / (positives * (n_obs - positives))

def gini_lgb(y_true, y_pred):
    """Wrap `eval_gini` as an (eval_name, eval_result, is_higher_better) tuple.

    This is the form passed as `eval_metric` to the LightGBM models below;
    the metric is reported under the name 'normalized_gini_coef' and flagged
    as higher-is-better.
    """
    return 'normalized_gini_coef', eval_gini(y_true, y_pred), True

In [13]:
if config.optuna_lgb:
    # Fixed settings shared by every trial and by the final parameter set, so
    # the search is run under the same seed/verbosity as the final model.
    base_params = {'objective':'binary',
                   'boosting_type':'gbdt',
                   'verbosity':0,
                   'random_state':config.random_state}

    def objective(trial):
        """Return the mean validation normalized Gini over the CV folds for one trial.

        Each fold's score is the metric value at that fold's best iteration.
        """
        # The name given to each suggest_* call becomes the key in
        # study.best_params, so it must match the LightGBM parameter name
        # exactly for the tuned value to reach the final model.
        trial_params = {
            **base_params,
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
            'num_leaves': trial.suggest_int('num_leaves', 3, 255),
            'min_child_samples': trial.suggest_int('min_child_samples', 3, 3000),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
            'subsample_freq': trial.suggest_int('subsample_freq', 0, 10),
            'subsample': trial.suggest_float('subsample', 0.1, 1.0),
            # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
        }

        skf = StratifiedKFold(n_splits=config.cv_folds, shuffle=True,
                              random_state=config.random_state)
        scores = []
        for train_idx, valid_idx in skf.split(train, target):
            X_train, y_train = train.iloc[train_idx], target.iloc[train_idx]
            X_valid, y_valid = train.iloc[valid_idx], target.iloc[valid_idx]

            # Round budget and patience come from Config so the search and the
            # final training loop cannot drift apart.
            model = lgb.LGBMClassifier(**trial_params,
                                       n_estimators=config.n_estimators,
                                       early_stopping_round=config.early_stopping_round,
                                       force_row_wise=True)
            callbacks = [lgb.early_stopping(stopping_rounds=config.early_stopping_round,
                                            verbose=False)]
            model.fit(X_train, y_train,
                      eval_set=[(X_valid, y_valid)],
                      eval_metric=gini_lgb, callbacks=callbacks)
            scores.append(model.best_score_['valid_0']['normalized_gini_coef'])
        return np.mean(scores)

    # Seed the sampler so repeated runs explore the same sequence of trials.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=config.random_state))
    study.optimize(objective, n_trials=300)
    print("Best Gini Normalized Score", study.best_value)
    print("Best parameteres", study.best_params)

    # Final parameter set: fixed settings overlaid with the tuned values.
    params = {**base_params, **study.best_params}
else:
    params = config.params
            
                

In [20]:
%%time 
# Accumulators: fold-averaged test predictions, out-of-fold predictions for
# the training rows, and each fold's validation score at its best iteration.
preds = np.zeros(len(test))
oof = np.zeros(len(train))
metric_evaluations = list()

skf = StratifiedKFold(n_splits=config.cv_folds, shuffle=True, random_state=config.random_state)

for idx, (train_idx, valid_idx) in enumerate(skf.split(train, target)):
    print(f"CV fold {idx}")
    X_train, y_train = train.iloc[train_idx], target.iloc[train_idx]
    X_valid, y_valid = train.iloc[valid_idx], target.iloc[valid_idx]

    model = lgb.LGBMClassifier(**params,
                               n_estimators=config.n_estimators,
                               early_stopping_round=config.early_stopping_round,
                               force_row_wise=True)
    # Patience is read from Config rather than hard-coded, so changing
    # config.early_stopping_round actually takes effect here.
    callbacks = [lgb.early_stopping(stopping_rounds=config.early_stopping_round),
                 lgb.log_evaluation(period=100, show_stdv=False)]
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric=gini_lgb, callbacks=callbacks)
    metric_evaluations.append(model.best_score_['valid_0']['normalized_gini_coef'])

    # Each fold contributes an equal share of the final test prediction.
    preds += model.predict_proba(test, num_iteration=model.best_iteration_)[:, 1] / skf.n_splits

    # Rows of this fold's validation split get their out-of-fold prediction.
    oof[valid_idx] = model.predict_proba(X_valid, num_iteration=model.best_iteration_)[:, 1]
    
    

CV fold 0
Training until validation scores don't improve for 150 rounds
[100]	valid_0's binary_logloss: 0.153243	valid_0's normalized_gini_coef: 0.271457
[200]	valid_0's binary_logloss: 0.15228	valid_0's normalized_gini_coef: 0.280599
[300]	valid_0's binary_logloss: 0.15185	valid_0's normalized_gini_coef: 0.286829
[400]	valid_0's binary_logloss: 0.151651	valid_0's normalized_gini_coef: 0.289906
[500]	valid_0's binary_logloss: 0.151543	valid_0's normalized_gini_coef: 0.291906
[600]	valid_0's binary_logloss: 0.151473	valid_0's normalized_gini_coef: 0.293377
[700]	valid_0's binary_logloss: 0.151437	valid_0's normalized_gini_coef: 0.293827
[800]	valid_0's binary_logloss: 0.151417	valid_0's normalized_gini_coef: 0.294276
[900]	valid_0's binary_logloss: 0.15142	valid_0's normalized_gini_coef: 0.294119
Early stopping, best iteration is:
[806]	valid_0's binary_logloss: 0.151416	valid_0's normalized_gini_coef: 0.294311
CV fold 1
Training until validation scores don't improve for 150 rounds
[100

In [26]:
# Mean and (population) standard deviation of the per-fold validation scores.
gini_scores = np.asarray(metric_evaluations)
print(f"LightGBM CV normalized gin coefficient: {gini_scores.mean():0.3f}({gini_scores.std():0.3f})")

LightGBM CV normalized gin coefficient: 0.289(0.015)


In [27]:
# Store the fold-averaged test predictions in the submission frame and save it
# (the `id` index is written as the first column).
submission = submission.assign(target=preds)
submission.to_csv('lgb_submission.csv')

# Save the out-of-fold predictions alongside the training ids.
oofs = pd.DataFrame(dict(id=train.index, target=oof))
oofs.to_csv('lgb_oof.csv', index=False)