# Import and load data

In [None]:
# Data processing
import math
import numpy as np
import pandas as pd
import dask.dataframe as dd
import gc

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.model_selection import train_test_split


# Modelling
from sklearn.ensemble import RandomForestRegressor as RFF
import xgboost as xgb
import lightgbm as lgb


# Statistics and metrics
from scipy.stats import kurtosis, skew
from sklearn.metrics import roc_auc_score,mean_squared_error



# Utilities
from pprint import pprint
from datetime import datetime
import time
import os
from os.path import join as pjoin


# Plot
import seaborn as sns
import matplotlib.pyplot as plt
# Default size, as (width, height), applied to every matplotlib figure.
plt.rcParams['figure.figsize'] = (18,12)


import warnings
# NOTE: this silences every warning for the whole process, including
# deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')



# Training model 

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    re-cast, in place, to the smallest integer (int8..int64) or float
    (float16, float32, else float64) type whose representable range contains
    the column's min and max. Other columns are left untouched.

    Range limits are inclusive, so a column holding exactly 127 or -128 still
    fits in int8.

    NOTE: float16/float32 have fewer significant digits than float64, so
    float values may be rounded even when they fit the range.

    Args:
        df: pandas DataFrame; modified in place.
        verbose: when true, print the final memory usage and the reduction.

    Returns:
        The same ``df`` object, for chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # First (narrowest) integer type whose range covers the column.
            # No match (e.g. an empty column whose min/max are NaN) leaves
            # the column as it is.
            target = next(
                (t for t in int_candidates
                 if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max),
                None,
            )
            if target is not None:
                df[col] = df[col].astype(target)
        else:
            # Fall back to float64 when neither narrower float type covers
            # the range (this includes infinite or all-NaN columns).
            target = next(
                (t for t in float_candidates
                 if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max),
                np.float64,
            )
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero if the frame reported no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
def run_lgb_cv(X,y,params=None,n_round=50,n_fold=5,categorical_feature=None):
    """Train one LightGBM booster per K-fold split and report out-of-fold AUC.

    Args:
        X: pandas DataFrame of features (indexed positionally with ``.iloc``).
        y: pandas Series of labels aligned with ``X``.
        params: LightGBM parameter dict passed to ``lgb.train``. When omitted,
            a minimal binary-classification setup evaluated with AUC is used.
        n_round: maximum number of boosting rounds per fold.
        n_fold: number of consecutive (unshuffled) folds.
        categorical_feature: categorical columns forwarded to ``lgb.Dataset``;
            ``None`` is treated as an empty list.

    Returns:
        List with the trained booster of each fold, in fold order.
    """
    # credits : https://www.kaggle.com/delayedkarma/let-s-add-some-new-features-lb-0-674
    if params is None:
        # lgb.train needs a dict; the AUC metric is what early stopping monitors.
        params = {"objective": "binary", "metric": "auc"}
    if categorical_feature is None:
        categorical_feature = []

    # No random_state here: it has no effect without shuffling, and current
    # scikit-learn raises ValueError when it is set together with shuffle=False.
    folds = KFold(n_splits=n_fold, shuffle=False)
    oof = np.zeros(len(X))
    clfs = []

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print("Fold no.{}".format(fold_+1))
        trn_data = lgb.Dataset(X.iloc[trn_idx],
                               label=y.iloc[trn_idx],
                               categorical_feature=categorical_feature)
        val_data = lgb.Dataset(X.iloc[val_idx],
                               label=y.iloc[val_idx],
                               categorical_feature=categorical_feature)

        # Early stopping and periodic logging go through callbacks; the
        # equivalent keyword arguments are gone from lgb.train in LightGBM 4.
        clf = lgb.train(params,
                        trn_data,
                        n_round,
                        valid_sets=[trn_data, val_data],
                        callbacks=[lgb.early_stopping(stopping_rounds=50),
                                   lgb.log_evaluation(period=50)])
        clfs.append(clf)

        oof[val_idx] = clf.predict(X.iloc[val_idx], num_iteration=clf.best_iteration)

        fold_auc = roc_auc_score(y.iloc[val_idx], oof[val_idx])
        print("Fold {} AUC: {:<8.5f}".format(fold_+1, fold_auc*100))

    print("CV score: {:<8.5f}".format(roc_auc_score(y, oof)*100))

    return clfs
    
    

In [None]:
def run_lgb(X,y,rate=.8,n_round=50,params=None):
    """Fit an ``LGBMClassifier`` on a train/validation split and print its AUC.

    Args:
        X: feature matrix.
        y: binary labels aligned with ``X``.
        rate: fraction of the rows used for training; the remaining
            ``1 - rate`` is held out for validation (fixed random_state=42).
        n_round: used as ``n_estimators`` in the default parameters and as the
            early-stopping patience.
        params: keyword arguments for ``lgb.LGBMClassifier``. When omitted, a
            default binary-classification configuration is used.

    Returns:
        The fitted ``LGBMClassifier``.
    """
    if params is None:
        params = {
              "objective": "binary",
              "learning_rate": 0.02,
              "max_depth": 8,
              "num_leaves": 67,
              "n_estimators": n_round,
              "bagging_fraction": 0.4,
              "feature_fraction": 0.5,
              "bagging_freq": 5,
              "bagging_seed": 2018,
              "min_child_samples": 80,
              "min_child_weight": 100,
              "min_split_gain": 0.1,
              "reg_alpha": 0.005,
              "reg_lambda": 0.1,
              "subsample_for_bin": 25000,
              "min_data_per_group": 100,
              "max_cat_to_onehot": 4,
              "cat_l2": 25,
              "cat_smooth": 2,
              "max_cat_threshold": 32,
              "random_state": 1,
              # Replaces the removed `silent=True` constructor argument.
              "verbose": -1,
              "metric": "auc"
        }
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=1-rate, random_state=42)

    # LightGBM classifier estimator
    model = lgb.LGBMClassifier(**params)
    # Early stopping and periodic logging go through callbacks; the
    # equivalent `fit` keyword arguments are gone in LightGBM 4.
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=n_round),
                   lgb.log_evaluation(period=20)]
    )

    # AUC ranks by score, so use the positive-class probability rather than
    # the hard 0/1 labels that `predict` returns.
    y_pred = model.predict_proba(X_valid, num_iteration=model.best_iteration_)[:, 1]

    score = roc_auc_score(y_valid, y_pred)
    print(f'Final score {score*100:.2f}')
    return model

In [None]:
def predict_lgb_cv(clfs,X,categorical_feature=[]):
    """Average the predictions of several fitted models over the rows of ``X``.

    Each object in ``clfs`` is asked for ``predict(X)``; its output fills one
    column of a ``(len(X), len(clfs))`` matrix, and the time taken by each
    model is printed. ``categorical_feature`` is accepted but not used.

    Returns:
        1-D array with the per-row mean across all models.
    """
    fold_preds = np.zeros((len(X), len(clfs)))
    tick = time.time()
    for col, booster in enumerate(clfs):
        fold_preds[:, col] = booster.predict(X)
        print(f"clf {col} predicted in {time.time()-tick}")
        # Restart the timer so the next message covers only the next model.
        tick = time.time()
    return fold_preds.mean(axis=1)

In [None]:
def run_xgb_cv(X,y,rate=.8,params=None,n_round=50,nfold=5):
    """Pick the boosting-round count by cross-validation, then fit and score a model.

    The data is split into a training part and a hold-out part. ``xgb.cv``
    runs on the training part with early stopping; a booster is then trained
    on the whole training part for the number of rounds the CV history kept,
    and its AUC on the hold-out part is printed.

    Args:
        X: feature matrix.
        y: labels aligned with ``X``.
        rate: fraction of the rows used for training; the remaining
            ``1 - rate`` is held out (fixed random_state=42).
        params: XGBoost parameter dict; a default is used when omitted.
        n_round: maximum number of boosting rounds tried by the CV.
        nfold: number of CV folds.

    Returns:
        The trained ``xgb.Booster``.
    """
    if params is None:
        params = {
            'eta': 0.05,
            'max_depth': 5,
            'subsample': 0.7,
            'colsample_bytree': 0.7,
            # Current name of the objective formerly called 'reg:linear'.
            'objective': 'reg:squarederror',
            'eval_metric': 'auc',
            # Replaces the removed 'silent' parameter.
            'verbosity': 0
        }
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=1-rate, random_state=42)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    # xgb.cv returns the evaluation history, not a model. With early stopping
    # the history is cut at the best iteration, so its length is the number
    # of rounds to train for.
    cv_history = xgb.cv(params, dtrain, num_boost_round=n_round, early_stopping_rounds=20,
                        verbose_eval=50, show_stdv=False, nfold=nfold, metrics='auc')
    best_rounds = len(cv_history)

    model = xgb.train(params, dtrain, num_boost_round=best_rounds)
    y_pred = model.predict(dvalid)

    score = roc_auc_score(y_valid, y_pred)
    print(f'Final score {score*100:.2f}')

    return model

    

In [None]:
def run_xgb(X,y,data=None,rate=.9,params=None,n_round=50):
    """Train an XGBoost booster with early stopping on a validation set.

    Args:
        X: feature matrix; only used when ``data`` is None.
        y: labels aligned with ``X``; only used when ``data`` is None.
        data: optional ``(dtrain, dval)`` pair of ready-made ``xgb.DMatrix``
            objects. When omitted, ``X``/``y`` are split with
            ``train_test_split`` (fixed random_state=42).
        rate: fraction of the rows used for training when splitting ``X``/``y``.
        params: XGBoost parameter dict; a default is used when omitted.
        n_round: maximum number of boosting rounds.

    Returns:
        The trained ``xgb.Booster``.
    """
    if params is None:
        params = {
            'eta': 0.05,
            'max_depth': 5,
            'subsample': 0.7,
            'colsample_bytree': 0.7,
            # Current name of the objective formerly called 'reg:linear'.
            'objective': 'reg:squarederror',
            'eval_metric': 'auc',
            # Replaces the removed 'silent' parameter.
            'verbosity': 0
        }

    if data is None:
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=1-rate, random_state=42)

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_valid, label=y_valid)
    else:
        dtrain, dval = data

    # The validation set is listed last in `evals`; XGBoost uses the last
    # entry as the one monitored for early stopping.
    model = xgb.train(params, dtrain, num_boost_round=n_round, early_stopping_rounds=50,
                      evals=[(dtrain, 'train'), (dval, 'valid')],
                      verbose_eval=20)

    return model

    