In [1]:
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import  make_scorer
from skopt import BayesSearchCV
from xgboost import XGBClassifier

In [3]:
def gini(actual, pred):
    actual = np.asarray(actual)
    length = len(actual)
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    gini_sum = a_c.sum() / a_s.sum() - (length + 1) / 2.0
    return gini_sum / length
 
def gini_normalized(actual, pred):
    if pred.ndim == 2:
        pred = pred[:, 1]
    return gini(actual, pred) / gini(actual, actual)

gini_scorer = make_scorer(gini_normalized)

In [4]:
data = pd.read_csv("../data/train.csv", na_values=[-1, -1.0], index_col="id")
X, y = data.loc[:, data.columns != "target"], data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=10)



#### Standard 

In [13]:
model = XGBClassifier()
model.fit(X_train.fillna(X_train.mean()), y_train)
y_pred = model.predict_proba(X_test.fillna(X_train.mean()))[:, 1]
std_gini = gini_normalized(y_test, y_pred)
print(std_gini)

0.273187883372


#### Randomized Search

In [5]:
estimator = XGBClassifier()
param_dist = {
    'max_depth': [3,5,8,10],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100,150],
    'gamma': [0, 0.1],
    'min_child_weight': [1,10,20,40],
    'reg_alpha': [0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1],
    'seed': [42]
}
model = RandomizedSearchCV(estimator, param_distributions=param_dist, n_iter=50, cv=5, n_jobs=2, verbose=True)
model.fit(X_train.fillna(X_train.mean()), y_train)
y_pred = model.predict_proba(X_test.fillna(X_train.mean()))[:, 1]
random_gini = gini_normalized(y_test, y_pred)
print(random_gini)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 92.3min
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed: 361.3min
[Parallel(n_jobs=2)]: Done 250 out of 250 | elapsed: 471.3min finished


0.242726114722


In [None]:
estimator = LGBMClassifier()
param_dist = {
    'max_depth': [3,5,8,10],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100,150],
    'min_child_weight': [1,10,20,40],
    'reg_alpha': [0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1],
    'random_state': [42]
}
model = RandomizedSearchCV(estimator, param_distributions=param_dist, n_iter=50, scoring=gini_scorer, cv=5, n_jobs=-1, verbose=True)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]
random_gini = gini_normalized(y_test, y_pred)
print(lgbm_random_gini)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 44.8min


#### Bayesian Optimization 

In [13]:
estimator = XGBClassifier(seed=42)
param_dist = {
    'max_depth': [3,5,8,10],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100,150],
    'gamma': [0, 0.1],
    'min_child_weight': [1,10,20,40],
    'reg_alpha': [0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1]
}
model = BayesSearchCV(estimator, search_spaces=param_dist, n_iter=50, scoring=gini_scorer, cv=5, n_jobs=-1, verbose=True)
model.fit(X_res, y_res)
y_pred = model.predict_proba(X_test)[:, 1]
bayes_gini = gini_normalized(y_test, y_pred)
print(bayes_gini)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.2min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.5min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.6min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   33.9s finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   38.8s finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   39.9s finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.4min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.2min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   54.1s finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   41.3s finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.2min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   48.9s finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   58.8s finished


0.267048095342


In [None]:
estimator = LGBMClassifier()
param_dist = {
    'max_depth': [3,5,8,10],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100,150],
    'gamma': [0, 0.1],
    'min_child_weight': [1,10,20,40],
    'reg_alpha': [0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1],
    'seed': [42]
}
model = BayesSearchCV(estimator, param_grid=param_dist, scoring=gini_scorer, cv=5, n_jobs=-1, verbose=True)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]
lgbm_bayes_gini = gini_normalized(y_test, y_pred)
print(lgbm_bayes_gini)