In [None]:
# Core numerical / tabular libraries and the two gradient-boosting frameworks
# used by the grid-search cells below.
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
#import catboost as ct

# SciPy statistics; the scoring cells call the two-sample Kolmogorov-Smirnov
# test through `stats.ks_2samp`.
from scipy import stats
from scipy.stats import ks_2samp

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression

# NOTE(review): `Imputer` was removed from sklearn.preprocessing in
# scikit-learn 0.22 (successor: sklearn.impute.SimpleImputer), so this import
# fails on newer versions — confirm the target environment.
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# Hold-out splitting and exhaustive hyper-parameter search.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Plotting stack; seaborn is configured with the "whitegrid" style and
# color codes enabled.
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)

# NOTE(review): matplotlib.pyplot is already imported a few lines above; this
# repeated import is redundant.
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the training data and discard the 'Unnamed: 0' column that comes with the CSV.
data = pd.read_csv("../input/cs-training.csv").drop(['Unnamed: 0'], axis=1)

# Only the last expression of a notebook cell is rendered, so a bare
# `data.columns` in the middle of the cell shows nothing; print it explicitly.
print(data.columns.tolist())

# Summary statistics of every numeric column (rendered as the cell output).
data.describe()

In [None]:
# Split the data set in two: the target column on one side, every other
# column (the features) on the other.

base_target  = data['SeriousDlqin2yrs']
base_feature = data.drop(labels=['SeriousDlqin2yrs'], axis='columns')

# Partition features and target into train and test sets. 40% of the rows go
# to the test set; the fixed random_state makes the split repeatable.

x_train, x_test, y_train, y_test = train_test_split(
    base_feature,
    base_target,
    test_size=0.4,
    random_state=32898,
)

# Show the training features.

x_train

In [None]:
print("# Grid Search de certos hiperparâmetros XGBoost...")
print()

# Class-imbalance handling.
# `is_unbalance` is a LightGBM option; XGBoost does not recognise it, so passing
# it to XGBClassifier does not rebalance anything. The XGBoost counterpart is
# `scale_pos_weight`, conventionally set to (#negative rows / #positive rows).
n_pos = int((y_train == 1).sum())
n_neg = int((y_train == 0).sum())
pos_weight = float(n_neg) / n_pos if n_pos else 1.0  # guard against a split with no positives

# Parameter tuning: base estimator plus the grid of candidate values.
# Alternative candidates left disabled by the author:
#   max_depth [10, 30, 50], min_child_weight [1, 3, 6], learning_rate [0.05, 0.1, 0.16]
xgb_model = xgb.XGBClassifier(scale_pos_weight=pos_weight, seed=32467, silent=False)
param_dist = {"max_depth": [5, 10],
              "reg_lambda": [2, 5],
              "n_estimators": [100, 200],
              "learning_rate": [0.1, 0.3]}

# 5-fold cross-validated exhaustive search, ranked by ROC AUC, using all cores.
grid_search = GridSearchCV(xgb_model, param_grid=param_dist, scoring='roc_auc', cv=5,
                           verbose=0, n_jobs=-1, return_train_score=True)
grid_search.fit(x_train, y_train)

# Best estimator found by the search (refit by GridSearchCV on the full
# training set); the scoring cell uses `modelo` to predict on the test set.
modelo = grid_search.best_estimator_

# `best_params_` and `cv_results_` are attributes of the GridSearchCV object,
# not of the underlying XGBClassifier, so they must be read from `grid_search`
# (reading them from `modelo` raises AttributeError).
print("Melhores parâmetros do grid:")
print()
print(grid_search.best_params_)
print()
print("AUROC na base de desenvolvimento (K-fold):")
print()
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    # Mean CV score with a +/- 2 standard deviation band for each candidate.
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

In [None]:
# Stand-alone XGBoost classifier with one fixed hyper-parameter combination.
# `is_unbalance` is a LightGBM option that XGBoost does not recognise; the
# XGBoost way to weight the positive class is `scale_pos_weight`
# (#negative rows / #positive rows), computed here from the training target.
# NOTE(review): this estimator is instantiated but never fitted or used in the
# visible cells — confirm whether it is still needed.
xgb_model = xgb.XGBClassifier(
    scale_pos_weight=float((y_train == 0).sum()) / max(int((y_train == 1).sum()), 1),
    seed=32467,
    silent=False,
    max_depth=5,
    reg_lambda=2,
    n_estimators=100,
    learning_rate=0.1,
)

In [None]:
# Test labels as a NumPy array. `.values` replaces `as_matrix()`, which was
# deprecated in pandas 0.23 and removed in pandas 1.0. Going through NumPy drops
# the original row index, so the frame built below gets a fresh 0..n-1 index
# that lines up with the prediction array.
y_test_np = y_test.values

# Model prediction on the test set (the model was fitted on the DataFrame, so
# it is scored on the DataFrame as well).

test_pred = modelo.predict_proba(x_test)

# Attach the true test labels to the scores for the KS computation.
# Column 0 of predict_proba is the probability of the first class
# (class 0, assuming the classes are [0, 1]). The KS statistic is the same
# whichever of the two complementary columns is used.

test_result = pd.DataFrame(y_test_np, columns=['Target'])
test_result['Score'] = test_pred[:,0]

# Split the scored test set into goods (Target == 0) and bads (Target == 1).

n_good = test_result[test_result['Target'] == 0]
n_bad  = test_result[test_result['Target'] == 1]

# KS on the test set: two-sample Kolmogorov-Smirnov statistic between the two
# score distributions, expressed in percent with one decimal.

ks_test = round(100*stats.ks_2samp(n_good['Score'], n_bad['Score'])[0],1)
print('KS na base de teste =', ks_test)

In [None]:
print("# Grid Search de certos hiperparâmetros LGBM...")
print()

# Parameter tuning: base LightGBM classifier plus the grid of candidate values.
lgb_model = lgb.LGBMClassifier(is_unbalance = True, seed=32467, silent=False)

param_dist= {"boosting_type": ['dart','goss'],
             "max_depth": [10,30,50],
             "min_child_weight" : [1,3,6],
             "n_estimators": [200],
             "learning_rate": [0.05, 0.1,0.16]}

# 5-fold cross-validated exhaustive search, ranked by ROC AUC, using all cores;
# verbose=10 makes the search log its progress.
grid_search2 = GridSearchCV(lgb_model, param_grid=param_dist, scoring='roc_auc', cv=5, verbose=10, n_jobs=-1, return_train_score=True)

# Fit on plain NumPy arrays. `.values` replaces `as_matrix()`, which was
# deprecated in pandas 0.23 and removed in pandas 1.0.
# `fit` returns the GridSearchCV object itself, so `modelo2` exposes
# `best_params_`, `cv_results_` and `predict_proba`.
modelo2 = grid_search2.fit(x_train.values, y_train.values.ravel())

print("Melhores parâmetros do grid:")
print()
print(modelo2.best_params_)
print()
print("AUROC na base de desenvolvimento (K-fold):")
print()
means = modelo2.cv_results_['mean_test_score']
stds = modelo2.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, modelo2.cv_results_['params']):
    # Mean CV score with a +/- 2 standard deviation band for each candidate.
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

In [None]:
# Test features and labels as NumPy arrays. `.values` replaces `as_matrix()`,
# which was deprecated in pandas 0.23 and removed in pandas 1.0.
x_test_np = x_test.values
y_test_np = y_test.values

# Model prediction on the test set. The grid search was fitted on NumPy arrays,
# so the model is scored on the NumPy version of the test features too.

test_pred2 = modelo2.predict_proba(x_test_np)

# Attach the true test labels to the scores for the KS computation.
# Column 0 of predict_proba is the probability of the first class
# (class 0, assuming the classes are [0, 1]). The KS statistic is the same
# whichever of the two complementary columns is used.

test_result2 = pd.DataFrame(y_test_np, columns=['Target'])
test_result2['Score'] = test_pred2[:,0]

# Split the scored test set into goods (Target == 0) and bads (Target == 1).

n_good2 = test_result2[test_result2['Target'] == 0]
n_bad2  = test_result2[test_result2['Target'] == 1]

# KS on the test set, in percent with one decimal. Scaling to percent before
# rounding avoids floating-point noise in the printed value (rounding first and
# multiplying by 100 afterwards can print e.g. 53.300000000000004) and matches
# the formula used for the XGBoost model.

ks_test2 = round(100*stats.ks_2samp(n_good2['Score'], n_bad2['Score'])[0],1)
print('KS na base de teste =', ks_test2)