In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn import model_selection, metrics
from sklearn.model_selection import GridSearchCV

In [3]:
import lightgbm as lgb
print(lgb.__version__)
!python --version

2.2.2
Python 3.7.2


## 1. _Data preparation_

In [4]:
X_train = pd.read_csv('../../03-boosting/train_medium.csv')
y_train = X_train['Disbursed']

X_test = pd.read_csv('../../03-boosting/test_medium.csv')
y_test = X_test['Disbursed']

X_train, X_test = [ x.drop('Disbursed', 1) for x in [X_train, X_test] ]

[ a.shape for a in [X_train, y_train, X_test, y_test] ]

[(65265, 50), (65265,), (21755, 50), (21755,)]

In [5]:
X_train.dtypes.value_counts()

int64      47
float64     3
dtype: int64

In [6]:
y_test.value_counts()

0.0    21431
1.0      324
Name: Disbursed, dtype: int64

In [7]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,Existing_EMI,Loan_Amount_Applied,Loan_Tenure_Applied,Monthly_Income,Var4,Var5,Age,EMI_Loan_Submitted_Missing,Interest_Rate_Missing,...,Var2_2,Var2_3,Var2_4,Var2_5,Var2_6,Mobile_Verified_0,Mobile_Verified_1,Source_0,Source_1,Source_2
0,7881,0.0,100000.0,3.0,29000,4,11,29,0,0,...,0,0,0,0,0,0,1,0,1,0
1,189,21000.0,450000.0,5.0,48000,1,0,27,1,1,...,0,0,0,0,0,1,0,0,1,0
2,64000,3832.0,100000.0,3.0,25000,5,11,44,0,0,...,0,0,0,0,1,0,1,1,0,0
3,61629,0.0,200000.0,3.0,22500,3,1,26,1,1,...,0,0,0,0,1,0,1,1,0,0
4,23212,0.0,50000.0,1.0,17943,1,0,25,1,1,...,0,0,0,0,0,1,0,0,1,0


## 2. _Basic classifier_

In [8]:
from lightgbm import LGBMClassifier

In [9]:
model = LGBMClassifier(num_leaves=30, learning_rate=0.05, n_estimators=20)

In [10]:
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=25,
    verbose=False
)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.05, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=20, n_jobs=-1, num_leaves=30, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [11]:
def print_scores(dataset_name, X, y_true, y_pred, y_proba):
    print("[{}]".format(dataset_name))
    print("Accuracy : {:.3f}".format(metrics.accuracy_score(y_true, y_pred)))
    print("AUC Score: {:.3f}".format(metrics.roc_auc_score(y_true, y_proba)))
    print(metrics.classification_report(y_true, y_pred, digits=3))
    print(60 * "*")

In [12]:
print_scores("train", X_train, y_train, model.predict(X_train), model.predict_proba(X_train)[:,1])

print_scores("test", X_test, y_test, model.predict(X_test), model.predict_proba(X_test)[:,1])

[train]
Accuracy : 0.985
AUC Score: 0.883
              precision    recall  f1-score   support

         0.0      0.985     1.000     0.993     64316
         1.0      0.000     0.000     0.000       949

   micro avg      0.985     0.985     0.985     65265
   macro avg      0.493     0.500     0.496     65265
weighted avg      0.971     0.985     0.978     65265

************************************************************


  'precision', 'predicted', average, warn_for)


[test]
Accuracy : 0.985
AUC Score: 0.829
              precision    recall  f1-score   support

         0.0      0.985     1.000     0.992     21431
         1.0      0.000     0.000     0.000       324

   micro avg      0.985     0.985     0.985     21755
   macro avg      0.493     0.500     0.496     21755
weighted avg      0.970     0.985     0.978     21755

************************************************************


  'precision', 'predicted', average, warn_for)


В общем, с объектами класса <b>1</b> тоже не справляется.

In [13]:
feature_idx = np.argsort(model.feature_importances_)[::-1]

list(zip(X_train.columns[feature_idx], model.feature_importances_[feature_idx]))

[('Monthly_Income', 125),
 ('Unnamed: 0', 74),
 ('Var5', 66),
 ('Age', 58),
 ('Existing_EMI', 50),
 ('Loan_Amount_Applied', 46),
 ('Var4', 36),
 ('Loan_Tenure_Applied', 16),
 ('Source_2', 12),
 ('Source_0', 11),
 ('Var1_2', 11),
 ('Loan_Amount_Submitted_Missing', 10),
 ('Filled_Form_0', 10),
 ('Device_Type_0', 8),
 ('Source_1', 6),
 ('Var1_11', 6),
 ('Var1_3', 5),
 ('Gender_0', 5),
 ('Var1_9', 3),
 ('Var1_8', 3),
 ('Var2_2', 3),
 ('EMI_Loan_Submitted_Missing', 3),
 ('Var1_10', 2),
 ('Processing_Fee_Missing', 2),
 ('Var1_15', 2),
 ('Mobile_Verified_1', 2),
 ('Device_Type_1', 2),
 ('Filled_Form_1', 1),
 ('Var1_1', 1),
 ('Var1_5', 1),
 ('Interest_Rate_Missing', 0),
 ('Var2_0', 0),
 ('Var1_18', 0),
 ('Mobile_Verified_0', 0),
 ('Var2_6', 0),
 ('Var2_5', 0),
 ('Var2_4', 0),
 ('Var2_3', 0),
 ('Var1_17', 0),
 ('Var2_1', 0),
 ('Var1_0', 0),
 ('Loan_Tenure_Submitted_Missing', 0),
 ('Var1_14', 0),
 ('Var1_13', 0),
 ('Var1_12', 0),
 ('Var1_7', 0),
 ('Var1_6', 0),
 ('Var1_4', 0),
 ('Gender_1', 0),


## 3. _Hyperparams_

Попробуем подобрать параметры, используя метрику, более ориентированную на **recall** для класса <b>1</b>.

In [14]:
beta = 2.

fbeta_scorer = metrics.make_scorer(metrics.fbeta_score, pos_label=1, average='binary', beta=beta)

In [15]:
%%time

param_grid = {
    'learning_rate' : [0.1, 0.05, 0.01],
    'num_leaves' : [30, 100, 500],
    'n_estimators' : [10, 300]
}

gs = GridSearchCV(LGBMClassifier(random_seed=199), param_grid, scoring=fbeta_scorer, n_jobs=-1, iid=False, cv=5)

gs.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=25, verbose=False)

Wall time: 1min 11s


In [16]:
model = gs.best_estimator_

gs.best_params_, gs.best_score_

({'learning_rate': 0.1, 'n_estimators': 300, 'num_leaves': 30},
 0.010398371322383537)

In [17]:
model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=300, n_jobs=-1, num_leaves=30, objective=None,
        random_seed=199, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)

In [18]:
print_scores("train", X_train, y_train, model.predict(X_train), model.predict_proba(X_train)[:,1])

print_scores("test", X_test, y_test, model.predict(X_test), model.predict_proba(X_test)[:,1])

[train]
Accuracy : 0.986
AUC Score: 0.934
              precision    recall  f1-score   support

         0.0      0.986     1.000     0.993     64316
         1.0      0.846     0.012     0.023       949

   micro avg      0.986     0.986     0.986     65265
   macro avg      0.916     0.506     0.508     65265
weighted avg      0.984     0.986     0.979     65265

************************************************************
[test]
Accuracy : 0.985
AUC Score: 0.832
              precision    recall  f1-score   support

         0.0      0.985     1.000     0.992     21431
         1.0      0.250     0.003     0.006       324

   micro avg      0.985     0.985     0.985     21755
   macro avg      0.618     0.501     0.499     21755
weighted avg      0.974     0.985     0.978     21755

************************************************************


Hемного удалось улучшить **recall** по классу <b>1</b>, но в целом тоже слабовато.