In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC

In [2]:
data = pd.read_csv('train.csv')
test = pd.read_csv('test_bqCt9Pv.csv')
data.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default
0,420825,50578,58400,89.55,67,22807,45,1441,1/1/1984,Salaried,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
1,537409,47145,65550,73.23,67,22807,45,1502,31-07-85,Self employed,...,0,0,1991,0,0,1,1yrs 11mon,1yrs 11mon,0,1
2,417566,53278,61360,89.63,67,22807,45,1497,24-08-85,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
3,624493,57513,66113,88.48,67,22807,45,1501,30-12-93,Self employed,...,0,0,31,0,0,0,0yrs 8mon,1yrs 3mon,1,1
4,539055,52378,60300,88.39,67,22807,45,1495,12/9/1977,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,1,1


In [28]:
data.loan_default.value_counts()

0    182543
1     50611
Name: loan_default, dtype: int64

In [3]:
a = pd.to_datetime(data['Date.of.Birth'])
a = a.dt.strftime('%y')
a = a.astype(int)
b = pd.to_datetime(test['Date.of.Birth'])
b = b.dt.strftime('%y')
b = b.astype(int)
data.insert(8, value = a, column = 'B.year')
test.insert(8, value = b, column = 'B.year')

In [4]:
c = pd.to_datetime(data['DisbursalDate'])
c = c.dt.strftime('%d')
c = c.astype(int)
d = pd.to_datetime(test['DisbursalDate'])
d = d.dt.strftime('%d')
d = d.astype(int)
data.insert(9, value = c, column = 'DM')
test.insert(9, value = d, column = 'DM')

In [5]:
e = []
for day in data.DM:
    if 11 <= day <= 20:
        e.append('GR1')
    else:
        e.append('GR2')

In [6]:
f = []
for day in test.DM:
    if 11 <= day <= 20:
        f.append('GR1')
    else:
        f.append('GR2')

In [7]:
data.insert(10, value = e, column = 'DisbursalGroup')
test.insert(10, value = f, column = 'DisbursalGroup')

In [8]:
data['Employment.Type'].fillna(value = 'Unknown', inplace = True )
test['Employment.Type'].fillna(value = 'Unknown', inplace = True )

In [9]:
data.columns

Index(['UniqueID', 'disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID', 'B.year', 'DM',
       'DisbursalGroup', 'Date.of.Birth', 'Employment.Type', 'DisbursalDate',
       'State_ID', 'Employee_code_ID', 'MobileNo_Avl_Flag', 'Aadhar_flag',
       'PAN_flag', 'VoterID_flag', 'Driving_flag', 'Passport_flag',
       'PERFORM_CNS.SCORE', 'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS',
       'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE',
       'PRI.SANCTIONED.AMOUNT', 'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS',
       'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE',
       'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT',
       'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
       'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'AVERAGE.ACCT.AGE',
       'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES', 'loan_default'],
      dtype='object')

In [9]:
data = pd.get_dummies(data, columns = ['Employment.Type', 'PERFORM_CNS.SCORE.DESCRIPTION', 'DisbursalGroup'])
test = pd.get_dummies(test, columns = ['Employment.Type', 'PERFORM_CNS.SCORE.DESCRIPTION', 'DisbursalGroup'])

In [10]:
data['ad'] = data.Aadhar_flag + data.Driving_flag
test['ad'] = test.Aadhar_flag + test.Driving_flag

In [11]:
skip = ['Date.of.Birth', 'DisbursalDate','DM', 
        'PERFORM_CNS.SCORE.DESCRIPTION_Not Scored: More than 50 active Accounts found','CREDIT.HISTORY.LENGTH',
       'AVERAGE.ACCT.AGE']
y = ['loan_default']
X = [x for x in data.columns if x not in y+skip]

In [12]:
xtr = data[X]
ytr = data[y]
xts = test[X]
print(xtr.shape)
print(ytr.shape)
print(xts.shape)

(233154, 60)
(233154, 1)
(112392, 60)


In [13]:
xtr, xv, ytr, yv = train_test_split(xtr, ytr, test_size = 0.20, random_state = 0)
print(xtr.shape)
print(ytr.shape)
print(xv.shape)
print(yv.shape)

(186523, 60)
(186523, 1)
(46631, 60)
(46631, 1)


In [15]:
model_lr = LogisticRegression().fit(xtr, ytr)
yp = model_lr.predict_proba(xtr)
yp = yp[:,1]
roc_auc_score(ytr, yp)

0.5717407803048926

In [188]:
ypts = model_lr.predict_proba(xts)
ypts = ypts[:,1]
sub = test[['UniqueID']]
sub['loan_default'] = ypts
sub.to_csv('sub_lr2.csv', index = False)

In [238]:
model_xgb.get_params

<bound method XGBModel.get_params of XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)>

In [16]:
model_xgb = XGBClassifier(colsample_bylevel=0.85, n_estimators= 200, max_depth= 2, reg_lambda=3.5,
                         reg_alpha=1.5, min_child_weight= 6, max_delta_step = 8).fit(xtr,ytr)
yp = model_xgb.predict_proba(xtr)
yp = yp[:,1]
print('Model Report:')
print('Train auc:', roc_auc_score(ytr, yp))
ypv = model_xgb.predict_proba(xv)
ypv = ypv[:,1]
print('Validation auc:', roc_auc_score(yv, ypv))

Model Report:
Train auc: 0.6643366171978543
Validation auc: 0.6528054884944351


In [17]:
yptr = model_xgb.predict(xtr)
confusion_matrix(ytr, yptr)

array([[145803,    126],
       [ 40425,    169]], dtype=int64)

In [122]:
ypvc = model_xgb.predict(xv)
confusion_matrix(yv,ypvc)

array([[36614,     0],
       [10017,     0]], dtype=int64)

In [123]:
ypts = model_xgb.predict_proba(xts)
ypts = ypts[:,1]
sub = test[['UniqueID']]
sub['loan_default'] = ypts
sub.to_csv('sub_xgb12.csv', index = False)

In [125]:
xtr, xv, ytr, yv = train_test_split(xtr, ytr, test_size = 0.2, random_state = 0)
print(xtr.shape)
print(ytr.shape)
print(xv.shape)
print(yv.shape)

(186523, 60)
(186523, 1)
(46631, 60)
(46631, 1)


In [126]:
train_data = lgb.Dataset(xtr, label = ytr)
val_data = lgb.Dataset(xv, label = yv)

In [127]:
param = {"num_leaves": 2,
         "learning_rate": 0.1,
         "objective":'binary', 
         "scale_pos_weight": 0.23,
         "metric": 'auc',
        "bagging_fraction" : 0.5,
        "feature_fraction" : 1,
        "bagging_freq" : 2,
        "reg_alpha" : 4.5,
        "reg_lambda" : 1.5,
        "random_state" : 0,
        "bagging_seed" : 300,
        "verbosity" : -1,
        "max_depth": 1,
        "min_child_samples": 50,
        "save_binary": True}
mlgb = lgb.train(param, train_data, 200000, valid_sets = [val_data], early_stopping_rounds = 4000, verbose_eval=2000)

Training until validation scores don't improve for 4000 rounds.
[2000]	valid_0's auc: 0.654262
[4000]	valid_0's auc: 0.65617
[6000]	valid_0's auc: 0.656935
[8000]	valid_0's auc: 0.65733
[10000]	valid_0's auc: 0.657891
[12000]	valid_0's auc: 0.658329
[14000]	valid_0's auc: 0.658267
[16000]	valid_0's auc: 0.65848
[18000]	valid_0's auc: 0.658656
[20000]	valid_0's auc: 0.658932
[22000]	valid_0's auc: 0.659342
[24000]	valid_0's auc: 0.659298
[26000]	valid_0's auc: 0.659506
[28000]	valid_0's auc: 0.659335
[30000]	valid_0's auc: 0.659475
Early stopping, best iteration is:
[26120]	valid_0's auc: 0.659693


In [128]:
yptr = mlgb.predict(xtr, num_iteration = mlgb.best_iteration)
roc_auc_score(ytr, yptr)

0.6809313245495154

In [129]:
ypts = mlgb.predict(xts, num_iteration = mlgb.best_iteration)
sub = test[['UniqueID']]
sub['loan_default'] = ypts
sub.to_csv('sub_mlgb23.csv', index = False)

In [822]:
def bayes_parameter_opt_lgb(xtr, ytr, init_round=20, opt_round=20, n_folds=2, random_seed=300, n_estimators=500000, 
                            learning_rate=0.1, max_depth =1, num_leaves = 2, output_process=False):
    train_data = lgb.Dataset(xtr, label = ytr)
    def lgb_crossval(bagging_fraction,feature_fraction,reg_alpha,reg_lambda,min_child_samples,scale_pos_weight):
        params = { "objective":'binary',
             "num_leaves": num_leaves,
             "learning_rate": learning_rate,
             "num_iterations": n_estimators,
             "metric": 'auc',
             "scale_pos_weight": scale_pos_weight,
             "bagging_fraction" : max(min(bagging_fraction,1), 0),
             "feature_fraction" : max(min(feature_fraction,1), 0),
             "reg_alpha" : max(reg_alpha,0),
             "reg_lambda" : max(reg_lambda,0),
             "random_state" : 0,
             "bagging_seed" : random_seed,
             "verbosity" : -1,
             "max_depth": max_depth,
             "min_child_samples": int(min_child_samples),
             "early_stopping_round": 4000,
             "save_binary": True}
        cv_result = lgb.cv(params, train_data, nfold = n_folds, stratified = True, seed = random_seed, verbose_eval = 20000, 
                           metrics = ['auc'])
        return max(cv_result['auc-mean'])
    lgb_opt = BayesianOptimization(lgb_crossval,{"bagging_fraction":(0.1,1),
                                            "feature_fraction":(0.1,1),
                                            "reg_alpha":(0.1,5.5),
                                            "reg_lambda":(0.1,5.5),
                                            "scale_pos_weight": (0.1,0.9),
                                            "min_child_samples":(20,200)},
                                            random_state = 0)
    lgb_opt.maximize(init_points = init_round, n_iter = opt_round)
    return lgb_opt.max
opt_params = bayes_parameter_opt_lgb(xtr, ytr, init_round=20, opt_round=20, n_folds=2, random_seed=300, n_estimators=500000, 
                                     learning_rate=0.1)

|   iter    |  target   | baggin... | featur... | min_ch... | reg_alpha | reg_la... | scale_... |
-------------------------------------------------------------------------------------------------
[20000]	cv_agg's auc: 0.662123 + 0.00124179
[40000]	cv_agg's auc: 0.662391 + 0.000974952
| [0m 1       [0m | [0m 0.6624  [0m | [0m 0.5939  [0m | [0m 0.7437  [0m | [0m 128.5   [0m | [0m 3.042   [0m | [0m 2.388   [0m | [0m 0.6167  [0m |
[20000]	cv_agg's auc: 0.662012 + 0.00123188
| [0m 2       [0m | [0m 0.6622  [0m | [0m 0.4938  [0m | [0m 0.9026  [0m | [0m 193.5   [0m | [0m 2.171   [0m | [0m 4.375   [0m | [0m 0.5231  [0m |
[20000]	cv_agg's auc: 0.661602 + 0.00098658
| [0m 3       [0m | [0m 0.6617  [0m | [0m 0.6112  [0m | [0m 0.933   [0m | [0m 32.79   [0m | [0m 0.5705  [0m | [0m 0.2092  [0m | [0m 0.7661  [0m |
[20000]	cv_agg's auc: 0.662101 + 0.00136284
[40000]	cv_agg's auc: 0.662379 + 0.00107501
| [95m 4       [0m | [95m 0.6624  [0m | [95m 0.8

[20000]	cv_agg's auc: 0.662061 + 0.00144171
[40000]	cv_agg's auc: 0.662401 + 0.00117947
[60000]	cv_agg's auc: 0.662464 + 0.00104493
| [95m 22      [0m | [95m 0.6625  [0m | [95m 0.3749  [0m | [95m 0.5798  [0m | [95m 20.0    [0m | [95m 4.876   [0m | [95m 5.407   [0m | [95m 0.6296  [0m |
[20000]	cv_agg's auc: 0.662146 + 0.00137723
[40000]	cv_agg's auc: 0.662465 + 0.00109192
| [95m 23      [0m | [95m 0.6625  [0m | [95m 0.1657  [0m | [95m 0.3337  [0m | [95m 20.12   [0m | [95m 5.336   [0m | [95m 5.234   [0m | [95m 0.8998  [0m |
[20000]	cv_agg's auc: 0.662124 + 0.00142519
[40000]	cv_agg's auc: 0.662445 + 0.00115747
[60000]	cv_agg's auc: 0.662499 + 0.00102638
| [95m 24      [0m | [95m 0.6625  [0m | [95m 0.3128  [0m | [95m 0.8163  [0m | [95m 20.28   [0m | [95m 5.322   [0m | [95m 0.1484  [0m | [95m 0.757   [0m |
[20000]	cv_agg's auc: 0.661899 + 0.00109505
| [0m 25      [0m | [0m 0.662   [0m | [0m 0.9356  [0m | [0m 0.9101  [0m | [0m 20.05  

In [823]:
print(opt_params)

{'target': 0.6625086045198991, 'params': {'bagging_fraction': 0.3128269186616841, 'feature_fraction': 0.8163226880143511, 'min_child_samples': 20.283533702657603, 'reg_alpha': 5.322253641935961, 'reg_lambda': 0.14836246605938477, 'scale_pos_weight': 0.7570120593192112}}


In [838]:
param = {"num_leaves": 2,
         "learning_rate": 0.1,
         "objective":'binary', 
         "scale_pos_weight": 0.7570120593192112,
         "metric": 'auc',
        "bagging_fraction" : 0.3128269186616841,
        "feature_fraction" : 0.8163226880143511,
        "bagging_freq" : 2,
        "reg_alpha" : 5.322253641935961,
        "reg_lambda" : 0.14836246605938477,
        "random_state" : 0,
        "bagging_seed" : 300,
        "verbosity" : -1,
        "max_depth": 1,
        "min_child_samples": 20,
        "save_binary": True}
mlgb = lgb.train(param, train_data, 500000, valid_sets = [val_data], early_stopping_rounds = 4000, verbose_eval=2000)

Training until validation scores don't improve for 4000 rounds.
[2000]	valid_0's auc: 0.654532
[4000]	valid_0's auc: 0.656064
[6000]	valid_0's auc: 0.657139
[8000]	valid_0's auc: 0.657487
[10000]	valid_0's auc: 0.656973
[12000]	valid_0's auc: 0.657571
[14000]	valid_0's auc: 0.657525
[16000]	valid_0's auc: 0.657316
[18000]	valid_0's auc: 0.65796
[20000]	valid_0's auc: 0.658329
[22000]	valid_0's auc: 0.657599
Early stopping, best iteration is:
[18787]	valid_0's auc: 0.658453
