In [24]:

from time import gmtime, strftime
import gc
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import *
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix,roc_auc_score,roc_curve,precision_score, recall_score, accuracy_score
import seaborn as sns

from xgboost import XGBRFClassifier
import lightgbm as lgb
from imblearn.over_sampling import SMOTE, ADASYN

In [25]:

# Locations of the labelled training file and the unlabelled test file.
# NOTE: these are absolute paths on the author's machine.
TRAIN_CSV = 'E:/ML_IP/ML_projects/datastorm/credit_card_default_train.csv'
TEST_CSV = 'E:/ML_IP/ML_projects/datastorm/credit_card_default_test.csv'

# Read both files into DataFrames (train first, then test).
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [26]:
# Columns handled as categorical, plus the label and identifier column names.
cat_cols = [
    'Gender', 'EDUCATION_STATUS', 'MARITAL_STATUS', 'AGE',
    'PAY_JULY', 'PAY_AUG', 'PAY_SEP', 'PAY_OCT', 'PAY_NOV', 'PAY_DEC',
]
target = 'NEXT_MONTH_DEFAULT'
ID = 'Client_ID'

# Every remaining training column (in file order) counts as numeric.
non_numeric = set(cat_cols) | {target, ID}
num_cols = [name for name in train_data.columns.tolist() if name not in non_numeric]

In [27]:
def Paid_Due_July(row):
    """Return July's due amount divided by July's paid amount for one record.

    `row` is indexed by the keys 'PAID_AMT_JULY' and 'DUE_AMT_JULY'.
    If the paid amount equals zero, the due amount is returned as-is
    instead of dividing.
    """
    paid = row['PAID_AMT_JULY']
    due = row['DUE_AMT_JULY']
    return due if paid == 0 else due / paid

def Paid_Due_Aug(row):
    """Return August's due amount divided by August's paid amount for one record.

    `row` is indexed by the keys 'PAID_AMT_AUG' and 'DUE_AMT_AUG'.
    If the paid amount equals zero, the due amount is returned as-is
    instead of dividing.
    """
    paid = row['PAID_AMT_AUG']
    due = row['DUE_AMT_AUG']
    return due if paid == 0 else due / paid

def Paid_Due_Sep(row):
    """Return September's due amount divided by September's paid amount for one record.

    `row` is indexed by the keys 'PAID_AMT_SEP' and 'DUE_AMT_SEP'.
    If the paid amount equals zero, the due amount is returned as-is
    instead of dividing.
    """
    paid = row['PAID_AMT_SEP']
    due = row['DUE_AMT_SEP']
    return due if paid == 0 else due / paid

def Paid_Due_Oct(row):
    """Return October's due amount divided by October's paid amount for one record.

    `row` is indexed by the keys 'PAID_AMT_OCT' and 'DUE_AMT_OCT'.
    If the paid amount equals zero, the due amount is returned as-is
    instead of dividing.
    """
    paid = row['PAID_AMT_OCT']
    due = row['DUE_AMT_OCT']
    return due if paid == 0 else due / paid

def Paid_Due_Nov(row):
    """Return November's due amount divided by November's paid amount for one record.

    `row` is indexed by the keys 'PAID_AMT_NOV' and 'DUE_AMT_NOV'.
    If the paid amount equals zero, the due amount is returned as-is
    instead of dividing.
    """
    paid = row['PAID_AMT_NOV']
    due = row['DUE_AMT_NOV']
    return due if paid == 0 else due / paid

def Paid_Due_Dec(row):
    """Return December's due amount divided by December's paid amount for one record.

    `row` is indexed by the keys 'PAID_AMT_DEC' and 'DUE_AMT_DEC'.
    If the paid amount equals zero, the due amount is returned as-is
    instead of dividing.
    """
    paid = row['PAID_AMT_DEC']
    due = row['DUE_AMT_DEC']
    return due if paid == 0 else due / paid

In [28]:
# Output column -> row-wise function that computes it, in month order.
paid_due_features = [
    ('PAID_DUE_JULY', Paid_Due_July),
    ('PAID_DUE_AUG', Paid_Due_Aug),
    ('PAID_DUE_SEP', Paid_Due_Sep),
    ('PAID_DUE_OCT', Paid_Due_Oct),
    ('PAID_DUE_NOV', Paid_Due_Nov),
    ('PAID_DUE_DEC', Paid_Due_Dec),
]

# Add the six due/paid columns to the training frame first, then to the test frame.
for frame in (train_data, test_data):
    for column, ratio_fn in paid_due_features:
        frame[column] = frame.apply(ratio_fn, axis=1)

In [29]:
# PAY_TOT is the element-wise sum of the six monthly PAY_* columns,
# accumulated left to right for both the training and the test frame.
pay_status_cols = ['PAY_JULY', 'PAY_AUG', 'PAY_SEP', 'PAY_OCT', 'PAY_NOV', 'PAY_DEC']

for frame in (train_data, test_data):
    running_total = frame[pay_status_cols[0]]
    for column in pay_status_cols[1:]:
        running_total = running_total + frame[column]
    frame['PAY_TOT'] = running_total

In [30]:
def isZero(row):
    """Return 1 when the record's 'PAY_TOT' value equals zero, otherwise 0."""
    return 1 if row['PAY_TOT'] == 0 else 0

In [31]:
def isZero(row):
    """Flag a record whose 'PAY_TOT' entry is zero.

    Returns 1 for a zero total and 0 for anything else.

    NOTE: a function with this same name is also defined in the preceding
    cell; being later in the notebook, this definition is the one that
    remains bound to `isZero`.
    """
    if row['PAY_TOT'] == 0:
        return 1
    return 0

In [32]:
# One-hot encode the profile columns listed below.
#
# The encoder is fitted on the training frame ONLY and then reused to
# transform the test frame, so both encoded frames have the same columns in
# the same order. Fitting a separate encoder on the test frame produces a
# different column layout whenever the two files do not contain exactly the
# same set of category values, which misaligns the features at prediction time.
onehot_cols = ['Balance_Limit_V1','EDUCATION_STATUS','MARITAL_STATUS','AGE','Gender']

# handle_unknown='ignore': a category that appears only in the test frame is
# encoded as all zeros instead of raising.
enc = OneHotEncoder(handle_unknown='ignore')

# Reuse each source frame's index so a later column-wise concat lines up row by row.
train_df = pd.DataFrame(enc.fit_transform(train_data[onehot_cols]).toarray(),
                        index=train_data.index)
test_df = pd.DataFrame(enc.transform(test_data[onehot_cols]).toarray(),
                       index=test_data.index)

In [33]:
# Append the one-hot columns to the frames they were derived from.
# NOTE: this rebinds train_data/test_data, so re-running the cell appends the
# encoded columns a second time.
train_data = pd.concat([train_data,train_df],axis=1)
test_data = pd.concat([test_data,test_df],axis=1)

# The raw columns that were one-hot encoded are dropped from the feature
# matrix, together with the label and the client identifier.
encoded_source_cols = ['Balance_Limit_V1','Gender','EDUCATION_STATUS','MARITAL_STATUS','AGE']

# 70/30 hold-out split. random_state pins the shuffle so the split, and every
# metric computed from it, is the same on each run.
x_train, x_test, y_train, y_test = train_test_split(
    train_data.drop([target, ID] + encoded_source_cols, axis=1),
    train_data[target],
    test_size=0.30,
    random_state=42,
)

In [38]:
# Train a LightGBM DART booster on the training split.
d_train = lgb.Dataset(x_train, label=y_train)

# Each LightGBM parameter is specified exactly once.
# - feature_fraction is given through its `sub_feature` alias only. Do not
#   also set `colsample_bytree`: it is another alias of the same parameter,
#   and a value of 0 is outside the documented (0, 1] range.
# - The number of boosting rounds is passed to lgb.train() below rather than
#   as `n_estimators` inside the dict; when both are present lgb.train() warns
#   and the dict value takes precedence over the argument.
params_best = {
    'learning_rate': 0.1,
    'boosting_type': 'dart',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'sub_feature': 0.7,      # alias of feature_fraction
    'num_leaves': 18,
    'min_data': 70,          # alias of min_data_in_leaf
    'max_depth': 510,
    'max_bin': 120,
}

# 110 rounds: the value that was effectively used while `n_estimators` sat in the params.
boosting_rounds = 110
clf = lgb.train(params_best, d_train, num_boost_round=boosting_rounds)

#Prediction
# Scores for the hold-out split, as returned by Booster.predict.
test_probs = clf.predict(x_test)


def avoid_prob(preds_lgb):
    """Convert predicted scores into hard 0/1 class labels.

    Parameters
    ----------
    preds_lgb : array-like of float
        Scores to threshold.

    Returns
    -------
    numpy.ndarray of int
        1 where the score is >= 0.5, otherwise 0.

    A new array is returned and the input is left untouched, so the caller's
    scores stay available after thresholding (e.g. for ROC/AUC).
    """
    # Vectorised comparison instead of an element-by-element in-place loop.
    return (np.asarray(preds_lgb) >= 0.5).astype(int)

# Hard 0/1 labels for the hold-out split and for the training split.
test_preds = avoid_prob(test_probs)
train_preds = avoid_prob(clf.predict(x_train))

# Same layout for each split (training first, then hold-out):
# classification report, a blank separator, then the confusion matrix.
# The metric functions used here are the ones imported at the top of the notebook.
for y_true, y_hat in ((y_train, train_preds), (y_test, test_preds)):
    print(classification_report(y_true, y_hat))
    print ('\n')
    print(confusion_matrix(y_true, y_hat))

print("Test_Accuracy = {}".format(accuracy_score(y_test, test_preds)))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89     13029
           1       0.70      0.38      0.49      3771

    accuracy                           0.82     16800
   macro avg       0.77      0.67      0.69     16800
weighted avg       0.81      0.82      0.80     16800



[[12418   611]
 [ 2334  1437]]
              precision    recall  f1-score   support

           0       0.84      0.95      0.89      5641
           1       0.67      0.37      0.47      1559

    accuracy                           0.82      7200
   macro avg       0.76      0.66      0.68      7200
weighted avg       0.81      0.82      0.80      7200



[[5357  284]
 [ 985  574]]
Test_Accuracy = 0.82375


In [39]:
#use gridsearchcv

# Constructor arguments for the base estimator that is handed to the grid search.
base_estimator_kwargs = dict(
    boosting_type='gbdt',
    objective='binary',
    n_jobs=5,
    silent=True,
    max_depth=10,
)
mdl = lgb.LGBMClassifier(**base_estimator_kwargs)

# Show the names of the parameters the estimator exposes:
mdl.get_params().keys()

dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq'])

In [40]:
# Parameter grid for GridSearchCV. Every entry lists a single value,
# so the grid contains exactly one candidate.
params = {
    'learning_rate': [0.1],
    'boosting_type': ['dart'],
    'objective': ['binary'],
    'metric': ['binary_logloss'],
    'sub_feature': [0.7],
    'num_leaves': [18],
    'min_data': [70],
    'max_depth': [510],
    'max_bin': [120],
    'n_estimators': [110],
}


# 4-fold cross-validated fit of `mdl` over `params` on the training split.
grid = GridSearchCV(estimator=mdl, param_grid=params, verbose=1, cv=4, n_jobs=-1)
grid.fit(x_train, y_train)

# Best parameter combination, then its cross-validation score, one per line.
print(grid.best_params_, grid.best_score_, sep='\n')

# NOTE: despite the name, this holds the output of predict() on the hold-out split.
probs = grid.predict(x_test)

Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:    3.8s remaining:    3.8s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    3.9s finished


{'boosting_type': 'dart', 'learning_rate': 0.1, 'max_bin': 120, 'max_depth': 510, 'metric': 'binary_logloss', 'min_data': 70, 'n_estimators': 110, 'num_leaves': 18, 'objective': 'binary', 'sub_feature': 0.7}
0.8172619047619047


In [41]:
# Alternative booster configuration.
# NOTE(review): nothing in the visible cells after this one reads params_best.
params_best = {
    'learning_rate': 0.1,
    'boosting_type': 'dart',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'sub_feature': 0.5,
    'num_leaves': 18,
    'min_data': 70,
    'max_depth': 5,
    'max_bin': 250,
    'n_estimators': 110,
    'colsample_bytree': 0,
    'subsample': 0.000001,
}

In [42]:
# Feature matrix for the competition test frame: everything except the client
# identifier and the raw columns listed in the drop call.
test_features = test_data.drop([ID,'Balance_Limit_V1','Gender','EDUCATION_STATUS','MARITAL_STATUS','AGE'],axis=1)
probs = grid.predict(test_features)

# Two-column frame: client identifier and the predicted value for the target.
sample_submission = pd.DataFrame({ID: test_data[ID], target: probs})

# Written without the row index. to_csv returns None when given a path,
# so `submission` ends up as None.
submission = sample_submission.to_csv('data-storm-day2-2.csv', index=None)


# The submission was previously written in float32 format. This has been corrected in this notebook.