In [1]:
# import libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from tqdm import tqdm

#
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import log_loss

In [38]:
# mean encoding
def target_mean_encoding(train, test, target, cols):
    """Add target-mean encoded versions of ``cols`` to ``train`` and ``test``.

    For every column ``c`` in ``cols`` a new column ``c + '_target_encoded'``
    is written:

    * train rows receive the mean target of their category computed on the
      *other* folds of a 5-fold stratified split (seeded with 2021), so a row
      never contributes its own label to its encoding;
    * test rows receive the category mean computed on all training rows.

    Categories for which no mean is available (not present in the rows the
    mean was computed from) fall back to the overall target mean instead of
    being left as NaN.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames containing ``cols``. Both are modified in place
        (new columns are added) and also returned.
    target : array-like
        Numeric labels, one per row of ``train``, in row order.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    tuple of (train, test)
    """
    # Work on a plain array so that fold indices are always positional,
    # whatever container the labels arrived in.
    y = np.asarray(target)
    prior = float(np.mean(y))
    kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 2021)

    # One out-of-fold buffer per column, filled positionally fold by fold.
    out_of_fold = {col: np.full(len(train), np.nan) for col in cols}

    # loop
    for train_index, val_index in tqdm(kf.split(train, y)):
        fold_y = pd.Series(y[train_index])

        for col in cols:
            # calculate stats on the fitting rows only
            fold_keys = train[col].iloc[train_index].to_numpy()
            mean = fold_y.groupby(fold_keys).mean()
            # mapping: kf.split yields positions, so index the buffer by
            # position instead of going through label-based .loc
            out_of_fold[col][val_index] = train[col].iloc[val_index].map(mean).to_numpy()

    full_y = pd.Series(y)
    for col in cols:
        # Full-data statistics are taken before the encoded column is added.
        full_mean = full_y.groupby(train[col].to_numpy()).mean()

        encoded = out_of_fold[col]
        encoded[np.isnan(encoded)] = prior
        train[col + '_target_encoded'] = encoded

        # for test set
        test[col + '_target_encoded'] = test[col].map(full_mean).fillna(prior)

    return train, test

# function for emi
def get_emi(p,r,n):
    """Return the equated instalment ``p*r*(1+r)**n / ((1+r)**n - 1)``.

    Parameters
    ----------
    p : scalar or array-like
        Principal.
    r : scalar or array-like
        Rate per period, as a fraction.
    n : scalar or array-like
        Number of periods.

    Returns
    -------
    Same kind of object the plain formula yields for the given inputs.
    Where ``r`` is exactly zero the formula degenerates to 0/0, so those
    entries are returned as ``p / n`` (the principal spread evenly over the
    periods) instead of NaN / ``ZeroDivisionError``; when at least one entry
    of an array-like ``r`` is zero the result is a NumPy array.
    """
    zero_rate = np.equal(r, 0)
    all_scalar = np.ndim(p) == 0 and np.ndim(r) == 0 and np.ndim(n) == 0
    if all_scalar and zero_rate:
        return p / n

    # (1 + r) ** n appears in both numerator and denominator; compute it once.
    growth = (1 + r) ** n
    denom = growth - 1
    if not np.any(zero_rate):
        return (p * r * growth) / denom

    # Mixed case: keep the zero-rate entries out of the division entirely so
    # no 0/0 is ever evaluated, then substitute p / n for them.
    safe_denom = np.where(zero_rate, 1.0, denom)
    return np.where(zero_rate, p / n, (p * r * growth) / safe_denom)

In [39]:
# import data: read the raw train / test files from the working directory
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 'ID' and 'Loan Status' are removed from both frames; the label itself is
# kept separately as a NumPy array taken from the train file.
non_feature_cols = ['ID', 'Loan Status']
target = train['Loan Status'].values
xtrain = train.drop(columns = non_feature_cols)
xtest = test.drop(columns = non_feature_cols)

for caption, frame in (('Train shape', xtrain), ('Test shape', xtest)):
    print(caption, frame.shape)

Train shape (67463, 33)
Test shape (28913, 33)


In [40]:
# open accounts divided by total accounts, added to both feature frames
for frame in (xtrain, xtest):
    open_accounts = frame['Open Account'].values
    total_accounts = frame['Total Accounts'].values
    frame['credit_line_ratio'] = open_accounts / total_accounts

In [41]:
# loan amount divided by total current balance, added to both feature frames
for frame in (xtrain, xtest):
    loan_amount = frame['Loan Amount'].values
    current_balance = frame['Total Current Balance'].values
    frame['balance_inc'] = loan_amount / current_balance

In [42]:
# get emi
# The interest rate is divided by 12 and by 100 before being used as the
# per-period rate (presumably annual percent -> monthly fraction; confirm
# against the data dictionary).
train_emi = get_emi(
    p = xtrain['Loan Amount'].values,
    r = xtrain['Interest Rate'].values/12/100,
    n = xtrain['Term'].values,
)
test_emi = get_emi(
    p = xtest['Loan Amount'].values,
    r = xtest['Interest Rate'].values/12/100,
    n = xtest['Term'].values,
)

# emi by loan amount: instalment expressed as a fraction of the principal
for frame, emi in ((xtrain, train_emi), (xtest, test_emi)):
    frame['inst_loan_ratio'] = emi / frame['Loan Amount'].values

In [43]:
# emp duration in years
# NOTE(review): the feature is derived from the 'Home Ownership' column, which
# is treated as numeric here while 'Employment Duration' is label-encoded as a
# category further down — presumably the two columns are swapped in the raw
# data; confirm against the data dictionary.
# The value is divided by 365 and then by 12 and truncated to an integer.
# np.int was removed from NumPy (1.24), so the cast is done with a vectorized
# astype instead of a per-element np.int(...) call.
EMP_DUR_CAP = 51  # every value above 50 is collapsed into this single bucket

for frame in (xtrain, xtest):
    duration = (frame['Home Ownership'] / 365 / 12).astype(np.int64)
    # cap value: for integers, clipping at 51 is the same as "> 50 -> 51"
    frame['emp_dur_in_years'] = duration.clip(upper = EMP_DUR_CAP)

In [44]:
# Label encode the categorical variables.
# Each encoder is fitted on the train column only and then applied to both
# the train and the test column, so test is mapped with train-derived classes.
cat_cols = [
    'Batch Enrolled',
    'Sub Grade',
    'Loan Title',
    'Employment Duration',
    'Initial List Status',
    'Verification Status',
    'Application Type',
    'emp_dur_in_years',
]

for col in tqdm(cat_cols):
    encoder = LabelEncoder().fit(xtrain[col])
    xtrain[col] = encoder.transform(xtrain[col])
    xtest[col] = encoder.transform(xtest[col])

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 10.17it/s]


In [45]:
# target encoding: adds a '<col>_target_encoded' column for each listed column
cols_for_target_encoding = [
    'Batch Enrolled',
    'Sub Grade',
    'Loan Title',
    'emp_dur_in_years',
]
xtrain, xtest = target_mean_encoding(
    train = xtrain,
    test = xtest,
    target = target,
    cols = cols_for_target_encoding,
)

5it [00:00,  6.01it/s]


In [46]:
# drop unused columns
# The same list is removed from both frames; it includes the raw versions of
# the four columns that were target-encoded above.
cols_to_drop = [
    'Grade', 'Payment Plan', 'Delinquency - two years', 'Inquires - six months',
    'Collection 12 months Medical', 'Application Type', 'Accounts Delinquent',
    'Term', 'Home Ownership', 'Employment Duration', 'emp_dur_in_years',
    'Batch Enrolled', 'Sub Grade', 'Loan Title',
]

xtrain, xtest = (frame.drop(columns = cols_to_drop) for frame in (xtrain, xtest))
print(f"{len(cols_to_drop)} columns droped")

14 columns droped


In [47]:
# define model
def run_model(train, target, val, test, trees, depth, seed):
    """Fit one random forest and score the validation and test features.

    Parameters
    ----------
    train, target : features and labels the forest is fitted on.
    val, test : feature sets to score with the fitted forest.
    trees : passed as ``n_estimators``.
    depth : passed as ``max_depth``.
    seed : passed as ``random_state``.

    Returns
    -------
    tuple
        ``(v_pred, t_pred)``: column 1 of ``predict_proba`` for ``val`` and
        for ``test`` respectively.
    """
    forest = RandomForestClassifier(
        n_estimators = trees,
        max_depth = depth,
        random_state = seed,
        n_jobs = -1,  # -1 is passed straight through to the estimator
    )
    forest.fit(train, target)

    # Score validation first, then test, keeping only column 1 of each result.
    val_scores, test_scores = (
        forest.predict_proba(features)[:, 1] for features in (val, test)
    )
    return val_scores, test_scores

In [48]:
# Train model
# Stratified K-fold CV; inside every fold several forests with different
# settings are fitted and their predictions averaged. The per-fold test
# predictions are then averaged over the folds.
FOLDS = 5
# one entry per forest: [trees, depth, seed]
forest_configs = [[456, 5, 42], [500, 6, 21], [600, 7, 84]]
bags = len(forest_configs)
loss = []
test_pred = 0

skf = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = 42)
for fold, (fit_idx, holdout_idx) in enumerate(skf.split(xtrain, target), start = 1):
    train_x, train_y = xtrain.iloc[fit_idx, :], target[fit_idx]
    val_x, val_y = xtrain.iloc[holdout_idx, :], target[holdout_idx]

    # accumulate the predictions of every forest, then average them
    bag_val, bag_test = 0, 0
    for trees, depth, seed in forest_configs:
        v_pred, t_pred = run_model(train_x, train_y, val_x, xtest, trees, depth, seed)
        bag_val = bag_val + v_pred
        bag_test = bag_test + t_pred
    bag_val, bag_test = bag_val/bags, bag_test/bags

    # score the averaged validation prediction and keep the fold's test prediction
    val_log_loss = log_loss(val_y, bag_val)
    loss.append(val_log_loss)
    test_pred = test_pred + bag_test
    print(f"Fold : {fold}/{FOLDS}, Validation Log Loss : {val_log_loss}")

# average the test predictions over the folds
test_pred = test_pred/FOLDS
print('...................','\n')
print(f"Average log loss on val data : {np.mean(loss)}, Std dev : {np.std(loss)}")
print('Done...................')

Fold : 1/5, Validation Log Loss : 0.30795511275957793
Fold : 2/5, Validation Log Loss : 0.3078670162612225
Fold : 3/5, Validation Log Loss : 0.30799979405731487
Fold : 4/5, Validation Log Loss : 0.3077773922358681
Fold : 5/5, Validation Log Loss : 0.3077855558585032
................... 

Average log loss on val data : 0.30787697423449734, Std dev : 8.895323842429187e-05
Done...................


In [49]:
# make submission: load the template file, overwrite its 'Loan Status' column
# with the averaged test predictions, and write the result without the index
sub = pd.read_csv('submission.csv').assign(**{'Loan Status': test_pred})
sub.to_csv('bag_forest_13_dec.csv', index = False)
sub.head()

Unnamed: 0,Loan Status
0,0.092169
1,0.083151
2,0.088253
3,0.087059
4,0.093259
