In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc

from catboost import CatBoostClassifier

from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV

In [0]:
def write_submission(predictions, file_name,
                     sample_path='data/sample_submission_24jSKY6.csv'):
  """Write a submission CSV based on the sample-submission template.

  The template is read from ``sample_path``, its ``loan_default`` column is
  overwritten with ``predictions`` and the result is saved to ``file_name``
  without the index column.

  Parameters
  ----------
  predictions : scalar or array-like
      One value per template row (a scalar is broadcast to every row).
      Array-likes are assigned by position, so a pandas Series with a
      non-default index is not silently re-aligned into NaNs.
  file_name : str
      Destination path of the submission CSV.
  sample_path : str, optional
      Path of the sample-submission template to fill in.

  Raises
  ------
  ValueError
      If the number of predictions differs from the number of template rows.
  """
  sub = pd.read_csv(sample_path)
  if np.ndim(predictions) == 0:
    # Scalar: let pandas broadcast it to every row.
    values = predictions
  else:
    # Strip any index so the assignment is purely positional.
    values = np.asarray(predictions).ravel()
    if len(values) != len(sub):
      raise ValueError(
          f'Got {len(values)} predictions for {len(sub)} submission rows')
  # Item assignment always targets the column; attribute assignment would
  # create a plain Python attribute if the column were missing.
  sub['loan_default'] = values
  sub.to_csv(file_name, index=False)

In [0]:
# Load the train and test tables from the local data folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/LTFS_train_processed.csv', 'data/LTFS_test_processed.csv')
)

In [0]:
# Detach the target column from the feature frame (train_df is modified in place,
# so this cell can only be run once per load).
y = train_df.pop('loan_default')

## Adding features

In [0]:
score_col = 'PERFORM_CNS.SCORE.DESCRIPTION'

# Map every score description seen in the training data to an integer code:
# its position in the value_counts() ordering of the training column.
cns_perform_dict = {
    description: code
    for code, description in enumerate(train_df[score_col].value_counts().index)
}

# Groups of the integer codes assigned above. The groups are hard-coded, so they
# are only valid for the ordering produced by this particular training file.
no_history_not_scored = [0, 19, 15, 16, 14, 18, 12]
low_risk = [2, 4, 1, 3, 9, 6, 11]
high_risk = [13, 7, 17, 5, 8, 10]

risk_groups = {
    'Score_des_no_his_not_score': no_history_not_scored,
    'Score_des_low_risk': low_risk,
    'Score_des_high_risk': high_risk,
}

for frame in (train_df, test_df):
    # Replace descriptions by their codes, then add one 0/1 flag per risk group.
    frame[score_col] = frame[score_col].replace(cns_perform_dict)
    for flag_col, group_codes in risk_groups.items():
        frame[flag_col] = frame[score_col].isin(group_codes).astype(int)

In [0]:
# Ratio of asset cost to disbursed amount, added to both frames.
for frame in (train_df, test_df):
    frame['AssetCost2Disbursed'] = frame['asset_cost'].div(frame['disbursed_amount'])

In [0]:
# Primary-account sanctioned amount minus primary-account disbursed amount.
for frame in (train_df, test_df):
    frame['dif_pri_san_dis'] = frame['PRI.SANCTIONED.AMOUNT'].sub(frame['PRI.DISBURSED.AMOUNT'])

In [0]:
# log(1 + ltv) as an additional feature on both frames.
for frame in (train_df, test_df):
    frame['ltv_log'] = np.log1p(frame['ltv'])

In [0]:
# Total current balance (primary + secondary) minus total disbursed amount
# (primary + secondary).
for frame in (train_df, test_df):
    total_current_balance = frame['PRI.CURRENT.BALANCE'] + frame['SEC.CURRENT.BALANCE']
    total_disbursed_amount = frame['PRI.DISBURSED.AMOUNT'] + frame['SEC.DISBURSED.AMOUNT']
    frame['diff_cur_bal_disb_amt'] = total_current_balance - total_disbursed_amount

In [0]:
# log(1 + AssetCost2Disbursed); requires the AssetCost2Disbursed column to exist.
for frame in (train_df, test_df):
    frame['asset2disbursed_log'] = np.log1p(frame['AssetCost2Disbursed'])

In [0]:
# cols = ['PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT', 'PRI.DISBURSED.AMOUNT',
#         'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT']

In [0]:
# for col in cols:
#   train_df[col] = np.log1p(train_df[col])
#   test_df[col] = np.log1p(test_df[col])

In [0]:
# train_df['NO.OF_INQUIRIES'] = train_df['NO.OF_INQUIRIES'].apply(lambda x: 20 if x>20 else x)
# test_df['NO.OF_INQUIRIES'] = test_df['NO.OF_INQUIRIES'].apply(lambda x: 20 if x>20 else x)

In [0]:
# train_df['dif_pri_san_dis'] = train_df['PRI.SANCTIONED.AMOUNT'] - train_df['PRI.DISBURSED.AMOUNT']
# test_df['dif_pri_san_dis'] = test_df['PRI.SANCTIONED.AMOUNT'] - test_df['PRI.DISBURSED.AMOUNT']

In [0]:
# train_df['Combined_ID'] = (train_df['manufacturer_id'].map(str)+train_df['branch_id'].map(str) + train_df['supplier_id'].map(str)).astype(int)
# test_df['Combined_ID'] = (test_df['manufacturer_id'].map(str)+test_df['branch_id'].map(str)+ test_df['supplier_id'].map(str)).astype(int)

# train_df['Combined_ID'] = np.log1p(train_df.Combined_ID)
# test_df['Combined_ID'] = np.log1p(test_df.Combined_ID)

In [0]:
# train_df['diff_newacc_deliquent'] = train_df['NEW.ACCTS.IN.LAST.SIX.MONTHS'] - train_df['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS']
# test_df['diff_newacc_deliquent'] = test_df['NEW.ACCTS.IN.LAST.SIX.MONTHS'] - test_df['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS']

In [0]:
# train_df['Disbursal'] = train_df.apply(lambda row: datetime(row['yr_Disbursal'], row['mon_Disbursal'], row['day_Disbursal']), axis=1)
# train_df.Disbursal = pd.to_datetime(train_df.Disbursal, format='%Y-%m-%d')

# test_df['Disbursal'] = test_df.apply(lambda row: datetime(row['yr_Disbursal'], row['mon_Disbursal'], row['day_Disbursal']), axis=1)
# test_df.Disbursal = pd.to_datetime(test_df.Disbursal, format='%Y-%m-%d')

# train_df['dob'] = train_df.apply(lambda row: datetime(row['yr_dob'], row['mon_dob'], row['day_dob']), axis=1)
# train_df.dob = pd.to_datetime(train_df.dob, format='%Y-%m-%d')

# test_df['dob'] = test_df.apply(lambda row: datetime(row['yr_dob'], row['mon_dob'], row['day_dob']), axis=1)
# test_df.dob = pd.to_datetime(test_df.dob, format='%Y-%m-%d')

# train_df['age_atDisbursal'] = np.floor((train_df.Disbursal - train_df.dob).dt.days / 365.25).astype(int)
# test_df['age_atDisbursal'] = np.floor((test_df.Disbursal - test_df.dob).dt.days / 365.25).astype(int)

# train_df.drop(['dob', 'Disbursal'], axis=1, inplace=True)
# test_df.drop(['dob', 'Disbursal'], axis=1, inplace=True)

In [0]:
# train_df['age<25'] = np.where(train_df.age_atDisbursal<25, 1, 0)
# test_df['age<25'] = np.where(test_df.age_atDisbursal<25, 1, 0)

In [0]:
# train_df['age>50'] = np.where(train_df.age_atDisbursal>50, 1, 0)
# test_df['age>50'] = np.where(test_df.age_atDisbursal>50, 1, 0)

In [0]:
# # train_df['total_account'] = train_df['PRI.NO.OF.ACCTS'] + train_df['SEC.NO.OF.ACCTS']
# # test_df['total_account'] = test_df['PRI.NO.OF.ACCTS'] + test_df['SEC.NO.OF.ACCTS']

# # train_df['total_active_acct'] = train_df['PRI.ACTIVE.ACCTS'] + train_df['SEC.ACTIVE.ACCTS']
# # test_df['total_active_acct'] = test_df['PRI.ACTIVE.ACCTS'] + test_df['SEC.ACTIVE.ACCTS']

# train_df['total_disbursed'] = train_df['PRI.DISBURSED.AMOUNT'] + train_df['SEC.DISBURSED.AMOUNT']
# test_df['total_disbursed'] = test_df['PRI.DISBURSED.AMOUNT'] + test_df['SEC.DISBURSED.AMOUNT']

# # train_df['total_install_amt'] = train_df['PRIMARY.INSTAL.AMT'] + train_df['SEC.INSTAL.AMT']
# # test_df['total_install_amt'] = test_df['PRIMARY.INSTAL.AMT'] + test_df['SEC.INSTAL.AMT']

# # train_df['total_overdue_accts'] = train_df['PRI.OVERDUE.ACCTS'] + train_df['SEC.OVERDUE.ACCTS']
# # test_df['total_overdue_accts'] = test_df['PRI.OVERDUE.ACCTS'] + test_df['SEC.OVERDUE.ACCTS']

# train_df['total_current_bal'] = train_df['PRI.CURRENT.BALANCE'] + train_df['SEC.CURRENT.BALANCE']
# test_df['total_current_bal'] = test_df['PRI.CURRENT.BALANCE'] + test_df['SEC.CURRENT.BALANCE']

In [0]:
# train_df['diff_total_acc_overdue_acc'] = train_df['PRI.NO.OF.ACCTS'] + train_df['SEC.NO.OF.ACCTS'] - (train_df['PRI.OVERDUE.ACCTS'] + train_df['SEC.OVERDUE.ACCTS'])
# test_df['diff_total_acc_overdue_acc'] = test_df['PRI.NO.OF.ACCTS'] + test_df['SEC.NO.OF.ACCTS'] - (test_df['PRI.OVERDUE.ACCTS'] + test_df['SEC.OVERDUE.ACCTS'])

In [0]:
# emp_code_train = train_df['Employee_code_ID'].value_counts().to_dict()
# emp_code_test = test_df['Employee_code_ID'].value_counts().to_dict()

# train_df['emp_code_count'] = train_df['Employee_code_ID'].replace(emp_code_train)
# test_df['emp_code_count'] = test_df['Employee_code_ID'].replace(emp_code_test)

# train_df['emp_code_count'] = np.log(train_df['emp_code_count'])
# test_df['emp_code_count'] = np.log(test_df['emp_code_count'])

In [0]:
# Sanity check: both frames should end up with the same number of columns.
tuple(frame.shape for frame in (train_df, test_df))

((233154, 50), (112392, 50))

In [0]:
# Column positions and dtypes, used to pick the positional categorical indices.
train_df.dtypes.to_frame().reset_index()

Unnamed: 0,index,0
0,disbursed_amount,int64
1,asset_cost,int64
2,ltv,float64
3,branch_id,int64
4,supplier_id,int64
5,manufacturer_id,int64
6,Current_pincode_ID,int64
7,Employment.Type,object
8,State_ID,int64
9,Employee_code_ID,int64


In [0]:
# Positional indices of the columns passed to CatBoost as categorical features.
# They refer to the column order of train_df at this point in the notebook.
cat_cols = [*range(3, 15), 16, 42, 43, 44]

In [0]:
# Stratified 80/20 hold-out split with a fixed seed.
holdout_parts = train_test_split(
    train_df,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = holdout_parts

In [0]:
# Baseline CatBoost classifier, monitored with AUC on the hold-out split.
model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    use_best_model=True,
    task_type='GPU',
    cat_features=cat_cols,
)

# The categorical features are already configured on the estimator, so they are
# not repeated in fit().
model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose_eval=50)

Learning rate set to 0.135107
0:	learn: 0.6216486	test: 0.6276269	best: 0.6276269 (0)	total: 65ms	remaining: 1m 4s
50:	learn: 0.6778487	test: 0.6718933	best: 0.6718933 (50)	total: 2.15s	remaining: 40s
100:	learn: 0.6867274	test: 0.6756375	best: 0.6756375 (100)	total: 4.06s	remaining: 36.2s
150:	learn: 0.6929192	test: 0.6765684	best: 0.6765684 (150)	total: 6s	remaining: 33.8s
200:	learn: 0.6977981	test: 0.6770542	best: 0.6770542 (200)	total: 7.92s	remaining: 31.5s
250:	learn: 0.7021741	test: 0.6774237	best: 0.6774927 (242)	total: 9.88s	remaining: 29.5s
300:	learn: 0.7069107	test: 0.6773170	best: 0.6774927 (242)	total: 12s	remaining: 27.8s
350:	learn: 0.7110757	test: 0.6775612	best: 0.6775881 (346)	total: 14.1s	remaining: 26.1s
400:	learn: 0.7152334	test: 0.6776438	best: 0.6778170 (381)	total: 16.2s	remaining: 24.2s
450:	learn: 0.7190410	test: 0.6778316	best: 0.6778609 (430)	total: 18.4s	remaining: 22.4s
500:	learn: 0.7230568	test: 0.6780870	best: 0.6781009 (493)	total: 20.5s	remaining: 

<catboost.core.CatBoostClassifier at 0x7f5e83dcd128>

In [0]:
# Importance of every feature of the baseline model, largest first.
feature_imp = pd.DataFrame({0: model.feature_importances_}, index=train_df.columns)
feature_imp.loc[:, 0].sort_values(ascending=False)

Employee_code_ID                       6.542621
branch_id                              5.672917
supplier_id                            5.670748
disbursed_amount                       4.814768
PERFORM_CNS.SCORE.DESCRIPTION          4.561780
Current_pincode_ID                     4.356893
ltv_log                                4.045778
day_Disbursal                          4.040173
ltv                                    3.884354
yr_dob                                 3.507759
mon_Disbursal                          3.250201
diff_cur_bal_disb_amt                  3.212142
dif_pri_san_dis                        2.850155
State_ID                               2.766355
manufacturer_id                        2.723345
PRIMARY.INSTAL.AMT                     2.542143
NO.OF_INQUIRIES                        2.357049
InMonths_CREDIT.HISTORY.LENGTH         2.342987
PRI.NO.OF.ACCTS                        2.295464
asset_cost                             2.278757
Score_des_high_risk                    2

In [0]:
# Hyper-parameters forwarded to CatBoostClassifier in the cross-validation cell.
best_params = dict(
    bagging_temperature=0.41010395885331385,
    border_count=186,
    depth=9,
    iterations=500,
    l2_leaf_reg=21,
    learning_rate=0.0673344419215237,
    random_strength=3.230824361824754e-06,
    scale_pos_weight=0.7421091918485163,
)

## Generating out-of-fold (OOF) predictions and the submission file

In [0]:
%%time
num_folds = 5
predictions = pd.DataFrame()
print('Training Model:')
val_list = []
clf = CatBoostClassifier(loss_function='Logloss', eval_metric='AUC', **best_params, task_type='GPU', cat_features=cat_cols)
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
oof = np.zeros(len(train_df))
train_score = []
valid_score = []

for fold_, (trn_idx, val_idx) in enumerate(skf.split(train_df.values, y.values)):
    gc.collect()
    print("Fold idx:{}".format(fold_ + 1))
    
    X_train, y_train = train_df.iloc[trn_idx], y[trn_idx]
    X_valid, y_valid = train_df.iloc[val_idx], y[val_idx]

    clf.fit(X=X_train, y=y_train, eval_set=[(X_valid, y_valid)], verbose_eval=50)  
    
    oof[val_idx] = clf.predict_proba(train_df.iloc[val_idx])[:,1]
    
    train_score.append(roc_auc_score(y_train, clf.predict_proba(train_df.iloc[trn_idx])[:,1]))
    valid_score.append(roc_auc_score(y_valid, oof[val_idx]))
    
    predictions[f'Folds_{fold_+1}'] = clf.predict_proba(test_df)[:,1]  

del clf  
val_list.append(oof)

Training Model:
Fold idx:1
0:	learn: 0.6352836	test: 0.6401085	best: 0.6401085 (0)	total: 160ms	remaining: 1m 20s
50:	learn: 0.6889104	test: 0.6774121	best: 0.6774121 (50)	total: 4.23s	remaining: 37.2s
100:	learn: 0.7027241	test: 0.6809873	best: 0.6809873 (100)	total: 8.32s	remaining: 32.9s
150:	learn: 0.7122634	test: 0.6824895	best: 0.6825072 (149)	total: 12.3s	remaining: 28.5s
200:	learn: 0.7201914	test: 0.6833728	best: 0.6833728 (200)	total: 16.3s	remaining: 24.2s
250:	learn: 0.7275209	test: 0.6834615	best: 0.6834621 (248)	total: 20.1s	remaining: 20s
300:	learn: 0.7330846	test: 0.6839073	best: 0.6839154 (297)	total: 23.9s	remaining: 15.8s
350:	learn: 0.7392866	test: 0.6841117	best: 0.6841117 (350)	total: 27.8s	remaining: 11.8s
400:	learn: 0.7453569	test: 0.6841511	best: 0.6842352 (369)	total: 31.6s	remaining: 7.8s
450:	learn: 0.7511865	test: 0.6840819	best: 0.6842352 (369)	total: 35.4s	remaining: 3.84s
499:	learn: 0.7566403	test: 0.6839980	best: 0.6842352 (369)	total: 39.3s	remainin

In [0]:
# AUC of the combined out-of-fold predictions against the full target.
roc = roc_auc_score(y_true=y, y_score=oof)
roc

0.6822050695597501

In [0]:
# Mean per-fold AUC on the training and validation parts.
train_roc, valid_roc = (
    sum(fold_scores) / len(fold_scores)
    for fold_scores in (train_score, valid_score)
)
train_roc, valid_roc

(0.7564691167997767, 0.6822551737483227)

In [0]:
# Label the saved predictions with the author name and a run version.
name = 'Salil'
VERSION = 5

# Single-column frame holding the out-of-fold predictions for the training rows.
oof_train = pd.DataFrame({f'oof_cb_{name}_{VERSION}': oof})
# Per-fold test predictions are saved as they are.
oof_test = predictions

In [0]:
# Persist the OOF train predictions and per-fold test predictions; the OOF AUC is
# embedded in the file names.
train_oof_path = f'train_cb_roc_{roc}_v{VERSION}_{name}.csv'
test_oof_path = f'test_cb_roc_{roc}_v{VERSION}_{name}.csv'
oof_train.to_csv(train_oof_path, index=False)
oof_test.to_csv(test_oof_path, index=False)

In [0]:
# Average the per-fold test probabilities row by row.
pred = predictions.mean(axis=1)

In [0]:
# Save the fold-averaged predictions; the file name carries the OOF AUC in percent.
submission_file = f'mean_cb_{roc*100}.csv'
write_submission(pred, submission_file)