## Loading required Packages 
- for Machine Learning Algorithms and its Evaluation.

In [1]:
import pickle
import os
import sys
import math
import statistics
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, check_cv, cross_val_predict
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report, cohen_kappa_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler, MinMaxScaler, PowerTransformer

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomTreesEmbedding, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier, XGBRFClassifier


## Loading Data for Modeling

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train

Unnamed: 0,ID,Frequency,InstlmentMode,LoanStatus,PaymentMode,BranchID,Area,Tenure,AssetCost,AmountFinance,...,BUREAU_LOAN_TYPE,AVERAGE_LOAN_TYPE,ACCT_ACTIVE,ACCT_CLOSED,ACCT_DELINQUENT,ACCT_SETTLED AND RESTRUCTURED,ACCT_SUIT FILLED (WILFUL_DEFAULTER),ACCT_WRITTEN_OFF,CURRENT_LOAN_BAL,ACTIVE_LOAN_TENURE
0,1,Monthly,Arrear,Closed,PDC_E,1,,48,450000,275000.0,...,4,2.250000,2,2,2,1,1,1,643176.0,120.0
1,2,Monthly,Advance,Closed,PDC,333,BHOPAL,47,485000,350000.0,...,7,1.857143,2,2,1,1,1,1,10349457.0,565.0
2,3,Quatrly,Arrear,Active,Direct Debit,1,,68,690000,519728.0,...,8,3.875000,2,2,2,1,1,1,2335165.0,454.0
3,7,Monthly,Advance,Closed,Billed,125,GUNA,48,480000,400000.0,...,2,2.000000,2,2,1,1,1,1,624000.0,33.0
4,8,Monthly,Arrear,Closed,Billed,152,BILASPUR,44,619265,440000.0,...,3,2.333333,2,2,2,1,2,1,974119.0,236.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128650,143390,Half Yearly,Arrear,Closed,Direct Debit,424,PANIPAT,24,470000,265601.0,...,2,1.000000,1,1,1,1,1,1,1867630.0,121.0
128651,143391,Half Yearly,Arrear,Closed,Direct Debit,424,PANIPAT,24,460000,275630.0,...,1,1.000000,1,1,1,1,1,1,73890.0,0.0
128652,143393,Monthly,Arrear,Active,Direct Debit,424,PANIPAT,23,545000,300733.0,...,2,2.000000,2,2,1,1,1,1,244133.0,84.0
128653,143394,Half Yearly,Arrear,Active,Direct Debit,424,PANIPAT,35,350000,250962.0,...,1,1.000000,1,1,1,1,1,1,132487.0,0.0


In [4]:
test

Unnamed: 0,ID,Frequency,InstlmentMode,LoanStatus,PaymentMode,BranchID,Area,Tenure,AssetCost,AmountFinance,...,BUREAU_LOAN_TYPE,AVERAGE_LOAN_TYPE,ACCT_ACTIVE,ACCT_CLOSED,ACCT_DELINQUENT,ACCT_SETTLED AND RESTRUCTURED,ACCT_SUIT FILLED (WILFUL_DEFAULTER),ACCT_WRITTEN_OFF,CURRENT_LOAN_BAL,ACTIVE_LOAN_TENURE
0,4,Monthly,Advance,Closed,PDC_E,2,GUNA,46,480000,365000.0,...,4,1.0,2,2,1,1,1,1,2191274.0,240.0
1,5,Monthly,Advance,Closed,PDC,2,GUNA,45,480000,285000.0,...,5,1.6,2,2,1,1,1,1,907231.0,180.0
2,6,Quatrly,Arrear,Closed,PDC,2,GUNA,48,580000,400000.0,...,4,3.0,2,2,1,1,1,1,102800.0,169.0
3,25,Half Yearly,Arrear,Closed,Billed,154,,36,725000,500000.0,...,2,1.0,2,2,1,1,1,1,127593.0,0.0
4,119,Quatrly,Arrear,Closed,PDC,194,CUTTACK,48,617000,400000.0,...,2,1.0,2,2,1,1,1,1,116200.0,46.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14740,143396,Monthly,Arrear,Closed,PDC,143,NEW DELHI BARAKHAMBHA ROAD,35,530016,419616.0,...,6,6.5,2,2,1,1,1,1,12259418.0,397.0
14741,143397,Monthly,Arrear,Closed,PDC,32,BANGALORE LALBAGH,36,595000,446500.0,...,4,1.5,2,2,1,1,1,1,38441054.0,0.0
14742,143398,Monthly,Arrear,Closed,PDC,32,BANGALORE LALBAGH,36,595000,446500.0,...,4,1.5,2,2,1,1,1,1,38441054.0,0.0
14743,143399,Monthly,Arrear,Closed,PDC_E,246,BIKANER,12,400000,280000.0,...,4,4.0,2,2,1,1,1,1,613508.0,0.0


#### Dropping Unwanted features & Changing dtype of some features as

In [None]:
# # Dropping Area & City columns instead these columns we can use Branch ID & Zipcode.
# train.drop(["Area","AmountFinance", "DisbursalDate", "MaturityDAte", "AuthDate", "SupplierID", "City", "ZiPCODE"], axis = 1, inplace = True)
# test.drop(["Area", "AmountFinance", "DisbursalDate", "MaturityDAte", "AuthDate", "SupplierID", "City", "ZiPCODE"], axis = 1, inplace = True)


# Baseline Model

#### Applying Label Encoder for categorical features

In [60]:
# Object type Columns as -
cat_col = ['Frequency', 'InstlmentMode', 'LoanStatus', 'PaymentMode', 'Area', 'DisbursalDate', 'MaturityDAte', 'AuthDate'
          ,'SEX', 'City', 'State', 'Top-up Month']

le = LabelEncoder().fit_transform # loading Label Encoder

train[cat_col]= train[cat_col].apply(le) # applying Lable Encoder on Object type columns only.
# y[target]= y[target].apply(le)
train

Unnamed: 0,ID,Frequency,InstlmentMode,LoanStatus,PaymentMode,BranchID,Area,Tenure,AssetCost,AmountFinance,...,BUREAU_LOAN_TYPE,AVERAGE_LOAN_TYPE,ACCT_ACTIVE,ACCT_CLOSED,ACCT_DELINQUENT,ACCT_SETTLED AND RESTRUCTURED,ACCT_SUIT FILLED (WILFUL_DEFAULTER),ACCT_WRITTEN_OFF,CURRENT_LOAN_BAL,ACTIVE_LOAN_TENURE
0,1,2,1,1,9,1,92,48,450000,275000.0,...,4,2.250000,2,2,2,1,1,1,643176.0,120.0
1,2,2,0,1,7,333,15,47,485000,350000.0,...,7,1.857143,2,2,1,1,1,1,10349457.0,565.0
2,3,3,1,0,3,1,92,68,690000,519728.0,...,8,3.875000,2,2,2,1,1,1,2335165.0,454.0
3,7,2,0,1,1,125,29,48,480000,400000.0,...,2,2.000000,2,2,1,1,1,1,624000.0,33.0
4,8,2,1,1,1,152,18,44,619265,440000.0,...,3,2.333333,2,2,2,1,2,1,974119.0,236.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128650,143390,1,1,1,3,424,66,24,470000,265601.0,...,2,1.000000,1,1,1,1,1,1,1867630.0,121.0
128651,143391,1,1,1,3,424,66,24,460000,275630.0,...,1,1.000000,1,1,1,1,1,1,73890.0,0.0
128652,143393,2,1,0,3,424,66,23,545000,300733.0,...,2,2.000000,2,2,1,1,1,1,244133.0,84.0
128653,143394,1,1,0,3,424,66,35,350000,250962.0,...,1,1.000000,1,1,1,1,1,1,132487.0,0.0


#### Spliting df1 (train_demo) into X & y respectively

In [61]:
X = train.drop('Top-up Month', axis = 1)
y = train['Top-up Month']

print(X.shape, y.shape)

(128655, 36) (128655,)


In [63]:
X

Unnamed: 0,ID,Frequency,InstlmentMode,LoanStatus,PaymentMode,BranchID,Area,Tenure,AssetCost,AmountFinance,...,BUREAU_LOAN_TYPE,AVERAGE_LOAN_TYPE,ACCT_ACTIVE,ACCT_CLOSED,ACCT_DELINQUENT,ACCT_SETTLED AND RESTRUCTURED,ACCT_SUIT FILLED (WILFUL_DEFAULTER),ACCT_WRITTEN_OFF,CURRENT_LOAN_BAL,ACTIVE_LOAN_TENURE
0,1,2,1,1,9,1,92,48,450000,275000.0,...,4,2.250000,2,2,2,1,1,1,643176.0,120.0
1,2,2,0,1,7,333,15,47,485000,350000.0,...,7,1.857143,2,2,1,1,1,1,10349457.0,565.0
2,3,3,1,0,3,1,92,68,690000,519728.0,...,8,3.875000,2,2,2,1,1,1,2335165.0,454.0
3,7,2,0,1,1,125,29,48,480000,400000.0,...,2,2.000000,2,2,1,1,1,1,624000.0,33.0
4,8,2,1,1,1,152,18,44,619265,440000.0,...,3,2.333333,2,2,2,1,2,1,974119.0,236.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128650,143390,1,1,1,3,424,66,24,470000,265601.0,...,2,1.000000,1,1,1,1,1,1,1867630.0,121.0
128651,143391,1,1,1,3,424,66,24,460000,275630.0,...,1,1.000000,1,1,1,1,1,1,73890.0,0.0
128652,143393,2,1,0,3,424,66,23,545000,300733.0,...,2,2.000000,2,2,1,1,1,1,244133.0,84.0
128653,143394,1,1,0,3,424,66,35,350000,250962.0,...,1,1.000000,1,1,1,1,1,1,132487.0,0.0


In [64]:
y

0         0
1         6
2         1
3         0
4         5
         ..
128650    3
128651    6
128652    6
128653    6
128654    6
Name: Top-up Month, Length: 128655, dtype: int32

#### Splitting train_demo dataset

In [71]:
# # Spliting encoded X set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state = 1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)
print(f"X_train - {X_train.shape}, X_test - { X_test.shape}, X_val - {X_val.shape}")
print(f"y_train - {y_train.shape}, y_test - {y_test.shape}, y_val - {y_val.shape}")

X_train - (77193, 36), X_test - (25731, 36), X_val - (25731, 36)
y_train - (77193,), y_test - (25731,), y_val - (25731,)


**LOGISTIC REGRESSION**

In [72]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

log_pred = log_reg.predict(X_test)

log_f1_score = f1_score(y_test, log_pred, average='macro')
log_acc = accuracy_score(y_test, log_pred)
lg_confu = confusion_matrix(y_test, log_pred)
log_class_report = classification_report(y_test, log_pred)
log_kappa = cohen_kappa_score(y_test, log_pred)

In [73]:
print(f"Logistic Regression F1 Score - {log_f1_score}.\n")
print(f"Logistic Regression Accuracy - {log_acc}.\n")
print(f"Logistic Regression Confussion Metrics - \n{log_class_report}")
print(f"Logistic Regression Cohen-Kappa Score - \n{log_kappa}")

Logistic Regression F1 Score - 0.13125046876572935.

Logistic Regression Accuracy - 0.8245307216975633.

Logistic Regression Confussion Metrics - 
              precision    recall  f1-score   support

           0       0.25      0.00      0.01      1677
           1       0.00      0.00      0.00       215
           2       0.00      0.00      0.00       492
           3       0.00      0.00      0.00       692
           4       0.00      0.00      0.00       634
           5       0.15      0.00      0.01       750
           6       0.83      1.00      0.90     21271

    accuracy                           0.82     25731
   macro avg       0.18      0.14      0.13     25731
weighted avg       0.70      0.82      0.75     25731

Logistic Regression Cohen-Kappa Score - 
0.0016562600293184015


https://stackoverflow.com/questions/43676905/how-to-calculate-cohens-kappa-coefficient-that-measures-inter-rater-agreement

In [20]:
log_cv = LogisticRegressionCV(Cs= list(np.power(10.0, np.arange(-10, 10))), cv = 5, penalty='elasticnet'
                              ,max_iter=150, solver='saga', n_jobs=-1, refit=True, multi_class='multinomial'
                              ,l1_ratios= [0, 1])

In [21]:
log_cv.fit(X_train, y_train)

LogisticRegressionCV(Cs=[1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001,
                         0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0,
                         100000.0, 1000000.0, 10000000.0, 100000000.0,
                         1000000000.0],
                     cv=5, l1_ratios=[0, 1], max_iter=150,
                     multi_class='multinomial', n_jobs=-1, penalty='elasticnet',
                     solver='saga')

In [26]:
lcv_pred = log_cv.predict(X_test)

In [37]:
f1_lcv = f1_score(y_test, lcv_pred, average='macro')
f1_lcv

0.12930148869045086

In [29]:
acc = accuracy_score(y_test, lcv_pred)
acc

0.8266682212117679

**DecisionTree Classifier**

In [85]:
dt_clf = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

dt_f1 = f1_score(y_test, dt_clf, average='macro')
dt_f1

0.2726779671131477

In [87]:
accuracy_score(y_test, dt_clf)

0.7430336947650694

**RandomForest Classification**

In [74]:
rf_clf = RandomForestClassifier().fit(X_train, y_train)

rf_clf_pred = rf_clf.predict(X_test)

rf_f1 = f1_score(y_test, rf_clf_pred, average='macro')

rf_f1

0.2720198949840519

In [75]:
rf_clf.score(X_test, y_test)

0.838443900353659

**ExtraTreeClassification**

In [81]:
et_clf = ExtraTreesClassifier().fit(X_train, y_train)

et_pred = et_clf.predict(X_test)

et_f1 = f1_score(y_test, et_pred, average='macro')

et_f1


0.22872785955019642

In [83]:
et_clf.score(X_test, y_test)

0.8341300376977187

**AdaBoostClassifier**

In [90]:
ada_clf = AdaBoostClassifier(base_estimator= rf_clf)

ada_clf.fit(X_train, y_train)

ada_pred = ada_clf.predict(X_test)

ada_f1 = f1_score(y_test, et_pred, average='macro')

ada_f1

0.22872785955019642

In [92]:
ada_clf.score(X_test, y_test)

0.8342077649527807

**GradientBoosting Classifier**

In [94]:
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)

gb_pred = gb_clf.predict(X_test)

gb_f1 = f1_score(y_test, gb_pred, average='macro')

gb_f1

0.2681330536574595

In [95]:
gb_clf.score(X_test, y_test)

0.8363841280945163

**XGBoost + XGRFBoost**

In [96]:
xgb_clf = XGBClassifier().fit(X_train, y_train)
xgbrf_clf = XGBRFClassifier().fit(X_train, y_train)




In [97]:
xg_pred = xgb_clf.predict(X_test)
xgrf_pred = xgbrf_clf.predict(X_test)

xg_f1 = f1_score(y_test, xg_pred, average='macro')
print(f"XGBoostClassifier F1 Score - {xg_f1}")
xgrf_f1 = f1_score(y_test, xg_pred, average='macro')
print(f"XGRFBoostClassifier F1 Score - {xgrf_f1}")

XGBoostClassifier F1 Score - 0.326722365218438
XGRFBoostClassifier F1 Score - 0.326722365218438


In [98]:
xgb_clf.score(X_test, y_test)

0.8406591271229257

In [99]:
xgbrf_clf.score(X_test, y_test)

0.8351016283859936

**GaussianNB**

https://w10schools.com/posts/233588_Feature-transformations-with-ensembles-of-trees#p-sphx-glr-auto-examples-ensemble-plot-feature-transformation-py

In [104]:
from sklearn.naive_bayes import GaussianNB

gnb_clf =  GaussianNB().fit(X_train, y_train)

gnb_pred = gnb_clf.predict(X_test)

gnb_f1 = f1_score(y_test, gnb_pred, average='macro')
print(f"XGRFBoostClassifier F1 Score - {gnb_f1}")

XGRFBoostClassifier F1 Score - 0.08346979956717492


In [105]:
gnb_clf.score(X_test, y_test)

0.10069565893280479

In [106]:
gnb_clf.score(X_train, y_train)

0.10324770380734004