In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb

In [3]:

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [4]:
# Load the train and test sets from CSV.
# NOTE(review): both paths are absolute Windows paths on the original author's machine;
# the cell cannot run elsewhere until they are replaced by a configurable data directory.
# NOTE(review): no index_col is passed, so parse_dates=True presumably has no effect here — confirm.
df_train = pd.read_csv(r'C:\Users\100_rabh\1. protium\Train.csv',sep=',',encoding='utf-8',parse_dates=True)
df_test = pd.read_csv(r'C:\Users\100_rabh\1. protium\Test.csv',sep=',',encoding='utf-8',parse_dates=True)

In [5]:
print("Shape of train:", df_train.shape)
print("Shape of test:", df_test.shape)

Shape of train: (97485, 31)
Shape of test: (24371, 31)


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97485 entries, 0 to 97484
Data columns (total 31 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Patron_Salary                               97485 non-null  float64
 1   Automobile_Possession                       97485 non-null  float64
 2   Two-Wheeler_Ownership                       97485 non-null  float64
 3   Ongoing_Borrowing                           97485 non-null  float64
 4   Residence_Proprietorship                    97485 non-null  float64
 5   Offspring_Number                            97485 non-null  float64
 6   Loan_Capital                                97485 non-null  float64
 7   Borrowing_Periodic_Payment                  97485 non-null  float64
 8   Customer_Revenue_Category                   97485 non-null  object 
 9   Patron_Academic_Qualification               97485 non-null  object 
 10  Customer_C

In [7]:
# Upper-case and trim every text (object-dtype) column. The set of columns is
# decided from df_train and the same columns are normalised in df_test.
for col in df_train.columns:
    if df_train[col].dtypes != 'object':
        continue
    for frame in (df_train, df_test):
        frame[col] = frame[col].str.upper().str.strip()

In [8]:
# Cast float columns of the training set that hold at most two distinct
# values (binary flags stored as floats) to integers.
for col in df_train.columns:
    series = df_train[col]
    if series.dtypes == 'float' and series.nunique() <= 2:
        df_train[col] = series.astype(int)

In [9]:
# Cast float columns of the test set that hold at most two distinct
# values (binary flags stored as floats) to integers.
for col in df_test.columns:
    series = df_test[col]
    if series.dtypes == 'float' and series.nunique() <= 2:
        df_test[col] = series.astype(int)

### Information Value (IV)

In [10]:
def iv_woe(data, target, bins, show_woe=False):
    """Compute Weight of Evidence (WoE) and Information Value (IV) per variable.

    Every column of ``data`` except ``target`` is analysed. Numeric columns
    with more than 10 distinct values are quantile-binned into ``bins`` bins
    (duplicate edges dropped); all other columns are grouped by raw value.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame containing the independent variables and ``target``.
    target : str
        Name of the event column; its per-group sum is used as the event count.
    bins : int
        Number of quantile bins requested for high-cardinality numeric columns.
    show_woe : bool, default False
        When truthy, print the WoE table of each variable.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``newDF`` with one row per variable (columns ``Variable``, ``IV``) and
        ``woeDF`` with the stacked per-group WoE tables. Both are empty frames
        when ``data`` has no column other than ``target``.
    """
    # Per-variable frames are collected and concatenated once at the end;
    # concatenating inside the loop re-copies all earlier rows each iteration.
    iv_frames = []
    woe_frames = []

    cols = data.columns

    # Run WOE and IV on all the independent variables
    for ivars in cols[~cols.isin([target])]:
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars]))>10):
            grouping = pd.qcut(data[ivars], bins, duplicates='drop')
        else:
            grouping = data[ivars]
        d0 = pd.DataFrame({'x': grouping, 'y': data[target]})

        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        # A floor of 0.5 keeps groups without events / non-events out of log(0).
        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
        d['WoE'] = np.log(d['% of Events']/d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc=0, column='Variable', value=ivars)

        iv_total = d['IV'].sum()
        print("Information value of " + ivars + " is " + str(round(iv_total,6)))
        iv_frames.append(pd.DataFrame({"Variable" : [ivars], "IV" : [iv_total]}, columns = ["Variable", "IV"]))
        woe_frames.append(d)

        # Show WOE Table
        if show_woe:
            print(d)

    newDF = pd.concat(iv_frames, axis=0) if iv_frames else pd.DataFrame()
    woeDF = pd.concat(woe_frames, axis=0) if woe_frames else pd.DataFrame()
    return newDF, woeDF

In [11]:
iv, woe = iv_woe(data = df_train, target = 'Default', bins = 4, show_woe = True)

Information value of Patron_Salary is 0.010092
        Variable               Cutoff      N  Events  % of Events  Non-Events  \
0  Patron_Salary  (8099.999, 11250.0]  30985    2573     0.326606       28412   
1  Patron_Salary   (11250.0, 14400.0]  19226    1657     0.210333       17569   
2  Patron_Salary   (14400.0, 20250.0]  25278    2162     0.274435       23116   
3  Patron_Salary   (20250.0, 27000.0]  21996    1486     0.188627       20510   

   % of Non-Events       WoE        IV  
0         0.317073  0.029620  0.000282  
1         0.196067  0.070232  0.001002  
2         0.257971  0.061868  0.001019  
3         0.228888 -0.193465  0.007789  
Information value of Automobile_Possession is 0.007117
                Variable  Cutoff      N  Events  % of Events  Non-Events  \
0  Automobile_Possession       0  65027    5539     0.703097       59488   
1  Automobile_Possession       1  32458    2339     0.296903       30119   

   % of Non-Events       WoE        IV  
0         0.66387

Information value of Customer_Urban_Area_Ranking is 0.042981
                      Variable  Cutoff      N  Events  % of Events  \
0  Customer_Urban_Area_Ranking     1.0  10768     568     0.072100   
1  Customer_Urban_Area_Ranking     2.0  73051    5757     0.730769   
2  Customer_Urban_Area_Ranking     3.0  13666    1553     0.197131   

   Non-Events  % of Non-Events       WoE        IV  
0       10200         0.113830 -0.456662  0.019057  
1       67294         0.750990 -0.027295  0.000552  
2       12113         0.135179  0.377269  0.023373  
Information value of Request_Submission_Day is 0.002219
                 Variable  Cutoff      N  Events  % of Events  Non-Events  \
0  Request_Submission_Day     1.0  20792    1553     0.197131       19239   
1  Request_Submission_Day     2.0  16744    1413     0.179360       15331   
2  Request_Submission_Day     3.0  18048    1502     0.190658       16546   
3  Request_Submission_Day     4.0  15721    1261     0.160066       14460   
4  Re

In [12]:
# iv.to_excel('IV.xlsx', index = False)

### Variance Inflation Factor (VIF)

In [13]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(df1):
    """Return a two-column table with the variance inflation factor of each column.

    ``df1`` is expected to be an all-numeric DataFrame. Each VIF is rendered
    with fixed-point formatting (six decimals) and parsed back to a float.
    The result has the columns ``variables`` and ``VIF``.
    """
    design = df1.values
    scores = []
    for position in range(df1.shape[1]):
        raw = variance_inflation_factor(design, position)
        scores.append(float(format(raw, 'f')))

    vif = pd.DataFrame()
    vif["variables"] = df1.columns
    vif["VIF"] = scores
    return vif

In [14]:
df2 = df_train._get_numeric_data()

In [15]:
VIF=calc_vif(df2)

In [16]:
VIF.sort_values('VIF',ascending=False)

Unnamed: 0,variables,VIF
13,Cellphone_Marker,142.88031
7,Borrowing_Periodic_Payment,2.724688
5,Offspring_Number,2.63715
6,Loan_Capital,2.560769
16,Patron_Kin_Count,2.516137
9,Elderliness_in_Days,1.894194
10,Work_Duration_in_Days,1.732862
0,Patron_Salary,1.404152
17,Customer_Urban_Area_Ranking,1.387601
8,Population_Fraction_by_Region,1.216904


In [17]:
# VIF.to_excel('VIF.xlsx', index = False)

### Information Gain

In [18]:
def calc_entropy(column):
    """
    Calculate the base-2 Shannon entropy of a series, list, or numpy array.

    ``column`` must contain non-negative integer labels, because the counts
    are obtained with ``np.bincount``. An empty input has entropy 0.0.

    Returns a Python float.
    """
    total = len(column)
    if total == 0:
        return 0.0

    # Compute the counts of each given value in the column
    counts = np.bincount(column)
    # Labels that never occur contribute nothing; dropping them avoids log2(0).
    probabilities = counts[counts > 0] / total

    # numpy is used for the logarithm; the `math` module is not imported in this notebook.
    return float(-np.sum(probabilities * np.log2(probabilities)))

In [19]:
def calc_information_gain(data, split_name, target_name):
    """
    Calculate information gain given a data set, column to split on and target.

    The gain is the entropy of ``data[target_name]`` minus the size-weighted
    entropy of the target within every distinct value of ``data[split_name]``.
    All distinct values take part in the split, so columns with one value or
    with more than two values are handled. Rows whose split value is missing
    belong to no group and contribute nothing to the subtracted term.

    The distinct values of the split column are printed as a side effect.
    """
    # Calculate the original entropy
    original_entropy = calc_entropy(data[target_name])

    # Find the unique values in the column
    values = data[split_name].unique()
    print(split_name, ':', values)

    total_rows = data.shape[0]

    # One pass over the groups instead of one boolean filter per value.
    # sort=False keeps groups in order of appearance and avoids ordering the keys.
    to_subtract = 0
    for _, subset_target in data.groupby(split_name, sort=False)[target_name]:
        prob = len(subset_target) / total_rows
        to_subtract += prob * calc_entropy(subset_target)

    # Return information gain
    return original_entropy - to_subtract

In [20]:
# Information gain of every column of df_train with respect to 'Default'.
# Columns for which the computation fails keep the "error" marker, but the
# cause is printed so that genuine bugs are not silently hidden.
inf_gain = []

for i in df_train.columns:
    try:
        inf = calc_information_gain(df_train, i, 'Default')
    except Exception as exc:
        print("Information gain failed for", i, ":", repr(exc))
        inf = "error"

    inf_gain.append({
        'VARIABLE': i,
        'Information Gain': inf
    })

df_inf_gain = pd.DataFrame(inf_gain)

In [21]:
# df_inf_gain.to_excel('IG.xlsx', index = False)
# df_inf_gain.head()

In [22]:
model_var = [
    'Patron_Salary',
    'Automobile_Possession',
    'Offspring_Number',
    'Loan_Capital',
    'Patron_Academic_Qualification',
    'Patron_Sex',
    'Borrowing_Agreement_Category',
    'Customer_Living_Arrangement',
    'Elderliness_in_Days',
    'Work_Duration_in_Days',
    'Identity_Age_in_Days',
    'Employment_Phone_Operation',
    'Patron_Constant_Correspondence_Marker',
    'Rating_Origin_2',
    'Telecommunication_Switch',
]

### WOE Binning

In [23]:
num_biv = pd.read_excel(r'C:\Users\100_rabh\1. protium\model\bivariate\numerical_bi_variate.xlsx')

In [24]:
num_biv.head()

Unnamed: 0,variable,bin,good,bad,count,percentage
0,Patron_Salary,"[-inf,22500.0)",70132,6491,76623,0.085
1,Patron_Salary,"[22500.0,26500.0)",8468,665,9133,0.073
2,Patron_Salary,"[26500.0,inf)",11007,722,11729,0.062
3,Automobile_Possession,"[-inf,1.0)",59488,5539,65027,0.085
4,Automobile_Possession,"[1.0,inf)",30119,2339,32458,0.072


In [25]:
def find(text):
    """Return ``text`` with every percent sign removed."""
    return text.replace('%', '')

In [26]:
num_biv['bin'] = num_biv.apply(lambda x: find(x['bin']), axis=1)

### NUMERICAL VARIABLE BINNING

In [27]:
def num_label(row, col, conditions):
    """Map ``row[col]`` to a 'GroupK' label using ascending cut points.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``col`` (a DataFrame row, a dict, ...).
    col : str
        Key of the numeric value to classify.
    conditions : sequence of numbers
        Cut points. 'Group0' is returned for values below the first cut
        point, 'GroupK' for values in ``[conditions[K-1], conditions[K])``,
        and 'Group<len(conditions)>' for everything else. Values that satisfy
        no comparison at all, such as NaN, therefore land in the last group.

    Returns
    -------
    str
        The group label, or 'NONE' when ``conditions`` is empty.

    Any positive number of cut points is supported; for 2 to 5 cut points the
    result is identical to the previous hand-written branches.
    """
    n_cuts = len(conditions)
    if n_cuts == 0:
        return 'NONE'

    value = row[col]
    if value < conditions[0]:
        return 'Group0'

    for idx in range(1, n_cuts):
        if conditions[idx - 1] <= value < conditions[idx]:
            return 'Group' + str(idx)

    return 'Group' + str(n_cuts)
        
def catg_bin_num(row, col, num_biv):
    """Add a ``<col>_BIN`` column of numeric group labels to a DataFrame.

    Despite its name, ``row`` is the whole DataFrame: it is modified in place
    and also returned. The cut points come from the rows of ``num_biv`` whose
    ``variable`` equals ``col``: the upper bound of every bin string is parsed
    and the last one is discarded. The cut points are printed as a side effect.
    """
    spec = num_biv[num_biv['variable'] == col].reset_index(drop = True)

    # Strip the enclosing brackets of the bin text and keep the part after the comma.
    upper_bounds = [float(spec['bin'][pos][1:-1].split(',')[1]) for pos in spec.index]
    conditions = upper_bounds[:-1]
    print(conditions)

    row[col + '_BIN'] = row.apply(lambda record: num_label(record, col, conditions), axis=1)
    return row

In [28]:
df_train = catg_bin_num(df_train, 'Patron_Salary', num_biv)
df_test = catg_bin_num(df_test, 'Patron_Salary', num_biv)

[22500.0, 26500.0]
[22500.0, 26500.0]


In [29]:
df_train['Patron_Salary_BIN'].value_counts()

Group0    76623
Group2    11729
Group1     9133
Name: Patron_Salary_BIN, dtype: int64

In [30]:
df_train = catg_bin_num(df_train, 'Loan_Capital', num_biv)
df_test = catg_bin_num(df_test, 'Loan_Capital', num_biv)

[66000.0, 90000.0, 108000.0]
[66000.0, 90000.0, 108000.0]


In [31]:
df_train['Loan_Capital_BIN'].value_counts()

Group0    62405
Group1    15043
Group3    11784
Group2     8253
Name: Loan_Capital_BIN, dtype: int64

In [32]:
df_train = catg_bin_num(df_train, 'Elderliness_in_Days', num_biv)
df_test = catg_bin_num(df_test, 'Elderliness_in_Days', num_biv)

[12600.0, 14800.0, 20200.0]
[12600.0, 14800.0, 20200.0]


In [33]:
df_train['Elderliness_in_Days_BIN'].value_counts()

Group2    36413
Group0    24988
Group3    20348
Group1    15736
Name: Elderliness_in_Days_BIN, dtype: int64

In [34]:
df_train = catg_bin_num(df_train, 'Work_Duration_in_Days', num_biv)
df_test = catg_bin_num(df_test, 'Work_Duration_in_Days', num_biv)

[1000.0, 2000.0, 4000.0]
[1000.0, 2000.0, 4000.0]


In [35]:
df_train['Work_Duration_in_Days_BIN'].value_counts()

Group3    30390
Group0    25241
Group2    22626
Group1    19228
Name: Work_Duration_in_Days_BIN, dtype: int64

In [36]:
df_train = catg_bin_num(df_train, 'Identity_Age_in_Days', num_biv)
df_test = catg_bin_num(df_test, 'Identity_Age_in_Days', num_biv)

[1600.0, 4100.0, 4600.0]
[1600.0, 4100.0, 4600.0]


In [37]:
df_train['Identity_Age_in_Days_BIN'].value_counts()

Group1    45323
Group0    21724
Group2    17690
Group3    12748
Name: Identity_Age_in_Days_BIN, dtype: int64

In [38]:
# NOTE(review): this cell repeats the Identity_Age_in_Days binning already performed two cells
# earlier with the same arguments; it recomputes the same *_BIN column and could be removed.
df_train = catg_bin_num(df_train, 'Identity_Age_in_Days', num_biv)
df_test = catg_bin_num(df_test, 'Identity_Age_in_Days', num_biv)

[1600.0, 4100.0, 4600.0]
[1600.0, 4100.0, 4600.0]


In [39]:
df_train['Identity_Age_in_Days_BIN'].value_counts()

Group1    45323
Group0    21724
Group2    17690
Group3    12748
Name: Identity_Age_in_Days_BIN, dtype: int64

In [40]:
df_train = catg_bin_num(df_train, 'Rating_Origin_2', num_biv)
df_test = catg_bin_num(df_test, 'Rating_Origin_2', num_biv)

[0.23, 0.4600000000000002, 0.6700000000000004, 0.7200000000000004]
[0.23, 0.4600000000000002, 0.6700000000000004, 0.7200000000000004]


In [41]:
df_train['Rating_Origin_2_BIN'].value_counts()

Group2    45489
Group1    20038
Group3    11919
Group0    10271
Group4     9768
Name: Rating_Origin_2_BIN, dtype: int64

In [42]:
df_train = catg_bin_num(df_train, 'Telecommunication_Switch', num_biv)
df_test = catg_bin_num(df_test, 'Telecommunication_Switch', num_biv)

[1050.0, 1950.0]
[1050.0, 1950.0]


In [43]:
df_train['Telecommunication_Switch_BIN'].value_counts()

Group0    59529
Group1    25119
Group2    12837
Name: Telecommunication_Switch_BIN, dtype: int64

### CATEGORICAL VARIABLE BINNING 

In [44]:
def label(row, col, conditions):
    """Return 'GroupK' for the first entry K of ``conditions`` that contains ``row[col]``.

    ``conditions`` is a sequence of containers of category values. ``None`` is
    returned when the value belongs to none of them.
    """
    for position, members in enumerate(conditions):
        if row[col] in members:
            return 'Group' + str(position)
    return None
        
    
def catg_bin(row, col, num_biv):
    """Add a ``<col>_BIN`` column of categorical group labels to a DataFrame.

    Despite its name, ``row`` is the whole DataFrame: it is modified in place
    and also returned. Each row of ``num_biv`` whose ``variable`` equals
    ``col`` defines one group; its ``bin`` text is split on commas into the
    category values of that group. The groups are printed as a side effect.
    """
    spec = num_biv[num_biv['variable'] == col].reset_index(drop = True)
    conditions = [spec['bin'][pos].split(',') for pos in spec.index]

    print(conditions)
    row[col + '_BIN'] = row.apply(lambda record: label(record, col, conditions), axis=1)
    return row

In [45]:
df_train = catg_bin(df_train, 'Patron_Academic_Qualification', num_biv)
df_test = catg_bin(df_test, 'Patron_Academic_Qualification', num_biv)

[['POST GRAD', 'GRADUATION'], ['GRADUATION DROPOUT', 'SECONDARY', 'JUNIOR SECONDARY']]
[['POST GRAD', 'GRADUATION'], ['GRADUATION DROPOUT', 'SECONDARY', 'JUNIOR SECONDARY']]


In [46]:
df_train['Patron_Academic_Qualification_BIN'].value_counts()

Group1    74343
Group0    23142
Name: Patron_Academic_Qualification_BIN, dtype: int64

In [47]:
# Encode the group labels as 0/1. The result is assigned back to the column:
# calling replace(..., inplace=True) on a column selection acts on a temporary
# object and does not update the DataFrame when pandas Copy-on-Write is active.
df_train['Patron_Academic_Qualification_BIN'] = df_train['Patron_Academic_Qualification_BIN'].replace({'Group0': 0, 'Group1': 1})
df_test['Patron_Academic_Qualification_BIN'] = df_test['Patron_Academic_Qualification_BIN'].replace({'Group0': 0, 'Group1': 1})

In [48]:
df_train['Patron_Academic_Qualification_BIN'].value_counts()

1    74343
0    23142
Name: Patron_Academic_Qualification_BIN, dtype: int64

In [49]:
df_test['Patron_Academic_Qualification'].value_counts()

SECONDARY             17524
GRADUATION             5727
GRADUATION DROPOUT      800
JUNIOR SECONDARY        304
POST GRAD                16
Name: Patron_Academic_Qualification, dtype: int64

In [50]:
df_test['Patron_Academic_Qualification_BIN'].value_counts()

1    18628
0     5743
Name: Patron_Academic_Qualification_BIN, dtype: int64

In [51]:
df_train = catg_bin(df_train, 'Patron_Sex', num_biv)
df_test = catg_bin(df_test, 'Patron_Sex', num_biv)

[['MALE'], ['FEMALE']]
[['MALE'], ['FEMALE']]


In [52]:
df_train['Patron_Sex_BIN'].value_counts()

Group0    64650
Group1    32835
Name: Patron_Sex_BIN, dtype: int64

In [53]:
# Encode the group labels as 0/1. The result is assigned back to the column:
# calling replace(..., inplace=True) on a column selection acts on a temporary
# object and does not update the DataFrame when pandas Copy-on-Write is active.
df_train['Patron_Sex_BIN'] = df_train['Patron_Sex_BIN'].replace({'Group0': 0, 'Group1': 1})
df_test['Patron_Sex_BIN'] = df_test['Patron_Sex_BIN'].replace({'Group0': 0, 'Group1': 1})

In [54]:
df_train['Patron_Sex_BIN'].value_counts()

0    64650
1    32835
Name: Patron_Sex_BIN, dtype: int64

In [55]:
df_train = catg_bin(df_train, 'Borrowing_Agreement_Category', num_biv)
df_test = catg_bin(df_test, 'Borrowing_Agreement_Category', num_biv)

[['RL'], ['CL']]
[['RL'], ['CL']]


In [56]:
df_train['Borrowing_Agreement_Category_BIN'].value_counts()

Group1    88659
Group0     8826
Name: Borrowing_Agreement_Category_BIN, dtype: int64

In [57]:
# Encode the group labels as 0/1. The result is assigned back to the column:
# calling replace(..., inplace=True) on a column selection acts on a temporary
# object and does not update the DataFrame when pandas Copy-on-Write is active.
df_train['Borrowing_Agreement_Category_BIN'] = df_train['Borrowing_Agreement_Category_BIN'].replace({'Group0': 0, 'Group1': 1})
df_test['Borrowing_Agreement_Category_BIN'] = df_test['Borrowing_Agreement_Category_BIN'].replace({'Group0': 0, 'Group1': 1})

In [58]:
df_train['Borrowing_Agreement_Category_BIN'].value_counts()

1    88659
0     8826
Name: Borrowing_Agreement_Category_BIN, dtype: int64

In [59]:
df_train = catg_bin(df_train, 'Customer_Living_Arrangement', num_biv)
df_test = catg_bin(df_test, 'Customer_Living_Arrangement', num_biv)

[['OFFICE', 'HOME', 'MUNICIPAL'], ['SHARED', 'FAMILY', 'RENTAL']]
[['OFFICE', 'HOME', 'MUNICIPAL'], ['SHARED', 'FAMILY', 'RENTAL']]


In [60]:
df_train['Customer_Living_Arrangement_BIN'].value_counts()

Group0    91053
Group1     6432
Name: Customer_Living_Arrangement_BIN, dtype: int64

In [61]:
# Encode the group labels as 0/1. The result is assigned back to the column:
# calling replace(..., inplace=True) on a column selection acts on a temporary
# object and does not update the DataFrame when pandas Copy-on-Write is active.
df_train['Customer_Living_Arrangement_BIN'] = df_train['Customer_Living_Arrangement_BIN'].replace({'Group0': 0, 'Group1': 1})
df_test['Customer_Living_Arrangement_BIN'] = df_test['Customer_Living_Arrangement_BIN'].replace({'Group0': 0, 'Group1': 1})


In [62]:
df_train['Customer_Living_Arrangement_BIN'].value_counts()

0    91053
1     6432
Name: Customer_Living_Arrangement_BIN, dtype: int64

In [63]:
df_train = catg_bin(df_train, 'Patron_Constant_Correspondence_Marker', num_biv)
df_test = catg_bin(df_test, 'Patron_Constant_Correspondence_Marker', num_biv)

[['YES'], ['NO']]
[['YES'], ['NO']]


In [64]:
df_train['Patron_Constant_Correspondence_Marker_BIN'].value_counts()

Group0    89937
Group1     7548
Name: Patron_Constant_Correspondence_Marker_BIN, dtype: int64

In [65]:
# Encode the group labels as 0/1. The result is assigned back to the column:
# calling replace(..., inplace=True) on a column selection acts on a temporary
# object and does not update the DataFrame when pandas Copy-on-Write is active.
df_train['Patron_Constant_Correspondence_Marker_BIN'] = df_train['Patron_Constant_Correspondence_Marker_BIN'].replace({'Group0': 0, 'Group1': 1})
df_test['Patron_Constant_Correspondence_Marker_BIN'] = df_test['Patron_Constant_Correspondence_Marker_BIN'].replace({'Group0': 0, 'Group1': 1})


In [66]:
df_train['Patron_Constant_Correspondence_Marker_BIN'].value_counts()

0    89937
1     7548
Name: Patron_Constant_Correspondence_Marker_BIN, dtype: int64

In [67]:
train = df_train.copy()
test = df_test.copy()

### Model Development

In [68]:
df_train = train.copy()
df_test = test.copy()

In [69]:
model_var = [
#     'Patron_Salary_BIN',
    'Patron_Salary',
    'Automobile_Possession',
#     'Offspring_Number',
    'Loan_Capital',
#     'Loan_Capital_BIN',
    'Patron_Academic_Qualification_BIN',
#     'Patron_Sex_BIN',
    'Borrowing_Agreement_Category_BIN',
    'Customer_Living_Arrangement_BIN',
    'Elderliness_in_Days',
#     'Elderliness_in_Days_BIN',
    'Work_Duration_in_Days',
#     'Work_Duration_in_Days_BIN',
    'Identity_Age_in_Days',
#     'Identity_Age_in_Days_BIN',
    'Employment_Phone_Operation',
    'Patron_Constant_Correspondence_Marker_BIN',
    'Rating_Origin_2',
#     'Rating_Origin_2_BIN',
    'Telecommunication_Switch',
#     'Telecommunication_Switch_BIN',
]

In [70]:
# Build the modelling frames: keep only the model variables and one-hot
# encode the remaining text columns. The list of text columns is derived from
# the training frame and reused unchanged for the test frame.
df_train1 = df_train[model_var]
cat_vars = [name for name in df_train1.columns if df_train1[name].dtype == 'object']

for var in cat_vars:
    cat_list = pd.get_dummies(df_train1[var], prefix=var)
    df_train1 = df_train1.join(cat_list)
df_train1 = df_train1.drop(cat_vars, axis=1)

df_test1 = df_test[model_var]
for var in cat_vars:
    cat_list = pd.get_dummies(df_test1[var], prefix=var)
    df_test1 = df_test1.join(cat_list)
df_test1 = df_test1.drop(cat_vars, axis=1)

print("Training set : ",df_train1.shape)
print("Test set : ",df_test1.shape)

Training set :  (97485, 13)
Test set :  (24371, 13)


In [71]:
def maximum(a, b):
    """Return the label of whichever ``[size, label]`` pair has the larger size.

    ``a`` wins ties on size. Only when the two pairs compare equal as a whole
    is "Equal shapes" printed and ``None`` returned; pairs with the same size
    but different labels therefore still return ``a``'s label.
    """
    if a != b:
        return a[1] if a[0] >= b[0] else b[1]
    print("Equal shapes")
        
tr1 = [len(df_train1.columns), "train"]
te1 = [len(df_test1.columns), "test"]

largest = maximum(tr1, te1)
print("Maximum shape of", largest, "dataframe!!!")

Maximum shape of train dataframe!!!


In [72]:
# Give the smaller frame an all-zero column for every column that exists only
# in the larger one, so that both frames end up with the same columns.
tr_col = set(df_train1.columns.to_list())
te_col = set(df_test1.columns.to_list())

if largest == 'train':
    x = tr_col.intersection(te_col)
    # Columns present in train but missing from test.
    for i in sorted(tr_col - x):
        print("test:", i)
        df_test1[i] = 0

elif largest == 'test':
    x = te_col.intersection(tr_col)
    # Columns present in test but missing from train. Iterating the train
    # columns here would instead overwrite existing train columns with zeros.
    for i in sorted(te_col - x):
        print("train:", i)
        df_train1[i] = 0

print("----------------------------------------")
print("Training set: ", df_train1.shape)
print("Testing set: ", df_test1.shape)

print("----------------------------------------")
print("Training set: ", df_train1.shape)
print("Testing set: ", df_test1.shape)

----------------------------------------
Training set:  (97485, 13)
Testing set:  (24371, 13)


In [73]:
X_train = df_train1.copy()
y_train = df_train['Default']
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)
print("\n")

X_test = df_test1.copy()
y_test = df_test['Default']
print("Shape of X_test: ", X_test.shape)
print("Shape of y_test: ", y_test.shape)
print("\n")

Shape of X_train:  (97485, 13)
Shape of y_train:  (97485,)


Shape of X_test:  (24371, 13)
Shape of y_test:  (24371,)




In [74]:
f_imp = [
    'Rating_Origin_2', 'Work_Duration_in_Days',
       'Patron_Academic_Qualification_BIN', 'Elderliness_in_Days',
       'Telecommunication_Switch',
       'Customer_Living_Arrangement_BIN', 'Automobile_Possession',
       'Patron_Constant_Correspondence_Marker_BIN',
       'Identity_Age_in_Days', 'Loan_Capital',
#        'Borrowing_Agreement_Category_BIN', 'Patron_Salary',
#        'Employment_Phone_Operation'
]

In [75]:
X_train = X_train[f_imp]
X_test = X_test[f_imp]

In [76]:
X_train.shape

(97485, 10)

In [77]:
X_train.head(2)

Unnamed: 0,Rating_Origin_2,Work_Duration_in_Days,Patron_Academic_Qualification_BIN,Elderliness_in_Days,Telecommunication_Switch,Customer_Living_Arrangement_BIN,Automobile_Possession,Patron_Constant_Correspondence_Marker_BIN,Identity_Age_in_Days,Loan_Capital
0,0.221772,365243.0,1,20099.0,0.0,0,0,0,3514.0,101961.0
1,0.720076,1344.0,0,10534.0,1745.0,0,1,0,3131.0,112500.0


In [78]:
X_test.shape

(24371, 10)

In [79]:
X_test.head(2)

Unnamed: 0,Rating_Origin_2,Work_Duration_in_Days,Patron_Academic_Qualification_BIN,Elderliness_in_Days,Telecommunication_Switch,Customer_Living_Arrangement_BIN,Automobile_Possession,Patron_Constant_Correspondence_Marker_BIN,Identity_Age_in_Days,Loan_Capital
0,0.552795,5102.0,1,16790.0,277.0,1,0,0,754.0,59527.35
1,0.719935,2263.0,0,14816.0,1799.0,0,0,0,4640.0,67500.0


In [80]:
X_train.columns

Index(['Rating_Origin_2', 'Work_Duration_in_Days',
       'Patron_Academic_Qualification_BIN', 'Elderliness_in_Days',
       'Telecommunication_Switch', 'Customer_Living_Arrangement_BIN',
       'Automobile_Possession', 'Patron_Constant_Correspondence_Marker_BIN',
       'Identity_Age_in_Days', 'Loan_Capital'],
      dtype='object')

### XGBoost

In [81]:
# Fit the XGBoost classifier on the selected features and score both data sets.
# NOTE(review): xgboost is already imported as `xgb` in the import cell near the top; this import is redundant.
import xgboost as xgb
xgb_cl = xgb.XGBClassifier(objective='binary:logistic', booster='gbtree',learning_rate=0.18,max_depth=6,
                               base_score=0.4, n_estimators=200, num_parallel_tree=5, eval_metric='auc',
                               tree_method='hist', grow_policy='lossguide', scale_pos_weight = 11,
                               gamma = 89, colsample_bytree = 0.5
                              ).fit(X_train, y_train)

# Hard class predictions; not referenced again in the visible part of the notebook.
y_pred_xg1 = xgb_cl.predict(X_train)
y_pred_xg = xgb_cl.predict(X_test)

# Second column of predict_proba, used below as the score for the AUC and KS computations.
X_train_prob = list(xgb_cl.predict_proba(X_train)[:,1])
X_test_prob = list(xgb_cl.predict_proba(X_test)[:,1])

# Store the scores on the full copies of the data so that ks() can read them next to 'Default'.
train['Prob_score_XGB'] = X_train_prob
test['Prob_score_XGB'] = X_test_prob

In [82]:
print("ROC AUC Score for train:", roc_auc_score(y_train, X_train_prob) * 100)
print("ROC AUC Score for test:", roc_auc_score(y_test, X_test_prob) * 100)

ROC AUC Score for train: 69.81916264390328
ROC AUC Score for test: 70.07874506213803


In [83]:
feat_imp = pd.DataFrame.from_dict(zip(list(X_train.columns), xgb_cl.feature_importances_*100)).sort_values(by = 1, ascending = False)
# feat_imp.to_excel('feature_imp_xgb.xlsx', index = False)
feat_imp

Unnamed: 0,0,1
0,Rating_Origin_2,30.140537
1,Work_Duration_in_Days,15.596639
2,Patron_Academic_Qualification_BIN,12.474005
3,Elderliness_in_Days,9.749474
4,Telecommunication_Switch,7.350151
8,Identity_Age_in_Days,5.586968
7,Patron_Constant_Correspondence_Marker_BIN,5.210736
9,Loan_Capital,5.106344
5,Customer_Living_Arrangement_BIN,4.696291
6,Automobile_Possession,4.08886


In [84]:
feat_imp[0].values

array(['Rating_Origin_2', 'Work_Duration_in_Days',
       'Patron_Academic_Qualification_BIN', 'Elderliness_in_Days',
       'Telecommunication_Switch', 'Identity_Age_in_Days',
       'Patron_Constant_Correspondence_Marker_BIN', 'Loan_Capital',
       'Customer_Living_Arrangement_BIN', 'Automobile_Possession'],
      dtype=object)

In [85]:
def ks(data=None,target=None, prob=None):
    """Build a decile table with the Kolmogorov-Smirnov (KS) statistic of a score.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the ``target`` and ``prob`` columns. Two helper columns,
        ``target_xgb`` (``1 - target``) and ``bucket`` (score decile), are
        written onto this frame as a side effect.
    target : str
        Name of the event column; event counts are its per-bucket sums.
    prob : str
        Name of the score column; it is cut into 10 quantile buckets.

    Returns
    -------
    pandas.DataFrame
        One row per bucket, ordered from the highest to the lowest scores and
        indexed from 1 under the name ``Decile``. The rate columns are
        formatted percentage strings, ``KS`` / ``ks_stats`` are numeric
        percentages, and ``max_ks`` marks the row(s) with the largest
        ``ks_stats`` with '*****'.

    Notes
    -----
    The pandas option ``display.max_columns`` is set to 20 as a side effect.
    """
    data['target_xgb'] = 1 - data[target]
    data['bucket'] = pd.qcut(data[prob], 10)

    # Aggregate only the columns that are needed. Summing or taking the
    # min/max of every column of `data` is wasteful and can fail on
    # non-numeric columns.
    grouped = data.groupby('bucket', observed=False)
    prob_range = grouped[prob].agg(['min', 'max'])
    counts = grouped[[target, 'target_xgb']].sum()

    kstable = pd.DataFrame()
    kstable['min_prob'] = prob_range['min'].to_numpy()
    kstable['max_prob'] = prob_range['max'].to_numpy()
    kstable['total'] = (counts[target] + counts['target_xgb']).to_numpy()
    kstable['events'] = counts[target].to_numpy()
    kstable['nonevents'] = counts['target_xgb'].to_numpy()
    kstable = kstable.sort_values(by="min_prob", ascending=False).reset_index(drop = True)

    total_events = data[target].sum()
    total_nonevents = data['target_xgb'].sum()
    kstable['event_rate'] = (kstable.events / total_events).apply('{0:.2%}'.format)
    kstable['nonevent_rate'] = (kstable.nonevents / total_nonevents).apply('{0:.2%}'.format)
    kstable['cum_eventrate'] = (kstable.events / total_events).cumsum()
    kstable['cum_noneventrate'] = (kstable.nonevents / total_nonevents).cumsum()
    kstable['KS'] = np.round(kstable['cum_eventrate']-kstable['cum_noneventrate'], 3) * 100

    # Formatting: from here on the cumulative rates are strings, not numbers.
    kstable['cum_eventrate'] = kstable['cum_eventrate'].apply('{0:.2%}'.format)
    kstable['cum_noneventrate'] = kstable['cum_noneventrate'].apply('{0:.2%}'.format)

    kstable['ks_stats'] = np.round(((kstable['events'] / kstable['events'].sum()).cumsum() -(kstable['nonevents'] / kstable['nonevents'].sum()).cumsum()), 4) * 100
    best_ks = kstable['ks_stats'].max()
    kstable['max_ks'] = kstable['ks_stats'].apply(lambda x: '*****' if x == best_ks else '')

    # Number the deciles from 1; the length is taken from the table itself.
    kstable.index = pd.RangeIndex(1, len(kstable) + 1, name='Decile')
    pd.set_option('display.max_columns', 20)
    return(kstable)


In [86]:
xg_train_ks = ks(train,'Default','Prob_score_XGB')
xg_test_ks = ks(test,'Default','Prob_score_XGB')

In [87]:
# xg_train_ks.to_csv('xgboost_ks_train.csv', index = False)
xg_train_ks

Unnamed: 0_level_0,min_prob,max_prob,total,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS,ks_stats,max_ks
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.642283,0.792226,9749,2055,7694,26.09%,8.59%,26.09%,8.59%,17.5,17.5,
2,0.577387,0.642279,9748,1332,8416,16.91%,9.39%,42.99%,17.98%,25.0,25.01,
3,0.52731,0.577369,9749,1012,8737,12.85%,9.75%,55.84%,27.73%,28.1,28.11,
4,0.481428,0.527292,9748,830,8918,10.54%,9.95%,66.37%,37.68%,28.7,28.69,*****
5,0.439028,0.481428,9748,693,9055,8.80%,10.11%,75.17%,47.79%,27.4,27.38,
6,0.402182,0.439024,9749,593,9156,7.53%,10.22%,82.70%,58.00%,24.7,24.69,
7,0.364306,0.40218,9748,486,9262,6.17%,10.34%,88.87%,68.34%,20.5,20.53,
8,0.323492,0.364303,9749,399,9350,5.06%,10.43%,93.93%,78.78%,15.2,15.16,
9,0.276227,0.32349,9748,294,9454,3.73%,10.55%,97.66%,89.33%,8.3,8.34,
10,0.167283,0.276211,9749,184,9565,2.34%,10.67%,100.00%,100.00%,0.0,0.0,


In [88]:
# xg_test_ks.to_csv('xgboost_ks_test.csv', index = False)
xg_test_ks

Unnamed: 0_level_0,min_prob,max_prob,total,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS,ks_stats,max_ks
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.643541,0.78959,2437,508,1929,25.83%,8.61%,25.83%,8.61%,17.2,17.22,
2,0.577931,0.643538,2437,337,2100,17.13%,9.37%,42.96%,17.98%,25.0,24.98,
3,0.527525,0.577921,2437,258,2179,13.12%,9.73%,56.08%,27.71%,28.4,28.37,
4,0.482248,0.527515,2437,223,2214,11.34%,9.88%,67.41%,37.59%,29.8,29.82,*****
5,0.440787,0.482212,2437,170,2267,8.64%,10.12%,76.05%,47.71%,28.3,28.34,
6,0.404019,0.440754,2436,149,2287,7.57%,10.21%,83.63%,57.92%,25.7,25.71,
7,0.363923,0.404018,2438,97,2341,4.93%,10.45%,88.56%,68.37%,20.2,20.19,
8,0.3251,0.363898,2437,94,2343,4.78%,10.46%,93.34%,78.83%,14.5,14.51,
9,0.280217,0.325097,2437,78,2359,3.97%,10.53%,97.31%,89.35%,8.0,7.95,
10,0.175559,0.280198,2438,53,2385,2.69%,10.65%,100.00%,100.00%,0.0,0.0,


In [89]:


# Init classifier
def best_result(eta, md, npt, gam, train, test):
    """Fit one XGBoost configuration and report its AUC and third-row KS capture.

    The classifier is always fitted on the notebook-level ``X_train``/``y_train``
    and scored on ``X_train``/``X_test`` against ``y_train``/``y_test``; those
    names are read from the enclosing notebook namespace, not from the arguments.

    Parameters
    ----------
    eta : float
        Forwarded to XGBoost as ``learning_rate``.
    md : int
        Forwarded to XGBoost as ``max_depth``.
    npt : int
        Forwarded to XGBoost as ``num_parallel_tree``.
    gam : float
        Forwarded to XGBoost as ``gamma``.
    train, test : pandas.DataFrame
        Frames passed to ``ks`` together with the column names ``'Default'`` and
        ``'Prob_score_XGB'``. The predicted probabilities are written into them
        positionally, so they are expected to line up row-for-row with
        ``X_train`` / ``X_test``.

    Returns
    -------
    tuple
        ``(eta, md, npt, gam, train_auc, test_auc, train_3rd_cap, test_3rd_cap)``
        where the AUC values are ``roc_auc_score * 100`` and the ``*_3rd_cap``
        values are the ``cum_eventrate`` entries of the third row of the train
        and test KS tables.

    Side effects
    ------------
    Overwrites (or creates) the ``'Prob_score_XGB'`` column of the ``train`` and
    ``test`` frames passed in.
    """
    xgb_cl = xgb.XGBClassifier(
        objective='binary:logistic', booster='gbtree', learning_rate=eta, max_depth=md,
        base_score=0.4, n_estimators=200, num_parallel_tree=npt, eval_metric='auc',
        tree_method='hist', grow_policy='lossguide', scale_pos_weight=9,
        gamma=gam, colsample_bytree=0.5,
    ).fit(X_train, y_train)

    # Only the positive-class probabilities are used below; the hard-label
    # predictions that used to be computed here were never read.
    X_train_prob = list(xgb_cl.predict_proba(X_train)[:, 1])
    X_test_prob = list(xgb_cl.predict_proba(X_test)[:, 1])

    train_res = roc_auc_score(y_train, X_train_prob) * 100
    test_res = roc_auc_score(y_test, X_test_prob) * 100

    # ks() looks the scores up by column name, so they have to live on the frames.
    train['Prob_score_XGB'] = X_train_prob
    test['Prob_score_XGB'] = X_test_prob

    xg_train_ks = ks(train, 'Default', 'Prob_score_XGB')
    xg_test_ks = ks(test, 'Default', 'Prob_score_XGB')

    # After reset_index() label 2 is the third row of the KS table.
    train_3rd_cap = xg_train_ks.reset_index()['cum_eventrate'][2]
    test_3rd_cap = xg_test_ks.reset_index()['cum_eventrate'][2]
    return eta, md, npt, gam, train_res, test_res, train_3rd_cap, test_3rd_cap

In [None]:
# Grid-search results: one list per output column, all appended to in lockstep so
# the dict can be turned into a DataFrame with one row per parameter combination.
my_dict = {"LearningRate": [], "Max_Depth": [], "Num_Parallel_Tree": [], "Gamma": [],
           "Train_AUC": [], "Test_AUC": [], "Train_3rd_DC": [], "Test_3rd_DC": []}
for lr_value in [0.19, 0.195, 0.2, 0.21]:
    for depth_value in [4, 5, 6, 7]:
        for n_parallel in [2, 4, 5, 6, 7]:
            for gamma_value in [70, 75, 80, 85, 90, 95]:
                try:
                    eta, md, npt, gam, train_res, test_res, train_3rd_cap, test_3rd_cap = best_result(
                        lr_value, depth_value, n_parallel, gamma_value, train, test)
                except Exception as exc:
                    # Keep the sweep going on a failed combination, but say which one
                    # failed and why. Catching Exception (not a bare except) lets
                    # KeyboardInterrupt stop this long-running loop.
                    print("best_result failed for learning_rate=%s, max_depth=%s, "
                          "num_parallel_tree=%s, gamma=%s:"
                          % (lr_value, depth_value, n_parallel, gamma_value), repr(exc))
                else:
                    my_dict["LearningRate"].append(eta)
                    my_dict["Max_Depth"].append(md)
                    my_dict["Num_Parallel_Tree"].append(npt)
                    my_dict["Gamma"].append(gam)
                    my_dict["Train_AUC"].append(train_res)
                    my_dict["Test_AUC"].append(test_res)
                    my_dict["Train_3rd_DC"].append(train_3rd_cap)
                    my_dict["Test_3rd_DC"].append(test_3rd_cap)

                print("------------------------------------------------------------")


------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------

------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------

------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------

In [None]:
# Show the grid-search results as a table (one row per parameter combination)
# rather than as a raw dict of long lists.
pd.DataFrame(my_dict)

In [None]:
# pd.DataFrame(my_dict).to_excel('output_v1.2.xlsx', index = False)

In [None]:

# X_validation_prob = list(xgb_cl.predict_proba(X_validation)[:,1])


# validation['Prob_score_XGB'] = X_validation_prob


# print("ROC AUC Score for validation:", roc_auc_score(y_validation, X_validation_prob) * 100)


In [None]:
# Pair each training column name with the fitted model's feature importance
# (multiplied by 100) and order the pairs from most to least important.
# The resulting frame has integer column labels: 0 = feature name, 1 = importance.
feature_names = list(X_train.columns)
importance_x100 = xgb_cl.feature_importances_ * 100
feat_imp = pd.DataFrame(list(zip(feature_names, importance_x100))).sort_values(by=1, ascending=False)
# feat_imp.to_excel('feature_imp_xgb.xlsx', index = False)
feat_imp

In [None]:
# Column 0 of feat_imp holds the feature names, so this shows them as an array
# in feat_imp's row order (sorted by descending importance when it was built).
feat_imp[0].values

In [None]:
# Print ROC AUC (scaled by 100) for the train and test splits.
# NOTE(review): y_pred_xg1 / y_pred_xg are presumably hard class labels from
# .predict(); if so this is the AUC of thresholded labels, not of predicted
# probabilities -- confirm that is what is intended here.
for split_label, y_true, y_hat in (("train", y_train, y_pred_xg1), ("test", y_test, y_pred_xg)):
    print("ROC AUC Score for " + split_label + ":", roc_auc_score(y_true, y_hat) * 100)
# print("ROC AUC Score for valid:", roc_auc_score(y_validation, y_pred_xg_v) * 100)

In [None]:
# xg_train_ks.to_csv('xgboost_ks_train.csv', index = False)
# Cumulative event rate in the third row of the train KS table
# (label 2 once the index has been reset to 0..n-1).
xg_train_ks.reset_index().loc[2, 'cum_eventrate']

In [None]:
# Display the KS decile table for the test split; the CSV export below is left disabled.
# xg_test_ks.to_csv('xgboost_ks_test.csv', index = False)
xg_test_ks

In [None]:
# xg_validation_ks.to_csv('xgboost_ks_validation.csv', index = False)
# xg_validation_ks