Reading data

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Load the raw credit-application records into the working frame.
csv_path = '../input/credit-bank-datacsv/credit_bank_data.csv.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

Dropping fields that are not significant


In [None]:
# Columns judged not significant for the model are removed up front.
unused_columns = ['user_id','address','email','education_level']
df = df.drop(columns=unused_columns)

Calculating age from dob

In [None]:
from datetime import datetime,date

days_in_year = 365.2425

# Derive age (whole years, truncated) from the ISO-formatted date of birth.
# This is done column-wise: element-wise `df['age'][i] = ...` is chained
# assignment, which pandas does not guarantee to write back to `df`, and it
# left the column with object dtype because it was seeded with ''.
birth_dates = pd.to_datetime(df['date_of_birth'], format='%Y-%m-%d')
days_lived = (pd.Timestamp(date.today()) - birth_dates).dt.days
df['age'] = (days_lived / days_in_year).astype(int)

# The raw date is no longer needed once age has been derived.
df.drop('date_of_birth',axis=1,inplace=True)

Calculating net capital

In [None]:
# Net capital = gains minus losses, computed column-wise.
df['capital_net'] = df['capital_gain'].sub(df['capital_loss'])

One hot encoding

In [None]:
# Categorical columns that will be expanded into indicator columns.
encode = [
    'gender',
    'workclass',
    'marital_status',
    'occupation',
    'relationship',
    'inquiry_purpose_code',
    'institute_type',
    'account_type',
    'portfolio_type',
    'asset_code',
    'asset_class_cd',
]

In [None]:
# Expand every categorical column into indicator columns (including an
# explicit NaN indicator). The dummies are prefixed with the source column
# name and appended after the remaining columns, and the source columns are
# removed in the same call.
df = pd.get_dummies(df, columns=encode, dummy_na=True)

Splitting into train/test datasets

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the target is used as a feature.
col = df.columns.tolist()
col.remove("approved")
x, y = df[col], df["approved"]

# Hold out 20% of the rows for evaluation; the seed keeps the split fixed.
trainx, testx, trainy, testy = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Give all four splits a fresh 0..n-1 index so later positional lookups line up.
for split_part in (trainx, testx, trainy, testy):
  split_part.reset_index(drop=True, inplace=True)

Scaling using minmaxscaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
 
# Fit the scaler on the training split only, then apply it to both splits.
# A single fit handles all listed columns at once (MinMaxScaler scales each
# column independently), so no per-column loop is needed. Refitting on
# already-scaled data would leave `scaler` as a near-identity transform that
# can no longer map values back to their original units.
col=['capital_gain','capital_loss','hours_per_week','age']
scaler = MinMaxScaler()
trainx[col]=scaler.fit_transform(trainx[col])
testx[col]=scaler.transform(testx[col])

# capital_net gets its own scaler so it can be inverse-transformed on its own later.
scaler2 = MinMaxScaler()
trainx[['capital_net']]=scaler2.fit_transform(trainx[['capital_net']])
testx[['capital_net']]=scaler2.transform(testx[['capital_net']])

Checking for dataset imbalance

In [None]:
# Count how many training rows fall into each target class.
trainy.value_counts()

We see that the dataset is imbalanced, so we use SMOTEENN to rebalance it.

In [None]:
from imblearn.combine import SMOTEENN 

# Rebalance the TRAINING split only.
sme = SMOTEENN(random_state=0)
trainx_over, trainy_over = sme.fit_resample(trainx, trainy)

# The held-out split must keep its real rows and real class distribution:
# resampling it would score the models (and later allocate credit limits)
# on synthetic/edited samples. The *_over names are kept so that the
# following cells keep working unchanged.
testx_over, testy_over = testx.copy(), testy.copy()

In [None]:
import collections

# Class counts for the train labels first, then the test labels.
for label_set in (trainy_over, testy_over):
  print(collections.Counter(label_set))

In [None]:
# Wrap the resampled outputs back into DataFrames, reusing the feature
# column names of the corresponding split.
trainx, trainy = pd.DataFrame(trainx_over, columns=trainx.columns), pd.DataFrame(trainy_over)
testx, testy = pd.DataFrame(testx_over, columns=testx.columns), pd.DataFrame(testy_over)

Building model

In [None]:
from sklearn.metrics import accuracy_score

def model(model):
    """Fit an estimator on the training split and score it on the test split.

    Reads the notebook-level ``trainx``/``trainy``/``testx``/``testy``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        Note that the parameter shadows this function's own name inside
        the body.

    Returns
    -------
    tuple
        ``(accuracy, predictions)`` where ``predictions`` is whatever
        ``model.predict(testx)`` returned.
    """
    # Labels are flattened to 1-D: at this point they may be single-column
    # DataFrames, which estimators accept only with a conversion warning.
    model.fit(trainx, np.ravel(trainy))
    modelpred = model.predict(testx)
    # Conventional argument order is (y_true, y_pred).
    return accuracy_score(np.ravel(testy), modelpred), modelpred

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier

# Candidate classifiers. Every estimator that accepts a seed gets one so the
# comparison below is reproducible across kernel restarts.
knc = KNeighborsClassifier()
lgbm = lgb.LGBMClassifier(random_state=0)
xgb = XGBClassifier(random_state=0)
mlp = MLPClassifier(hidden_layer_sizes=(10,4,), random_state=0)
logreg = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(random_state=0)

# The order matters: later cells pick LightGBM's predictions as modelpreds[1].
models=[knc,lgbm,xgb,mlp,logreg,rf]
modelpreds=[]
for clf in models:
  accuracy,modelpred=model(clf)
  modelpreds.append(modelpred)
  # Label each score so the numbers can be matched to their model.
  print(type(clf).__name__, accuracy)

We see that LightGBM gives the best accuracy score

Plotting Results for LGBM

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
# modelpreds[1] holds the LightGBM predictions (second entry of `models`).
lgbm_cm = confusion_matrix(testy, modelpreds[1])
disp = ConfusionMatrixDisplay(confusion_matrix=lgbm_cm, display_labels=['False','True'])
disp.plot()

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 for the predictions stored at modelpreds[1].
lgbm_report = classification_report(testy, modelpreds[1])
print(lgbm_report)

In [None]:
from sklearn.metrics import roc_curve

# A ROC curve needs continuous scores: with hard 0/1 predictions it collapses
# to a single operating point. Use the predicted probability of the positive
# class from the fitted LightGBM model (models[1], matching modelpreds[1]).
lgbm_scores = models[1].predict_proba(testx)[:, 1]
fpr, tpr, thresholds = roc_curve(np.ravel(testy), lgbm_scores)

plt.plot(fpr,tpr)
plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level reference line
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - LightGBM')


Credit limit allocation


First get back the original capital_net values for the test set

In [None]:
# Undo scaler2's min-max scaling so capital_net is back in its original units.
# NOTE: this overwrites the column in place, so running the cell a second
# time would apply the inverse transform twice.
testx['capital_net']=scaler2.inverse_transform(testx[['capital_net']])

In [None]:
# Summary statistics for capital_net, reporting the 90th percentile alongside the defaults.
testx['capital_net'].describe(percentiles=[0.90])

We see that the maximum value of capital_net in the test set is 99999. Since we know that the credit limit must lie between 1000 and 8000, a rough formula for assigning the credit limit is:

Credit limit = 1000 + (capital_net / 99999) * 7000 if capital_net >= 0 and the LGBM model predicted TRUE

Credit limit=1000 if the capital_net is <0 and the LGBM model predicted TRUE

Credit limit=0 if the model predicted FALSE

In [None]:
# Credit limit rule (vectorised):
#   approved and capital_net >= 0 -> 1000 + capital_net * 7000 / 99999
#   approved and capital_net <  0 -> 1000
#   otherwise (not approved)      -> 0
# This is computed on whole arrays: element-wise `testx['credit_limit'][i] = ...`
# is chained assignment (not guaranteed to write back to `testx`) and pushed
# floats into an integer column. Predictions are matched to rows by position.
approved = np.asarray(modelpreds[1]) == 1
capital_net_values = testx['capital_net'].to_numpy()
proportional_limit = 1000 + capital_net_values * 7000 / 99999

testx['credit_limit'] = np.where(
    approved & (capital_net_values >= 0),
    proportional_limit,
    np.where(approved & (capital_net_values < 0), 1000.0, 0.0),
)

In [None]:
# Summary statistics of the allocated credit limits.
testx['credit_limit'].describe()

In [None]:
# Histogram of the allocated credit limits.
testx['credit_limit'].hist()

We see that the credit limit values have a max of 8000