In [1]:
#Importing all required libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import init_notebook_mode, download_plotlyjs, iplot
import cufflinks as cf
init_notebook_mode(connected=True)
cf.go_offline()
import warnings
warnings.filterwarnings('ignore')
import os
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # To deal with Categorical Data in Target Vector.
from sklearn.model_selection import train_test_split  # To Split the dataset into training data and testing data.
from sklearn.model_selection import cross_val_score   # To check the accuracy of the model.
from sklearn.preprocessing import Imputer   # To deal with the missing values
from sklearn.preprocessing import StandardScaler



In [2]:
#importing dataset
df_train = pd.read_csv("credit_train.csv")
# Let us check the top 5 entries in training dataset.
df_train.head()

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6.0,1.0,228190.0,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328.0,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666.0,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,,9.0,0.0,256329.0,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220.0,Short Term,,,5 years,Rent,Debt Consolidation,20639.7,6.1,,15.0,0.0,253460.0,427174.0,0.0,0.0


In [3]:
#looking shape of data
df_train.shape

(100514, 19)

## Data Preprocessing

In [4]:

df_train['Years in current job'] = df_train['Years in current job'].map({'8 years':8, '10+ years':15,
                                        '3 years':3, '5 years':5, '< 1 year':0.5, 
                            '2 years':2, '4 years':4, '9 years':9, '7 years':7, '1 year':1, '6 years':6})


In [5]:
df_train.columns

Index(['Loan ID', 'Customer ID', 'Loan Status', 'Current Loan Amount', 'Term',
       'Credit Score', 'Annual Income', 'Years in current job',
       'Home Ownership', 'Purpose', 'Monthly Debt', 'Years of Credit History',
       'Months since last delinquent', 'Number of Open Accounts',
       'Number of Credit Problems', 'Current Credit Balance',
       'Maximum Open Credit', 'Bankruptcies', 'Tax Liens'],
      dtype='object')

In [6]:
#check null values
df_train.isnull().sum()

Loan ID                           514
Customer ID                       514
Loan Status                       514
Current Loan Amount               514
Term                              514
Credit Score                    19668
Annual Income                   19668
Years in current job             4736
Home Ownership                    514
Purpose                           514
Monthly Debt                      514
Years of Credit History           514
Months since last delinquent    53655
Number of Open Accounts           514
Number of Credit Problems         514
Current Credit Balance            514
Maximum Open Credit               516
Bankruptcies                      718
Tax Liens                         524
dtype: int64

In [7]:
#replacing numeric null values by its mean
df_train['Credit Score'].fillna((df_train['Credit Score'].mean()), inplace=True)

In [8]:
df_train['Annual Income'].fillna((df_train['Annual Income'].mean()), inplace=True)

In [9]:
df_train['Years in current job'].fillna((df_train['Years in current job'].mean()), inplace=True)

In [10]:
df_train['Months since last delinquent'].fillna((df_train['Months since last delinquent'].mean()), inplace=True)

In [11]:
df_train.isnull().sum()

Loan ID                         514
Customer ID                     514
Loan Status                     514
Current Loan Amount             514
Term                            514
Credit Score                      0
Annual Income                     0
Years in current job              0
Home Ownership                  514
Purpose                         514
Monthly Debt                    514
Years of Credit History         514
Months since last delinquent      0
Number of Open Accounts         514
Number of Credit Problems       514
Current Credit Balance          514
Maximum Open Credit             516
Bankruptcies                    718
Tax Liens                       524
dtype: int64

In [12]:
#removing null values
df_train1 = df_train.dropna()

In [13]:
df_train1.isnull().sum()

Loan ID                         0
Customer ID                     0
Loan Status                     0
Current Loan Amount             0
Term                            0
Credit Score                    0
Annual Income                   0
Years in current job            0
Home Ownership                  0
Purpose                         0
Monthly Debt                    0
Years of Credit History         0
Months since last delinquent    0
Number of Open Accounts         0
Number of Credit Problems       0
Current Credit Balance          0
Maximum Open Credit             0
Bankruptcies                    0
Tax Liens                       0
dtype: int64

In [14]:
df_train1.shape

(99794, 19)

In [15]:
df_train1.head()

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412.0,Short Term,709.0,1167493.0,8.0,Home Mortgage,Home Improvements,5214.74,17.2,34.901321,6.0,1.0,228190.0,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328.0,Short Term,1076.456089,1378277.0,15.0,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8.0,Own Home,Debt Consolidation,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666.0,Long Term,721.0,806949.0,3.0,Own Home,Debt Consolidation,8741.9,12.0,34.901321,9.0,0.0,256329.0,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220.0,Short Term,1076.456089,1378277.0,5.0,Rent,Debt Consolidation,20639.7,6.1,34.901321,15.0,0.0,253460.0,427174.0,0.0,0.0


In [16]:
#looking correlation 
df_train1.corr()

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Years in current job,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
Current Loan Amount,1.0,-0.095117,0.013078,-0.001337,-0.006571,0.019303,0.007542,0.001656,-0.002834,0.003992,-0.001126,-0.000611,-0.002039
Credit Score,-0.095117,1.0,-0.016989,-0.005534,-0.001457,-0.008652,-0.001999,0.005725,-0.002786,0.000168,-0.002073,-0.006258,0.004764
Annual Income,0.013078,-0.016989,1.0,0.072474,0.438551,0.144903,-0.041714,0.13158,-0.015489,0.284287,0.03899,-0.043115,0.037079
Years in current job,-0.001337,-0.005534,0.072474,1.0,0.130098,0.233219,-0.010785,0.051907,0.038074,0.102371,0.003754,0.037652,0.014956
Monthly Debt,-0.006571,-0.001457,0.438551,0.130098,1.0,0.199743,-0.039995,0.411185,-0.055744,0.481,0.039532,-0.078977,0.020025
Years of Credit History,0.019303,-0.008652,0.144903,0.233219,0.199743,1.0,-0.029013,0.132226,0.061887,0.208432,0.030742,0.066248,0.017345
Months since last delinquent,0.007542,-0.001999,-0.041714,-0.010785,-0.039995,-0.029013,1.0,-0.023085,0.077903,-0.01636,-0.002868,0.087431,0.00961
Number of Open Accounts,0.001656,0.005725,0.13158,0.051907,0.411185,0.132226,-0.023085,1.0,-0.014184,0.227776,0.030985,-0.024582,0.006499
Number of Credit Problems,-0.002834,-0.002786,-0.015489,0.038074,-0.055744,0.061887,0.077903,-0.014184,1.0,-0.112704,-0.012011,0.752941,0.581352
Current Credit Balance,0.003992,0.000168,0.284287,0.102371,0.481,0.208432,-0.01636,0.227776,-0.112704,1.0,0.138464,-0.122609,-0.015677


In [17]:
#short summary 
df_train1.describe()

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Years in current job,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
count,99794.0,99794.0,99794.0,99794.0,99794.0,99794.0,99794.0,99794.0,99794.0,99794.0,99794.0,99794.0,99794.0
mean,11757280.0,1076.309761,1378327.0,7.565572,18486.117117,18.193694,34.942123,11.13091,0.168577,294660.2,757526.5,0.117743,0.029371
std,31779850.0,1326.293916,971913.8,5.482931,12172.465813,7.015261,15.010645,5.007745,0.483087,376066.8,8349533.0,0.351427,0.258433
min,15422.0,585.0,76627.0,0.5,0.0,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,179696.0,711.0,931247.0,3.0,10228.3175,13.5,34.0,8.0,0.0,112769.8,273526.0,0.0,0.0
50%,312477.0,733.0,1370052.0,6.0,16237.21,16.9,34.901321,10.0,0.0,209912.0,467874.0,0.0,0.0
75%,525096.0,750.0,1512884.0,15.0,24025.215,21.7,34.901321,14.0,0.0,368068.0,782826.0,0.0,0.0
max,100000000.0,7510.0,165557400.0,15.0,435843.28,70.5,176.0,76.0,15.0,32878970.0,1539738000.0,7.0,15.0


In [18]:
# dropping columns
df_train1.drop(labels=['Loan ID', 'Customer ID'], axis=1, inplace=True)

In [19]:
df_train1.head()

Unnamed: 0,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,Fully Paid,445412.0,Short Term,709.0,1167493.0,8.0,Home Mortgage,Home Improvements,5214.74,17.2,34.901321,6.0,1.0,228190.0,416746.0,1.0,0.0
1,Fully Paid,262328.0,Short Term,1076.456089,1378277.0,15.0,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8.0,Own Home,Debt Consolidation,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,Fully Paid,347666.0,Long Term,721.0,806949.0,3.0,Own Home,Debt Consolidation,8741.9,12.0,34.901321,9.0,0.0,256329.0,386958.0,0.0,0.0
4,Fully Paid,176220.0,Short Term,1076.456089,1378277.0,5.0,Rent,Debt Consolidation,20639.7,6.1,34.901321,15.0,0.0,253460.0,427174.0,0.0,0.0


# Label Encoding

In [20]:
# label encoding the data 
from sklearn.preprocessing import LabelEncoder 
  
le = LabelEncoder() 
  
df_train1['Loan Status']= le.fit_transform(df_train1['Loan Status']) 
df_train1['Term']= le.fit_transform(df_train1['Term'])
df_train1['Purpose']= le.fit_transform(df_train1['Purpose']) 
df_train1['Home Ownership']= le.fit_transform(df_train1['Home Ownership'])

In [21]:
df_train1.head()

Unnamed: 0,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,1,445412.0,1,709.0,1167493.0,8.0,1,5,5214.74,17.2,34.901321,6.0,1.0,228190.0,416746.0,1.0,0.0
1,1,262328.0,1,1076.456089,1378277.0,15.0,1,3,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,1,99999999.0,1,741.0,2231892.0,8.0,2,3,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,1,347666.0,0,721.0,806949.0,3.0,2,3,8741.9,12.0,34.901321,9.0,0.0,256329.0,386958.0,0.0,0.0
4,1,176220.0,1,1076.456089,1378277.0,5.0,3,3,20639.7,6.1,34.901321,15.0,0.0,253460.0,427174.0,0.0,0.0


## Splitting data 

In [22]:
# in "X" we take all independent variables 
x = df_train1.drop(['Loan Status'],axis=1)
x.head()

Unnamed: 0,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,445412.0,1,709.0,1167493.0,8.0,1,5,5214.74,17.2,34.901321,6.0,1.0,228190.0,416746.0,1.0,0.0
1,262328.0,1,1076.456089,1378277.0,15.0,1,3,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,99999999.0,1,741.0,2231892.0,8.0,2,3,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,347666.0,0,721.0,806949.0,3.0,2,3,8741.9,12.0,34.901321,9.0,0.0,256329.0,386958.0,0.0,0.0
4,176220.0,1,1076.456089,1378277.0,5.0,3,3,20639.7,6.1,34.901321,15.0,0.0,253460.0,427174.0,0.0,0.0


In [23]:
# in "y" we take dependent variable
y = df_train1['Loan Status']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Loan Status, dtype: int32

## Feature Engineering

In [24]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

bestfeatures = SelectKBest(score_func=chi2, k=5)# selecting best 5 features
fit = bestfeatures.fit(x,y)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x.columns)

featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns

featureScores

Unnamed: 0,Specs,Score
0,Current Loan Amount,324529900000.0
1,Term,341.7139
2,Credit Score,27548680.0
3,Annual Income,149174100.0
4,Years in current job,32.37137
5,Home Ownership,133.3976
6,Purpose,0.0003503571
7,Monthly Debt,51485.38
8,Years of Credit History,150.9793
9,Months since last delinquent,54.28983


### Train test split

In [25]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.20,random_state=0)

In [26]:
x_train.shape

(79835, 16)

In [27]:
x_test.shape

(19959, 16)

In [28]:
y_train.shape

(79835,)

In [29]:
y_test.shape

(19959,)

## Standard scaler

In [30]:
sc_X=StandardScaler()
x_train=sc_X.fit_transform(x_train)
x_test = sc_X.fit_transform(x_test)

# LogisticRegression

In [31]:
# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(x_train,y_train)


y_pred=logreg.predict(x_test)

In [32]:
# import the metrics class
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[  918,  3619],
       [    6, 15416]], dtype=int64)

In [33]:
Accuracy=print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.818377674232176
Precision: 0.8098765432098766
Recall: 0.9996109454026715


In [34]:
F1_score=(2*metrics.precision_score(y_test, y_pred)*metrics.recall_score(y_test, y_pred))/(metrics.precision_score(y_test, y_pred)+metrics.recall_score(y_test, y_pred))

In [35]:
F1_score

0.8947964129204516

## RandomForestClassifier

In [36]:
# fit the model with data
classifier_ran = RandomForestClassifier()
classifier_ran.fit(x_train,y_train)
y_pred1=classifier_ran.predict(x_test)

In [37]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred1))
print("Precision:",metrics.precision_score(y_test, y_pred1))
print("Recall:",metrics.recall_score(y_test, y_pred1))

Accuracy: 0.7898191292148905
Precision: 0.8231163299372589
Recall: 0.927246790299572


## xgboost

In [38]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [39]:
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=100)

In [40]:
xgb.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [41]:
preds = xgb.predict(x_test)

In [42]:
print("Accuracy:",metrics.accuracy_score(y_test, preds))
print("Precision:",metrics.precision_score(y_test, preds))
print("Recall:",metrics.recall_score(y_test, preds))

Accuracy: 0.818377674232176
Precision: 0.8096813146427259
Recall: 1.0
