In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [2]:

pd.pandas.set_option('display.max_columns', None)

In [3]:
df=pd.read_csv('loan.csv')

In [4]:
set(df['home_ownership'])

{'ANY', 'MORTGAGE', 'NONE', 'OTHER', 'OWN', 'RENT'}

In [5]:
df.columns

Index(['loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership',
       'annual_inc', 'purpose', 'addr_state', 'dti', 'delinq_2yrs',
       'revol_util', 'total_acc', 'bad_loan', 'longest_credit_length',
       'verification_status'],
      dtype='object')

In [6]:
## Always remember there way always be a chance of data leakage so we need to split the data first and then apply feature
## Engineering
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df,df['bad_loan'],test_size=0.1,random_state=0)

In [7]:
X_train.shape, X_test.shape

((147588, 15), (16399, 15))

In [8]:
# ## Let us capture all the nan values
# ## First lets handle Categorical features which are missing
# # features_nan=[feature for feature in dataset.columns if df[feature].isnull().sum()>1 and df[feature].dtypes=='O']

# for feature in features_nan:
#     print("{}: {}% missing values".format(feature,np.round(df[feature].isnull().mean(),4)))

In [9]:
## Let us capture all the nan values
## First lets handle Categorical features which are missing
cat_feat = [feature for feature in df.columns if df[feature].dtype == 'O']
for feature in cat_feat:
    print("{}: {}% missing values".format(feature,np.round(df[feature].isnull().mean())))

term: 0.0% missing values
home_ownership: 0.0% missing values
purpose: 0.0% missing values
addr_state: 0.0% missing values
verification_status: 0.0% missing values


In [10]:
cat_feat

['term', 'home_ownership', 'purpose', 'addr_state', 'verification_status']

In [11]:
df.head(50)

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,addr_state,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,verification_status
0,5000,36 months,10.65,10.0,RENT,24000.0,credit_card,AZ,27.65,0.0,83.7,9.0,0,26.0,verified
1,2500,60 months,15.27,0.0,RENT,30000.0,car,GA,1.0,0.0,9.4,4.0,1,12.0,verified
2,2400,36 months,15.96,10.0,RENT,12252.0,small_business,IL,8.72,0.0,98.5,10.0,0,10.0,not verified
3,10000,36 months,13.49,10.0,RENT,49200.0,other,CA,20.0,0.0,21.0,37.0,0,15.0,verified
4,5000,36 months,7.9,3.0,RENT,36000.0,wedding,AZ,11.2,0.0,28.3,12.0,0,7.0,verified
5,3000,36 months,18.64,9.0,RENT,48000.0,car,CA,5.35,0.0,87.5,4.0,0,4.0,verified
6,5600,60 months,21.28,4.0,OWN,40000.0,small_business,CA,5.55,0.0,32.6,13.0,1,7.0,verified
7,5375,60 months,12.69,0.0,RENT,15000.0,other,TX,18.08,0.0,36.5,3.0,1,7.0,verified
8,6500,60 months,14.65,5.0,OWN,72000.0,debt_consolidation,AZ,16.12,0.0,20.6,23.0,0,13.0,not verified
9,12000,36 months,12.69,10.0,OWN,75000.0,debt_consolidation,CA,10.78,0.0,67.1,34.0,0,22.0,verified


In [12]:
num_feat = [feature for feature in df.columns if df[feature].dtype != 'O']
for feature in num_feat:
    print("{}: {}% missing values".format(feature,np.round(df[feature].isnull().mean())))


loan_amnt: 0.0% missing values
int_rate: 0.0% missing values
emp_length: 0.0% missing values
annual_inc: 0.0% missing values
dti: 0.0% missing values
delinq_2yrs: 0.0% missing values
revol_util: 0.0% missing values
total_acc: 0.0% missing values
bad_loan: 0.0% missing values
longest_credit_length: 0.0% missing values


In [13]:
# from sklearn.preprocessing import LabelEncoder
# label = LabelEncoder()
#df[cat_feat] = df[cat_feat].apply(label.fit_transform)
# list(df[cat_feat].classes_)

# list(df[cat_feat].inverse_transform([2, 2, 1]))


from sklearn import preprocessing
label = preprocessing.LabelEncoder()
df[cat_feat] = df[cat_feat].apply(label.fit_transform)

In [14]:
df[cat_feat].columns


Index(['term', 'home_ownership', 'purpose', 'addr_state',
       'verification_status'],
      dtype='object')

In [15]:
df.head(100)

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,addr_state,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,verification_status
0,5000,0,10.65,10.0,5,24000.0,1,3,27.65,0.0,83.7,9.0,0,26.0,1
1,2500,1,15.27,0.0,5,30000.0,0,10,1.00,0.0,9.4,4.0,1,12.0,1
2,2400,0,15.96,10.0,5,12252.0,11,14,8.72,0.0,98.5,10.0,0,10.0,0
3,10000,0,13.49,10.0,5,49200.0,9,4,20.00,0.0,21.0,37.0,0,15.0,1
4,5000,0,7.90,3.0,5,36000.0,13,3,11.20,0.0,28.3,12.0,0,7.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,24000,0,10.65,10.0,5,45000.0,2,4,14.80,0.0,55.2,9.0,0,21.0,1
96,6000,0,13.49,10.0,5,80000.0,2,4,11.58,0.0,83.8,16.0,0,9.0,0
97,6000,0,13.49,5.0,5,50000.0,2,3,15.38,0.0,91.5,7.0,0,7.0,0
98,2100,0,12.42,10.0,1,30000.0,12,18,19.72,0.0,82.9,26.0,0,16.0,0


In [16]:
set(df['term'])

{0, 1}

In [17]:
df[cat_feat].to_csv("Encodeddata.csv")

In [18]:
df.dropna(inplace=True)
#break and convert array form InDependent data
x =df.drop('bad_loan',axis=1).values
                 
                 
# #break and convert array from Denpendent data
y= df['bad_loan']

In [19]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score,classification_report, confusion_matrix
import warnings
logmodel = LogisticRegression (class_weight='balanced')
logmodel.fit(x_train,y_train)
# #Prediction
y_pred =logmodel.predict(x_test)

warnings.filterwarnings("ignore")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred)*100)
print(f1_score(y_test, y_pred, average='macro')*100)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.89      0.54      0.67     25951
           1       0.25      0.70      0.36      5649

    accuracy                           0.56     31600
   macro avg       0.57      0.62      0.52     31600
weighted avg       0.77      0.56      0.61     31600

[[13922 12029]
 [ 1722  3927]]
56.48417721518987
51.6467171023167


In [21]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score,classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

classifier_nb =GaussianNB()
classifier_nb.fit(x_train,y_train)
#for prediction 
y_pred =classifier_nb.predict(x_test)
#for checking purpose confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)
print(accuracy_score(y_test, y_pred)*100)
# print(metrics.accuracy_score(y_test, y_pred)*100,'%')
print(f1_score(y_test, y_pred, average='macro')*100)

81.14556962025317
54.135457585003564


In [22]:
from sklearn.ensemble import RandomForestClassifier
classifier_rf= RandomForestClassifier(criterion ="entropy")
classifier_rf.fit(x_train,y_train)

#for prediction 
y_pred =classifier_rf.predict(x_test)
#for checking purpose confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)
print(accuracy_score(y_test, y_pred)*100)

print(f1_score(y_test, y_pred, average='macro')*100)

82.21518987341773
48.606449132429105


In [23]:
# Fitting XGBoost to the Training set
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline
classifier = XGBClassifier()
classifier.fit(x_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(x_test)

# Making the Confusion Matrix
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
cm = confusion_matrix(y_test, y_pred)
print("Accuracy",metrics.accuracy_score(y_test, y_pred)*100,'%')
print("F1score",f1_score(y_test, y_pred, average='macro')*100)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator = classifier, X = x_train, y = y_train, cv = 10)
print(accuracies)
print(accuracies.mean())
print(accuracies.std())
print("F1score",f1_score(y_test, y_pred, average='macro')*100)

Accuracy 82.23417721518987 %
F1score 50.483267835920095
[0.81946203 0.81914557 0.81716772 0.81803797 0.82088608 0.82080696
 0.81675765 0.81897302 0.8208719  0.81944774]
0.8191556649550666
0.0014082191363037088
F1score 50.483267835920095


In [24]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
from sklearn import tree
clf.fit(x_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(x_test)
print("Accuracy",metrics.accuracy_score(y_test, y_pred)*100,'%')

Accuracy 82.23417721518987 %


In [29]:
import pickle
filename = 'Dt_loan.pkl'
pickle.dump(clf, open(filename, 'wb'))

In [30]:
classifier = pickle.load(open(filename, 'rb'))