In [45]:
# Import required packages
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import classification_report as cr
import pickle

In [46]:
# load dataset
cc_apps=pd.read_csv("crx.data",delimiter=",",names=["Gender","Age","Debt","Married","BankCustomer","Educational","Ethinicity","YearsEmployed","PriorDefault","Employed","CreditScore","DriversLicense","Citizen","ZipCode","Income","ApprovalStatus"])

In [47]:
cc_apps.tail(17)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Educational,Ethinicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
673,?,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


In [48]:
cc_apps.describe()

Unnamed: 0,Debt,YearsEmployed,CreditScore,Income
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [49]:
cc_apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          690 non-null    object 
 1   Age             690 non-null    object 
 2   Debt            690 non-null    float64
 3   Married         690 non-null    object 
 4   BankCustomer    690 non-null    object 
 5   Educational     690 non-null    object 
 6   Ethinicity      690 non-null    object 
 7   YearsEmployed   690 non-null    float64
 8   PriorDefault    690 non-null    object 
 9   Employed        690 non-null    object 
 10  CreditScore     690 non-null    int64  
 11  DriversLicense  690 non-null    object 
 12  Citizen         690 non-null    object 
 13  ZipCode         690 non-null    object 
 14  Income          690 non-null    int64  
 15  ApprovalStatus  690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


In [50]:
cc_apps["Gender"].value_counts()

b    468
a    210
?     12
Name: Gender, dtype: int64

In [51]:
cc_apps.replace("?",np.nan,inplace=True)

In [52]:
cc_apps["Age"]=cc_apps["Age"].astype("float")

In [53]:
cc_apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          678 non-null    object 
 1   Age             678 non-null    float64
 2   Debt            690 non-null    float64
 3   Married         684 non-null    object 
 4   BankCustomer    684 non-null    object 
 5   Educational     681 non-null    object 
 6   Ethinicity      681 non-null    object 
 7   YearsEmployed   690 non-null    float64
 8   PriorDefault    690 non-null    object 
 9   Employed        690 non-null    object 
 10  CreditScore     690 non-null    int64  
 11  DriversLicense  690 non-null    object 
 12  Citizen         690 non-null    object 
 13  ZipCode         677 non-null    object 
 14  Income          690 non-null    int64  
 15  ApprovalStatus  690 non-null    object 
dtypes: float64(3), int64(2), object(11)
memory usage: 86.4+ KB


In [54]:
cc_apps.isna().sum()

Gender            12
Age               12
Debt               0
Married            6
BankCustomer       6
Educational        9
Ethinicity         9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode           13
Income             0
ApprovalStatus     0
dtype: int64

In [55]:
obj=cc_apps.columns[cc_apps.dtypes=="object"]

In [56]:
obj

Index(['Gender', 'Married', 'BankCustomer', 'Educational', 'Ethinicity',
       'PriorDefault', 'Employed', 'DriversLicense', 'Citizen', 'ZipCode',
       'ApprovalStatus'],
      dtype='object')

In [57]:
num=cc_apps.columns[cc_apps.dtypes!="object"]

In [58]:
num

Index(['Age', 'Debt', 'YearsEmployed', 'CreditScore', 'Income'], dtype='object')

In [59]:
obj_imp=SimpleImputer(strategy="most_frequent")
num_imp=SimpleImputer()

In [60]:
cc_apps[obj]=obj_imp.fit_transform(cc_apps[obj])
cc_apps[num]=num_imp.fit_transform(cc_apps[num])


In [61]:
cc_apps["ApprovalStatus"].replace({"+":1,"-":0},inplace=True)

In [62]:
cc_apps.isna().sum()

Gender            0
Age               0
Debt              0
Married           0
BankCustomer      0
Educational       0
Ethinicity        0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
ZipCode           0
Income            0
ApprovalStatus    0
dtype: int64

In [63]:
le=LabelEncoder()

for col in cc_apps.columns:
    if cc_apps[col].dtypes=="object":
        cc_apps[col]=le.fit_transform(cc_apps[col])
        

In [64]:
cc_apps.drop(["ZipCode","DriversLicense"],axis=1,inplace=True)

In [65]:
X=cc_apps.drop("ApprovalStatus",axis=1)
y=cc_apps["ApprovalStatus"]

In [66]:
x_train,x_test,y_train,y_test=tts(X,y,test_size=0.2,random_state=123,stratify=y)

In [67]:
scaler=MinMaxScaler(feature_range=(0,1))
scaled_x_train=scaler.fit_transform(x_train)
scaled_x_test=scaler.transform(x_test)

In [68]:
lr=LogisticRegression()

In [69]:
lr.fit(scaled_x_train,y_train)

In [70]:
y_pred=lr.predict(scaled_x_test)

In [71]:
lr.score(x_test,y_test)



0.4492753623188406

In [72]:
cm(y_test,y_pred)

array([[67, 10],
       [10, 51]], dtype=int64)

In [73]:
print(cr(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87        77
           1       0.84      0.84      0.84        61

    accuracy                           0.86       138
   macro avg       0.85      0.85      0.85       138
weighted avg       0.86      0.86      0.86       138



<b>Overall Accuracy of the model is 0.86<b><br>
<b>We can see that precision as well as recal and f1 score are near to 0.9 so we can say that our model is good fit<b>

Pickle is used to deploy models further

In [74]:
with open('ccapproval.pkl', 'wb') as f:
    pickle.dump(lr, f)

In [75]:
# import pickle
# from sklearn.metrics import confusion_matrix as cm
# from sklearn.metrics import classification_report as cr
# with open('ccapproval.pkl', 'rb') as f:
#     model1 = pickle.load(f)
# type(model1)
# y_predd=model1.predict(x_test)
# print(cm(y_test,y_predd))
# print(cr(y_test,y_predd))