In [1]:
# importing libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV


In [2]:
# reading csv file 
cc_apps=pd.read_csv('cc_approvals.csv')


In [3]:
# print summary description

cc_apps.head()



Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,01,f,g.1,00202,0.1,+
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+


### INSPECTING THE APPLICATIONS

In [4]:
# print statistics description

desc=cc_apps.describe()
print(desc)

print('\n')

# print data frame information

info=cc_apps.info()
print(info)

print('--'*50)

# inspecting missing values 

cc_apps.tail(17)


                0        1.25          01            0.1
count  689.000000  689.000000  689.000000     689.000000
mean     4.765631    2.224819    2.402032    1018.862119
std      4.978470    3.348739    4.866180    5213.743149
min      0.000000    0.000000    0.000000       0.000000
25%      1.000000    0.165000    0.000000       0.000000
50%      2.750000    1.000000    0.000000       5.000000
75%      7.250000    2.625000    3.000000     396.000000
max     28.000000   28.500000   67.000000  100000.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   b       689 non-null    object 
 1   30.83   689 non-null    object 
 2   0       689 non-null    float64
 3   u       689 non-null    object 
 4   g       689 non-null    object 
 5   w       689 non-null    object 
 6   v       689 non-null    object 
 7   1.25    689 non-null    float64
 8   t    

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,01,f,g.1,00202,0.1,+
672,?,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
673,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
674,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
675,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
676,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
677,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
678,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
679,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
680,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
681,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


### Handling the missing values---

In [5]:
# filling '?' to NaN 

cc_apps=cc_apps.replace(['?'],np.NaN)

# checking --

cc_apps.tail(17)

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,01,f,g.1,00202,0.1,+
672,,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
673,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
674,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
675,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
676,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
677,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
678,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
679,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
680,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
681,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


In [6]:
# convert the missing values with avg

cc_apps.fillna(cc_apps.mean(),inplace=True)
cc_apps['b'].tail(20)

669      b
670      b
671      a
672    NaN
673      a
674      a
675      a
676      b
677      a
678      a
679      b
680      b
681      b
682      b
683      b
684      b
685      a
686      a
687      b
688      b
Name: b, dtype: object

In [7]:
# Iterate over each column of cc_apps and making nan to avg value
cc_apps.fillna(method='ffill',inplace=True)

# print the count and check 

print(cc_apps.count())
cc_apps.tail(20)

b        689
30.83    689
0        689
u        689
g        689
w        689
v        689
1.25     689
t        689
t.1      689
01       689
f        689
g.1      689
00202    689
0.1      689
+        689
dtype: int64


Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,01,f,g.1,00202,0.1,+
669,b,47.17,5.835,u,g,w,v,5.5,f,f,0,f,g,465,150,-
670,b,25.83,12.835,u,g,cc,v,0.5,f,f,0,f,g,0,2,-
671,a,50.25,0.835,u,g,aa,v,0.5,f,f,0,t,g,240,117,-
672,a,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
673,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
674,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
675,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
676,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
677,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
678,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-


## Preprocessing 

In [8]:
# declaring lable encoder----

lb=LabelEncoder()

for item in cc_apps:
    if cc_apps[item].dtype=='object':
#         print('inside-- ',item)
        
        # Use LabelEncoder to do the numeric transformation

        cc_apps[item]=lb.fit_transform(cc_apps[item])
        

cc_apps.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   b       689 non-null    int32  
 1   30.83   689 non-null    int32  
 2   0       689 non-null    float64
 3   u       689 non-null    int32  
 4   g       689 non-null    int32  
 5   w       689 non-null    int32  
 6   v       689 non-null    int32  
 7   1.25    689 non-null    float64
 8   t       689 non-null    int32  
 9   t.1     689 non-null    int32  
 10  01      689 non-null    int64  
 11  f       689 non-null    int32  
 12  g.1     689 non-null    int32  
 13  00202   689 non-null    int32  
 14  0.1     689 non-null    int64  
 15  +       689 non-null    int32  
dtypes: float64(2), int32(12), int64(2)
memory usage: 54.0 KB


### split the data into train set and test set

In [9]:
# splitting the data into test and train set -------

cc_apps = cc_apps.drop([cc_apps.columns[11], cc_apps.columns[13]], axis=1)
cc_apps = cc_apps.values


x,y=cc_apps[:,0:13],cc_apps[:,13]

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.33, random_state=42)



In [15]:
# Instantiate MinMaxScaler and use it to rescale X_train and X_test


scaler = MinMaxScaler(feature_range=(0, 1))

rescaledx_train,rescaledx_test=scaler.fit_transform(x_train),scaler.fit_transform(x_test)



###  Fitting a logistic regression model to the train set


In [17]:
# fitting the trained data into logistic regression model

logreg = LogisticRegression()

# Fit logreg to the train set

logreg.fit(rescaledx_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Making Predictions

In [19]:
# Use logreg to predict instances from the test set and store it
y_pred=logreg.predict(rescaledx_test)

# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(rescaledx_test, y_test))

# Print the confusion matrix of the logreg model
confusion_matrix(y_pred, y_test)



Accuracy of logistic regression classifier:  0.868421052631579


array([[ 95,  25],
       [  5, 103]], dtype=int64)

##  Grid searching and making the model perform better

In [33]:
# Define the grid of values for tol and max_iter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)
print(param_grid)

{'tol': [0.01, 0.001, 0.0001], 'max_iter': [100, 150, 200]}


In [34]:
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

rescaledx=scaler.fit_transform(x,y)

res=grid_model.fit(rescaledx,y)

# Summarize results
best_score, best_params = res.best_score_, res.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.850640 using {'max_iter': 100, 'tol': 0.01}
