# 1. Importing Libraries

In [59]:
import numpy as np
import pandas as pd 

# 2. Inspecting the applications

In [60]:
# Load dataset
column_names = ["A1", "A2","A3","A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A14", "A15","A16"]
df = pd.read_csv("datasets/crx.data", header= None, names=column_names)

# Inspect data
display(df.head())

# Inspect data information
display(df.info())

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      690 non-null    object 
 1   A2      690 non-null    object 
 2   A3      690 non-null    float64
 3   A4      690 non-null    object 
 4   A5      690 non-null    object 
 5   A6      690 non-null    object 
 6   A7      690 non-null    object 
 7   A8      690 non-null    float64
 8   A9      690 non-null    object 
 9   A10     690 non-null    object 
 10  A11     690 non-null    int64  
 11  A12     690 non-null    object 
 12  A13     690 non-null    object 
 13  A14     690 non-null    object 
 14  A15     690 non-null    int64  
 15  A16     690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


None

In [61]:
# Inspect missing values in the dataset
display(df.isna().sum())

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A14    0
A15    0
A16    0
dtype: int64

# 3. Handling the missing values

In [62]:
# Change age df1 to numuric
df['A2'] = pd.to_numeric(df['A2'], errors='coerce')

# Drop the features A14, that is zipcode
df = df.drop(['A14'], axis=1)

In [63]:
# Missing values
df = df.replace('?', np.NaN)
# Iterate over each column of cc_apps_train
for col in df:
    # Check if the column is of object type
    if df[col].dtypes == 'object':
        # Impute with the most frequent value
        df[col] = df[col].fillna(df[col].value_counts().index[0])
    else:
        df[col] = df[col].fillna(df[col].mean())

print(df.isnull().sum())

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A15    0
A16    0
dtype: int64


# 4. Preprocessing the data
Convert the non-numeric data into numeric.

In [64]:
df['A1'] = df['A1'].apply(lambda x: 0 if x=='a' else 1)
df['A9'] = df['A9'].apply(lambda x: 0 if x=='t' else 1)
df['A10'] = df['A10'].apply(lambda x: 0 if x=='t' else 1)
df['A12'] = df['A12'].apply(lambda x: 0 if x=='t' else 1)
df['A16'] = df['A16'].apply(lambda x: 0 if x=='+' else 1)
df = pd.get_dummies(df)
display(df)

Unnamed: 0,A1,A2,A3,A8,A9,A10,A11,A12,A15,A16,...,A7_ff,A7_h,A7_j,A7_n,A7_o,A7_v,A7_z,A13_g,A13_p,A13_s
0,1,30.83,0.000,1.25,0,0,1,1,0,0,...,0,0,0,0,0,1,0,1,0,0
1,0,58.67,4.460,3.04,0,0,6,1,560,0,...,0,1,0,0,0,0,0,1,0,0
2,0,24.50,0.500,1.50,0,1,0,1,824,0,...,0,1,0,0,0,0,0,1,0,0
3,1,27.83,1.540,3.75,0,0,5,0,3,0,...,0,0,0,0,0,1,0,1,0,0
4,1,20.17,5.625,1.71,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21.08,10.085,1.25,1,1,0,1,0,1,...,0,1,0,0,0,0,0,1,0,0
686,0,22.67,0.750,2.00,1,0,2,0,394,1,...,0,0,0,0,0,1,0,1,0,0
687,0,25.25,13.500,2.00,1,0,1,0,1,1,...,1,0,0,0,0,0,0,1,0,0
688,1,17.92,0.205,0.04,1,1,0,1,750,1,...,0,0,0,0,0,1,0,1,0,0


# Splitting the dataset into train and test sets
A16: ApprovalStatus is our target label.

In [65]:
# Import train_test_split
from sklearn.model_selection import train_test_split

y = df['A16'].values
X = df.drop(['A16'], axis=1).values
print(X.shape)
print(y.shape)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

(690, 41)
(690,)


# Fitting a logistic regression model to the train set


In [66]:
from sklearn.preprocessing import MinMaxScaler

# Instantiate MinMaxScaler and use it to rescale X_train and X_test feature_range
scaler = MinMaxScaler()
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)


In [67]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Fit logreg to the train set
logreg.fit(rescaledX_train, y_train)

LogisticRegression()

## Making predictions and evaluating performance

In [68]:
# Import confusion_matrix
from sklearn.metrics import confusion_matrix

# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test, y_test))

# Print the confusion matrix of the logreg model
print("confusion matrix: \n", confusion_matrix(y_test, y_pred))

Accuracy of logistic regression classifier:  0.8464912280701754
confusion matrix: 
 [[ 93  10]
 [ 25 100]]


In [69]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV
# Define the grid of values for tol and max_iter

tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
param_grid = dict(tol = tol , max_iter = max_iter)

<h3>6. Find a best model.</h3>
Instruct GridSearchCV() to perform a cross-validation of 5 folds.

In [70]:
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Fit grid_model to the data
grid_model_result = grid_model.fit(rescaledX_train, y_train)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

# Extract the best model and evaluate it on the test set
best_model = grid_model_result.best_estimator_
best_model.predict(rescaledX_test)

print("Accuracy of logistic regression classifier: ", best_model.score(rescaledX_test, y_test))

Best: 0.857083 using {'max_iter': 100, 'tol': 0.01}
Accuracy of logistic regression classifier:  0.8464912280701754
