# Part 1:

## Importing Libraries and Dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, confusion_matrix, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Capstone 2/data_clean.csv')
df.head()

Unnamed: 0,male,age,debt,married,bank_customer,education_level,ethnicity,years_employed,prior_default,employed,credit_score,drivers_license,citizen,zip_code,income,approval_status
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             690 non-null    object 
 1   age              690 non-null    float64
 2   debt             690 non-null    float64
 3   married          690 non-null    object 
 4   bank_customer    690 non-null    object 
 5   education_level  690 non-null    object 
 6   ethnicity        690 non-null    object 
 7   years_employed   690 non-null    float64
 8   prior_default    690 non-null    object 
 9   employed         690 non-null    object 
 10  credit_score     690 non-null    int64  
 11  drivers_license  690 non-null    object 
 12  citizen          690 non-null    object 
 13  zip_code         690 non-null    object 
 14  income           690 non-null    int64  
 15  approval_status  690 non-null    int64  
dtypes: float64(3), int64(3), object(10)
memory usage: 86.4+ KB


In [5]:
missing = pd.concat([df.sum().isnull(), 100 * df.isnull().mean()], axis=1)
missing.columns = ['count', '%']
missing.sort_values(by = '%')

Unnamed: 0,count,%
male,False,0.0
age,False,0.0
debt,False,0.0
married,False,0.0
bank_customer,False,0.0
education_level,False,0.0
ethnicity,False,0.0
years_employed,False,0.0
prior_default,False,0.0
employed,False,0.0


In [6]:
for col in df:
    print(f'{col}: {df[col].unique()}', '\n')

male: ['b' 'a'] 

age: [30.83       58.67       24.5        27.83       20.17       32.08
 33.17       22.92       54.42       42.5        22.08       29.92
 38.25       48.08       45.83       36.67       28.25       23.25
 21.83       19.17       25.         47.75       27.42       41.17
 15.83       47.         56.58       57.42       42.08       29.25
 42.         49.5        36.75       22.58       27.25       23.
 27.75       54.58       34.17       28.92       29.67       39.58
 56.42       54.33       41.         31.92       41.5        23.92
 25.75       26.         37.42       34.92       34.25       23.33
 23.17       44.33       35.17       43.25       56.75       31.67
 23.42       20.42       26.67       36.         25.5        19.42
 32.33       34.83       38.58       44.25       44.83       20.67
 34.08       21.67       21.5        49.58       27.67       39.83
 31.56817109 37.17       25.67       34.         49.         62.5
 31.42       52.33       28.75       28.58

The missing values were successfully handled in a prior notebook.

There is still a little but essential pre-processing to perform before we start building our machine learning model. The tasks include

1. Convert the non-numeric data into numeric. 
2. Split the data into test and training sets.
3. Scale the features to a uniform range.

We being by converting all of the non-numeric data into numeric. Many machine learning models require the data to be in stricly numeric format. The will also result in faster computations. We will use _label encoding_ to accomplish this task.

## Convert the non-numeric data into numeric.

In [7]:
# Instantiate LabelEncoder
le = LabelEncoder()

for col in df:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             690 non-null    int64  
 1   age              690 non-null    float64
 2   debt             690 non-null    float64
 3   married          690 non-null    int64  
 4   bank_customer    690 non-null    int64  
 5   education_level  690 non-null    int64  
 6   ethnicity        690 non-null    int64  
 7   years_employed   690 non-null    float64
 8   prior_default    690 non-null    int64  
 9   employed         690 non-null    int64  
 10  credit_score     690 non-null    int64  
 11  drivers_license  690 non-null    int64  
 12  citizen          690 non-null    int64  
 13  zip_code         690 non-null    int64  
 14  income           690 non-null    int64  
 15  approval_status  690 non-null    int64  
dtypes: float64(3), int64(13)
memory usage: 86.4 KB


## Spliting the data into test and training sets

Now, we will split our data into train set and test set to prepare our data for two different phases of machine learning modeling: training and testing. 

Moreover, features like `drivers_license` and `zip_code` are not as important as the other features in the dataset for predicting credit card approvals. We should drop them to design our machine learning model with the best set of features.

In [9]:
# Drop the drivers_license and zip_code features
df = df.drop(['drivers_license', 'zip_code'], axis = 1)
df = df.values

# Segregate features and labels into seperate variables
X, y = df[:,0:13], df[:,-1]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

Now, we are only left with one final preprocessing step of scaling before we can fit a machine learning model to the data.

## Scale the features to a uniform range

The data is now split into two separate sets — train and test sets respectively. We are only left with one final pre-processing step of scaling before we can fit a machine learning model to the data.

Now, let’s try to understand what these scaled values mean in the real world. Let’s use `credit_score` as an example. The credit score of a person is their credit worthiness based on their credit history. The higher this number, the more financially trustworthy a person is considered to be. So, a `credit_score` of 1 is the highest since we're rescaling all the values to the range of 0-1.

In [10]:
# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
X_train

array([[0.00000000e+00, 7.41269841e-02, 3.51243592e-01, ...,
        5.97014925e-02, 0.00000000e+00, 5.40000000e-03],
       [1.00000000e+00, 5.28571429e-02, 3.22764382e-03, ...,
        0.00000000e+00, 0.00000000e+00, 7.22000000e-03],
       [1.00000000e+00, 1.38888889e-01, 4.36681223e-01, ...,
        0.00000000e+00, 0.00000000e+00, 4.00000000e-02],
       ...,
       [1.00000000e+00, 3.78253968e-01, 0.00000000e+00, ...,
        0.00000000e+00, 5.00000000e-01, 0.00000000e+00],
       [1.00000000e+00, 8.33333333e-02, 0.00000000e+00, ...,
        5.97014925e-02, 0.00000000e+00, 1.00000000e-05],
       [1.00000000e+00, 7.80952381e-02, 1.89861401e-01, ...,
        2.98507463e-02, 0.00000000e+00, 3.80000000e-04]])

In [12]:
X_test

array([[0.        , 0.28282811, 0.05695842, ..., 0.02985075, 0.        ,
        0.00105   ],
       [0.        , 0.51190476, 0.15188912, ..., 0.        , 0.        ,
        0.0096    ],
       [1.        , 0.09920635, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.25666667, 0.06967913, ..., 0.        , 0.        ,
        0.002     ],
       [0.        , 0.37428571, 0.0949307 , ..., 0.        , 0.        ,
        0.00246   ],
       [0.        , 0.14412698, 0.08695652, ..., 0.10447761, 0.        ,
        0.02384   ]])

# Building the Model

## Logistic Regression

In [13]:
# Instantiate logistic regression model
logreg = LogisticRegression(random_state=42)

# Fit the to the train
logreg.fit(X_train, y_train)

LogisticRegression(random_state=42)

In [14]:
# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(X_test)

In [15]:
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)


results = pd.DataFrame([['Logistic Regression', acc, f1, prec, rec]],
               columns = ['Model', 'Accuracy', 'F1 Score', 'Precision', 'Recall'])

results

Unnamed: 0,Model,Accuracy,F1 Score,Precision,Recall
0,Logistic Regression,0.84058,0.84058,0.790909,0.896907


In [16]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[87 23]
 [10 87]]


### Cross-Validation

In [17]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=logreg, X=X_train, y=y_train, cv=10)

print("Accuracy is {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation is {:.2f} %".format(accuracies.std()*100))

Accuracy is 86.13 %
Standard Deviation is 5.71 %


## Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [19]:
y_pred = random_forest.predict(X_test)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

model_results = pd.DataFrame([['Random Forest', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])


results = results.append(model_results, ignore_index = True)
results

Unnamed: 0,Model,Accuracy,F1 Score,Precision,Recall
0,Logistic Regression,0.84058,0.84058,0.790909,0.896907
1,Random Forest,0.855072,0.845361,0.845361,0.845361


In [20]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[95 15]
 [15 82]]


### Cross Validation

In [21]:
accuracies = cross_val_score(estimator=random_forest, X=X_train, y=y_train, cv=10)

print("Accuracy is {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation is {:.2f} %".format(accuracies.std()*100))

Accuracy is 86.32 %
Standard Deviation is 5.01 %


## XGBoost

In [22]:
from xgboost import XGBClassifier
classifier_xgb = XGBClassifier(random_state=42)
classifier_xgb.fit(X_train, y_train)

XGBClassifier(random_state=42)

In [23]:
y_pred = classifier_xgb.predict(X_test)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

model_results = pd.DataFrame([['XGBoost', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])


results = results.append(model_results, ignore_index = True)
results

Unnamed: 0,Model,Accuracy,F1 Score,Precision,Recall
0,Logistic Regression,0.84058,0.84058,0.790909,0.896907
1,Random Forest,0.855072,0.845361,0.845361,0.845361
2,XGBoost,0.850242,0.844221,0.823529,0.865979


In [24]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[92 18]
 [13 84]]


### Cross Validation

In [25]:
accuracies = cross_val_score(estimator=classifier_xgb, X=X_train, y=y_train, cv=10)

print("Accuracy is {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation is {:.2f} %".format(accuracies.std()*100))

Accuracy is 85.50 %
Standard Deviation is 5.38 %


## KNN

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [27]:
y_pred = knn.predict(X_test)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

model_results = pd.DataFrame([['Knn', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])


results = results.append(model_results, ignore_index = True)
results

Unnamed: 0,Model,Accuracy,F1 Score,Precision,Recall
0,Logistic Regression,0.84058,0.84058,0.790909,0.896907
1,Random Forest,0.855072,0.845361,0.845361,0.845361
2,XGBoost,0.850242,0.844221,0.823529,0.865979
3,Knn,0.874396,0.864583,0.873684,0.85567


### Cross-Validation

In [28]:
accuracies = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=10)

print("Accuracy is {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation is {:.2f} %".format(accuracies.std()*100))

Accuracy is 85.29 %
Standard Deviation is 4.96 %


The Knn model has performed the best in terms of accuracy and F1 Score. On Cross-Validation it performed about as well as the other models.

## Decision Trees

In [29]:
# Use the default and check performance
from sklearn import tree

# Declare a variable called entr_model and use tree.DecisionTreeClassifier. 
tree = tree.DecisionTreeClassifier(random_state = 42)

# Call fit() on entr_model
tree.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [30]:
# Call predict() on entr_model with X_test passed to it, and assign the result to a variable y_pred 
y_pred = tree.predict(X_test)

# Call Series on our y_pred variable with the following: pd.Series(y_pred)
# y_pred = pd.Series(y_pred)

In [31]:
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

model_results = pd.DataFrame([['Decision Tree', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])


results = results.append(model_results, ignore_index = True)
results

Unnamed: 0,Model,Accuracy,F1 Score,Precision,Recall
0,Logistic Regression,0.84058,0.84058,0.790909,0.896907
1,Random Forest,0.855072,0.845361,0.845361,0.845361
2,XGBoost,0.850242,0.844221,0.823529,0.865979
3,Knn,0.874396,0.864583,0.873684,0.85567
4,Decision Tree,0.869565,0.850829,0.916667,0.793814


### Cross-Validation

In [32]:
accuracies = cross_val_score(estimator=tree, X=X_train, y=y_train, cv=10)

print("Accuracy is {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation is {:.2f} %".format(accuracies.std()*100))

Accuracy is 83.65 %
Standard Deviation is 4.27 %


## Model Decision

In terms of cross-validation: Logistic Regression and Random forest perform the best in that they have the highest scores. In terms of the F1 Score KNN performed the best out of all the models. We elect to go with the Random Forest Model. 

# Search to Find the Best Parameters (Random Forest)

In [33]:
from pprint import pprint

print('Parameters currently in use:\n')
pprint(random_forest.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


We will try to adjust the following set of hyperparameters:

* `n_estimators` = number of trees in the foreset
* `max_features` = max number of features considered for splitting a node
* `max_depth` = max number of levels in each decision tree
* `min_samples_split` = min number of data points placed in a node before the node is split
* `min_samples_leaf` = min number of data points allowed in a leaf node
* `bootstrap` = method for sampling data points (with or without replacement)

In [34]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [36]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [80, 90, 100, 110],
                         'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             verbose=2)

In [37]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 3,
 'min_samples_leaf': 5,
 'min_samples_split': 8,
 'n_estimators': 200}

In [38]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=90, max_features=3, min_samples_leaf=5,
                       min_samples_split=8, n_estimators=200)

In [39]:
grid_search.best_score_

0.8778467908902692

In [40]:
clf = RandomForestClassifier(max_depth=90, max_features=3, min_samples_leaf=5,
                       min_samples_split=8, n_estimators=200)
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=90, max_features=3, min_samples_leaf=5,
                       min_samples_split=8, n_estimators=200)

In [41]:
y_pred = clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

model_results = pd.DataFrame([['Final Random Forest', acc, f1, prec, rec]],
                       columns=['Model', "Accuracy", 'F1 Score', 'Precision', 'Recall'])
results = results.append(model_results, ignore_index=True)
results

Unnamed: 0,Model,Accuracy,F1 Score,Precision,Recall
0,Logistic Regression,0.84058,0.84058,0.790909,0.896907
1,Random Forest,0.855072,0.845361,0.845361,0.845361
2,XGBoost,0.850242,0.844221,0.823529,0.865979
3,Knn,0.874396,0.864583,0.873684,0.85567
4,Decision Tree,0.869565,0.850829,0.916667,0.793814
5,Final Random Forest,0.859903,0.849741,0.854167,0.845361


In [42]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[96 14]
 [15 82]]
