# Part 1:

## Importing Libraries and Dataset

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, confusion_matrix, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression

In [2]:
df = pd.read_csv('data_clean.csv')
df.head()

Unnamed: 0,male,age,debt,married,bank_customer,education_level,ethnicity,years_employed,prior_default,employed,credit_score,drivers_license,citizen,zip_code,income,approval_status
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             690 non-null    object 
 1   age              690 non-null    float64
 2   debt             690 non-null    float64
 3   married          690 non-null    object 
 4   bank_customer    690 non-null    object 
 5   education_level  690 non-null    object 
 6   ethnicity        690 non-null    object 
 7   years_employed   690 non-null    float64
 8   prior_default    690 non-null    object 
 9   employed         690 non-null    object 
 10  credit_score     690 non-null    int64  
 11  drivers_license  690 non-null    object 
 12  citizen          690 non-null    object 
 13  zip_code         690 non-null    object 
 14  income           690 non-null    int64  
 15  approval_status  690 non-null    int64  
dtypes: float64(3), int64(3), object(10)
memory usage: 86.4+ KB


In [4]:
missing = pd.concat([df.sum().isnull(), 100 * df.isnull().mean()], axis=1)
missing.columns = ['count', '%']
missing.sort_values(by = '%')

Unnamed: 0,count,%
male,False,0.0
age,False,0.0
debt,False,0.0
married,False,0.0
bank_customer,False,0.0
education_level,False,0.0
ethnicity,False,0.0
years_employed,False,0.0
prior_default,False,0.0
employed,False,0.0


In [5]:
for col in df:
    print(f'{col}: {df[col].unique()}', '\n')

male: ['b' 'a'] 

age: [30.83       58.67       24.5        27.83       20.17       32.08
 33.17       22.92       54.42       42.5        22.08       29.92
 38.25       48.08       45.83       36.67       28.25       23.25
 21.83       19.17       25.         47.75       27.42       41.17
 15.83       47.         56.58       57.42       42.08       29.25
 42.         49.5        36.75       22.58       27.25       23.
 27.75       54.58       34.17       28.92       29.67       39.58
 56.42       54.33       41.         31.92       41.5        23.92
 25.75       26.         37.42       34.92       34.25       23.33
 23.17       44.33       35.17       43.25       56.75       31.67
 23.42       20.42       26.67       36.         25.5        19.42
 32.33       34.83       38.58       44.25       44.83       20.67
 34.08       21.67       21.5        49.58       27.67       39.83
 31.56817109 37.17       25.67       34.         49.         62.5
 31.42       52.33       28.75       28.58

The missing values were successfully handled in a prior notebook.

There is still a little but essential pre-processing to perform before we start building our machine learning model. The tasks include

1. Convert the non-numeric data into numeric. 
2. Split the data into test and training sets.
3. Scale the features to a uniform range.

We being by converting all of the non-numeric data into numeric. Many machine learning models require the data to be in stricly numeric format. The will also result in faster computations. We will use _label encoding_ to accomplish this task.

## Convert the non-numeric data into numeric.

In [6]:
# Instantiate LabelEncoder
le = LabelEncoder()

for col in df:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             690 non-null    int64  
 1   age              690 non-null    float64
 2   debt             690 non-null    float64
 3   married          690 non-null    int64  
 4   bank_customer    690 non-null    int64  
 5   education_level  690 non-null    int64  
 6   ethnicity        690 non-null    int64  
 7   years_employed   690 non-null    float64
 8   prior_default    690 non-null    int64  
 9   employed         690 non-null    int64  
 10  credit_score     690 non-null    int64  
 11  drivers_license  690 non-null    int64  
 12  citizen          690 non-null    int64  
 13  zip_code         690 non-null    int64  
 14  income           690 non-null    int64  
 15  approval_status  690 non-null    int64  
dtypes: float64(3), int64(13)
memory usage: 86.4 KB


## Spliting the data into test and training sets

Now, we will split our data into train set and test set to prepare our data for two different phases of machine learning modeling: training and testing. 

Moreover, features like `drivers_license` and `zip_code` are not as important as the other features in the dataset for predicting credit card approvals. We should drop them to design our machine learning model with the best set of features.

In [8]:
# Drop the drivers_license and zip_code features
df = df.drop(['drivers_license', 'zip_code'], axis = 1)
df = df.values

# Segregate features and labels into seperate variables
X, y = df[:,0:13], df[:,-1]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

Now, we are only left with one final preprocessing step of scaling before we can fit a machine learning model to the data.

## Scale the features to a uniform range

The data is now split into two separate sets — train and test sets respectively. We are only left with one final pre-processing step of scaling before we can fit a machine learning model to the data.

Now, let’s try to understand what these scaled values mean in the real world. Let’s use `credit_score` as an example. The credit score of a person is their credit worthiness based on their credit history. The higher this number, the more financially trustworthy a person is considered to be. So, a `credit_score` of 1 is the highest since we're rescaling all the values to the range of 0-1.

In [9]:
# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
X_train

array([[0.00000000e+00, 7.41269841e-02, 3.51243592e-01, ...,
        5.97014925e-02, 0.00000000e+00, 5.40000000e-03],
       [1.00000000e+00, 5.28571429e-02, 3.22764382e-03, ...,
        0.00000000e+00, 0.00000000e+00, 7.22000000e-03],
       [1.00000000e+00, 1.38888889e-01, 4.36681223e-01, ...,
        0.00000000e+00, 0.00000000e+00, 4.00000000e-02],
       ...,
       [1.00000000e+00, 3.78253968e-01, 0.00000000e+00, ...,
        0.00000000e+00, 5.00000000e-01, 0.00000000e+00],
       [1.00000000e+00, 8.33333333e-02, 0.00000000e+00, ...,
        5.97014925e-02, 0.00000000e+00, 1.00000000e-05],
       [1.00000000e+00, 7.80952381e-02, 1.89861401e-01, ...,
        2.98507463e-02, 0.00000000e+00, 3.80000000e-04]])

In [11]:
X_test

array([[0.        , 0.28282811, 0.05695842, ..., 0.02985075, 0.        ,
        0.00105   ],
       [0.        , 0.51190476, 0.15188912, ..., 0.        , 0.        ,
        0.0096    ],
       [1.        , 0.09920635, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.25666667, 0.06967913, ..., 0.        , 0.        ,
        0.002     ],
       [0.        , 0.37428571, 0.0949307 , ..., 0.        , 0.        ,
        0.00246   ],
       [0.        , 0.14412698, 0.08695652, ..., 0.10447761, 0.        ,
        0.02384   ]])

## Decision Trees

### Entropy Model - No max_depth

In [32]:
# Use the default and check performance

from sklearn import tree

# Declare a variable called entr_model and use tree.DecisionTreeClassifier. 
entr_model = tree.DecisionTreeClassifier(criterion="entropy", random_state = 42)

# Call fit() on entr_model
entr_model.fit(X_train, y_train)

# Call predict() on entr_model with X_test passed to it, and assign the result to a variable y_pred 
y_pred = entr_model.predict(X_test)

# Call Series on our y_pred variable with the following: pd.Series(y_pred)
y_pred = pd.Series(y_pred)

# Check out entr_model
entr_model

DecisionTreeClassifier(criterion='entropy', random_state=42)

In [36]:
print("Model Entropy - no max depth")
print("Accuracy:", accuracy_score(y_test,y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_test,y_pred))
print('Precision score for "Approved"' , precision_score(y_test,y_pred, pos_label = 1))
print('Precision score for "Disapproved"' , precision_score(y_test,y_pred, pos_label = 0))
print('Recall score for "Approved"' , recall_score(y_test,y_pred, pos_label = 1))
print('Recall score for "Disapproved"' , recall_score(y_test,y_pred, pos_label = 0))

Model Entropy - no max depth
Accuracy: 0.8164251207729468
Balanced accuracy: 0.813261480787254
Precision score for "Approved" 0.8314606741573034
Precision score for "Disapproved" 0.8050847457627118
Recall score for "Approved" 0.7628865979381443
Recall score for "Disapproved" 0.8636363636363636


### Gini Impurity Model - no max_depth

In [37]:
# Make a variable called gini_model, and assign it exactly what you assigned entr_model with above, but with the
# criterion changed to 'gini'
gini_model = tree.DecisionTreeClassifier(criterion="gini", random_state = 42)

# Call fit() on the gini_model as you did with the entr_model
gini_model.fit(X_train, y_train)

# Call predict() on the gini_model as you did with the entr_model 
y_pred = gini_model.predict(X_test)

# Turn y_pred into a series, as before
y_pred = pd.Series(y_pred)

# Check out gini_model
gini_model

DecisionTreeClassifier(random_state=42)

In [38]:
print("Model Gini impurity model - no max depth")
print("Accuracy:", accuracy_score(y_test,y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_test,y_pred))
print('Precision score for "Approved"' , precision_score(y_test,y_pred, pos_label = 1))
print('Precision score for "Disapproved"' , precision_score(y_test,y_pred, pos_label = 0))
print('Recall score for "Approved"' , recall_score(y_test,y_pred, pos_label = 1))
print('Recall score for "Disapproved"' , recall_score(y_test,y_pred, pos_label = 0))

Model Gini impurity model - no max depth
Accuracy: 0.8695652173913043
Balanced accuracy: 0.8650890346766635
Precision score for "Approved" 0.9166666666666666
Precision score for "Disapproved" 0.8373983739837398
Recall score for "Approved" 0.7938144329896907
Recall score for "Disapproved" 0.9363636363636364


### Entropy Model - max_depth 3 

In [39]:
entr_model2 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=1234)
entr_model2.fit(X_train, y_train)
y_pred = entr_model2.predict(X_test)
y_pred = pd.Series(y_pred)
entr_model2

DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=1234)

In [40]:
print("Model Entropy model - max depth 3")
print("Accuracy:", accuracy_score(y_test,y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_test,y_pred))
print('Precision score for "Approved"' , precision_score(y_test,y_pred, pos_label = 1))
print('Precision score for "Disapproved"' , precision_score(y_test,y_pred, pos_label = 0))
print('Recall score for "Approved"' , recall_score(y_test,y_pred, pos_label = 1))
print('Recall score for "Disapproved"' , recall_score(y_test,y_pred, pos_label = 0))

Model Entropy model - max depth 3
Accuracy: 0.8405797101449275
Balanced accuracy: 0.8451265229615745
Precision score for "Approved" 0.7807017543859649
Precision score for "Disapproved" 0.9139784946236559
Recall score for "Approved" 0.9175257731958762
Recall score for "Disapproved" 0.7727272727272727


### Gini impurity model - max_depth 3

In [41]:
gini_model2 = tree.DecisionTreeClassifier(criterion ='gini', random_state = 1234, max_depth  = 3 )

 
gini_model2.fit(X_train, y_train)
y_pred = gini_model2.predict(X_test)
y_pred = pd.Series(y_pred)
gini_model2

DecisionTreeClassifier(max_depth=3, random_state=1234)

In [42]:
print("Model Entropy model - max depth 3")
print("Accuracy:", accuracy_score(y_test,y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_test,y_pred))
print('Precision score for "Approved"' , precision_score(y_test,y_pred, pos_label = 1))
print('Precision score for "Disapproved"' , precision_score(y_test,y_pred, pos_label = 0))
print('Recall score for "Approved"' , recall_score(y_test,y_pred, pos_label = 1))
print('Recall score for "Disapproved"' , recall_score(y_test,y_pred, pos_label = 0))

Model Entropy model - max depth 3
Accuracy: 0.8454106280193237
Balanced accuracy: 0.8502811621368322
Precision score for "Approved" 0.782608695652174
Precision score for "Disapproved" 0.9239130434782609
Recall score for "Approved" 0.9278350515463918
Recall score for "Disapproved" 0.7727272727272727


 - Use the default arguments when building the models
 - features importance plots (use guided capstone)
 - Finish the models and time series.