# Imports

In [1]:
import datetime
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load Data

In [2]:
# Read the cleaned credit-card application data produced by the prior notebook.
cc_apps = pd.read_csv('data_clean.csv')

# `read_csv` already returns a DataFrame, so re-wrapping it in `pd.DataFrame`
# is redundant and, by default, does not copy the underlying data. Take an
# explicit copy instead so that the in-place edits made to `df` further down
# can never leak back into the raw `cc_apps` frame.
df = cc_apps.copy()

In [3]:
# Preview the first five rows, transposed so that each column reads as a row.
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
male,b,a,a,b,b
age,30.83,58.67,24.5,27.83,20.17
debt,0.0,4.46,0.5,1.54,5.625
married,u,u,u,u,u
bank_customer,g,g,g,g,g
education_level,w,q,q,w,w
ethnicity,v,h,h,v,v
years_employed,1.25,3.04,1.5,3.75,1.71
prior_default,t,t,t,t,t
employed,t,t,f,t,f


The missing values were successfully handled in a prior notebook.

A small but essential amount of pre-processing remains before we can start building our machine learning model. The tasks are:

1. Convert the non-numeric data to numeric.
2. Split the data into test and training sets.
3. Scale the features to a uniform range.

We begin by converting all of the non-numeric data into numeric form. Many machine learning models require the data to be in a strictly numeric format, and this will also result in faster computations. We will use _label encoding_ to accomplish this task.

# Convert non-numeric data to numeric

In [4]:
# Label-encode every non-numeric (object-dtype) column into integer codes.
#
# A fresh LabelEncoder is fitted for each column and stored in
# `label_encoders`, keyed by column name. Re-fitting one shared encoder would
# throw away the category -> code mapping of every column except the last,
# making it impossible to decode the values later with
# `label_encoders[col].inverse_transform(...)`.
# (LabelEncoder is imported in the imports cell at the top of the notebook.)
label_encoders = {}

# The list of object columns is computed once, before any column is replaced.
for col in df.columns[df.dtypes == 'object']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Split the data into test and training sets

Now, we will split our data into train set and test set to prepare our data for two different phases of machine learning modeling: training and testing. 

Moreover, features like `drivers_license` and `zip_code` are not as important as the other features in the dataset for predicting credit card approvals. We should drop them to design our machine learning model with the best set of features.

In [5]:
# Discard the two features described above as less important, then switch
# from a DataFrame to a plain NumPy array.
df = df.drop(columns=['drivers_license', 'zip_code']).values

# The first 13 columns form the feature matrix; the last column is the target.
X = df[:, :13]
y = df[:, -1]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# (rows, columns) of the training and test feature matrices.
(X_train.shape, X_test.shape)

((483, 13), (207, 13))

In [7]:
# Lengths of the training and test target vectors.
(y_train.shape, y_test.shape)

((483,), (207,))

# Scale the features to a uniform range

The data is now split into two separate sets — train and test sets respectively. We are only left with one final pre-processing step of scaling before we can fit a machine learning model to the data.

Now, let’s try to understand what these scaled values mean in the real world, using `credit_score` as an example. A person’s credit score reflects their creditworthiness based on their credit history: the higher the number, the more financially trustworthy the person is considered to be. Because we are rescaling all the values to the range 0-1, a `credit_score` of 1 is the highest.

In [None]:
# Rescale every feature to the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn each feature's min/max from the training data only, then scale it.
X_train_scaled = scaler.fit_transform(X_train)

# Apply the *training* min/max to the test data. Calling `fit_transform` here
# would refit the scaler on the test set, so train and test would be scaled
# with different parameters and the evaluation would no longer reflect how
# the model behaves on unseen data. As a consequence, test values may fall
# slightly outside [0, 1]; that is expected.
X_test_scaled = scaler.transform(X_test)