In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, mean_squared_error

In [2]:
%matplotlib inline

## Part 1: Data exploration and preprocessing

In [3]:


bank_df = pd.read_csv('UniversalBank_unprocessed.csv')
bank_df.head()

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,CD Account,Online,CreditCard,Personal Loan
0,1,39,13.0,58.0,3,2.1,Undergraduate,169,0,1,0,0
1,2,51,25.0,18.0,1,0.3,Advanced,93,0,0,1,0
2,3,43,13.0,38.0,3,2.0,Advanced,0,0,1,0,0
3,4,37,12.0,60.0,4,2.1,Advanced,217,0,1,0,0
4,5,23,,149.0,1,6.33,Undergraduate,305,0,0,1,0


In [4]:


predictors_df = bank_df[['ID', 'Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage', 'CD Account', 'Online', 'CreditCard']]
response_df = bank_df['Personal Loan']

predictors_df.shape, response_df.shape

((1117, 11), (1117,))

In [5]:

predictors_df.dtypes, response_df.dtypes

(ID              int64
 Age             int64
 Experience    float64
 Income        float64
 Family          int64
 CCAvg         float64
 Education      object
 Mortgage        int64
 CD Account      int64
 Online          int64
 CreditCard      int64
 dtype: object,
 dtype('int64'))

In [6]:


predictors_df['Education'].value_counts()

Undergraduate    389
Advanced         383
Masters          345
Name: Education, dtype: int64

In [7]:


predictors_df = pd.get_dummies(predictors_df, drop_first = True)
predictors_df.head()

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Education_Masters,Education_Undergraduate
0,1,39,13.0,58.0,3,2.1,169,0,1,0,0,1
1,2,51,25.0,18.0,1,0.3,93,0,0,1,0,0
2,3,43,13.0,38.0,3,2.0,0,0,1,0,0,0
3,4,37,12.0,60.0,4,2.1,217,0,1,0,0,0
4,5,23,,149.0,1,6.33,305,0,0,1,0,1


In [8]:


response_df.value_counts()

0    637
1    480
Name: Personal Loan, dtype: int64

In [9]:
# check for null values

predictors_df.isnull().sum()

ID                         0
Age                        0
Experience                 4
Income                     3
Family                     0
CCAvg                      0
Mortgage                   0
CD Account                 0
Online                     0
CreditCard                 0
Education_Masters          0
Education_Undergraduate    0
dtype: int64

In [10]:


imputer = KNNImputer(n_neighbors=5)
predictors_df = pd.DataFrame(imputer.fit_transform(predictors_df), columns = predictors_df.columns)
predictors_df

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Education_Masters,Education_Undergraduate
0,1.0,39.0,13.0,58.0,3.0,2.10,169.0,0.0,1.0,0.0,0.0,1.0
1,2.0,51.0,25.0,18.0,1.0,0.30,93.0,0.0,0.0,1.0,0.0,0.0
2,3.0,43.0,13.0,38.0,3.0,2.00,0.0,0.0,1.0,0.0,0.0,0.0
3,4.0,37.0,12.0,60.0,4.0,2.10,217.0,0.0,1.0,0.0,0.0,0.0
4,5.0,23.0,15.6,149.0,1.0,6.33,305.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1112,1113.0,37.0,12.0,123.0,4.0,3.10,253.0,1.0,1.0,1.0,1.0,0.0
1113,1114.0,37.0,13.0,158.0,2.0,2.30,0.0,1.0,1.0,1.0,1.0,0.0
1114,1115.0,53.0,29.0,120.0,4.0,2.70,111.0,1.0,1.0,0.0,1.0,0.0
1115,1116.0,26.0,0.0,179.0,4.0,2.10,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# correlation between predictors

predictors_df.corr()

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Education_Masters,Education_Undergraduate
ID,1.0,-0.006488,-0.009373,0.518479,0.05014,0.338873,0.172488,0.250825,0.015341,-0.002207,0.079664,-0.209022
Age,-0.006488,1.0,0.991671,-0.044972,-0.046942,-0.036068,0.020851,0.024537,0.034029,0.024464,-0.02332,-0.011692
Experience,-0.009373,0.991671,1.0,-0.042488,-0.054421,-0.039782,0.025669,0.025503,0.02884,0.033727,-0.025966,0.010661
Income,0.518479,-0.044972,-0.042488,1.0,-0.047878,0.629477,0.250952,0.258391,0.016562,-0.015018,0.015076,-0.060239
Family,0.05014,-0.046942,-0.054421,-0.047878,1.0,-0.012103,0.029502,0.025519,0.024708,0.012454,0.065431,-0.020269
CCAvg,0.338873,-0.036068,-0.039782,0.629477,-0.012103,1.0,0.131017,0.197364,0.011636,0.00341,0.010219,-0.038577
Mortgage,0.172488,0.020851,0.025669,0.250952,0.029502,0.131017,1.0,0.14219,0.00424,0.037236,-0.024447,0.006359
CD Account,0.250825,0.024537,0.025503,0.258391,0.025519,0.197364,0.14219,1.0,0.26287,0.377198,0.063891,-0.088806
Online,0.015341,0.034029,0.02884,0.016562,0.024708,0.011636,0.00424,0.26287,1.0,0.002696,0.09279,-0.046164
CreditCard,-0.002207,0.024464,0.033727,-0.015018,0.012454,0.00341,0.037236,0.377198,0.002696,1.0,0.028476,-0.054577


In [12]:
# drop predictors
# here we are dropping Experience, as experience is highly correlated with the age.

predictors_df = predictors_df.drop(['ID', 'Experience'], axis = 1)
predictors_df.head()

Unnamed: 0,Age,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Education_Masters,Education_Undergraduate
0,39.0,58.0,3.0,2.1,169.0,0.0,1.0,0.0,0.0,1.0
1,51.0,18.0,1.0,0.3,93.0,0.0,0.0,1.0,0.0,0.0
2,43.0,38.0,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0
3,37.0,60.0,4.0,2.1,217.0,0.0,1.0,0.0,0.0,0.0
4,23.0,149.0,1.0,6.33,305.0,0.0,0.0,1.0,0.0,1.0


## Part 2: $k$-NN

In [13]:
# partitioning data into train and test sets

X_classifier = predictors_df
y_classifier = response_df
train_X_classifier, test_X_classifier, train_y_classifier, test_y_classifier = train_test_split(X_classifier, 
                                                            y_classifier, test_size=0.3, random_state=61)

In [14]:
# normalizing predictors in auctions data using standardization

z_score_norm1 = preprocessing.StandardScaler()
z_score_norm1.fit(train_X_classifier)
train_X_classifier = pd.DataFrame(z_score_norm1.transform(train_X_classifier), 
                                          columns = predictors_df.columns)
test_X_classifier = pd.DataFrame(z_score_norm1.transform(test_X_classifier), 
                                          columns = predictors_df.columns)

test_X_classifier

Unnamed: 0,Age,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Education_Masters,Education_Undergraduate
0,-1.510105,-1.223055,0.430130,-0.778844,-0.559351,-0.423979,-1.249262,-0.640115,1.444630,-0.696269
1,0.935679,-0.454431,0.430130,-0.492155,-0.559351,-0.423979,0.800473,1.562219,-0.692219,-0.696269
2,-0.461912,-1.076650,-0.433447,-0.205466,0.295048,-0.423979,0.800473,-0.640115,-0.692219,-0.696269
3,-0.112514,-1.003448,0.430130,-0.301029,0.877955,2.358607,0.800473,1.562219,-0.692219,-0.696269
4,-1.248056,-0.491032,0.430130,-0.492155,-0.559351,-0.423979,0.800473,-0.640115,-0.692219,1.436226
...,...,...,...,...,...,...,...,...,...,...
331,0.498932,-1.589067,0.430130,-1.065533,-0.559351,2.358607,0.800473,1.562219,-0.692219,1.436226
332,-1.073358,0.387396,-1.297024,0.272348,-0.559351,-0.423979,0.800473,-0.640115,-0.692219,-0.696269
333,0.324233,1.137719,0.430130,1.849137,3.313392,2.358607,-1.249262,-0.640115,1.444630,-0.696269
334,1.634474,0.076286,-1.297024,-0.874407,-0.559351,-0.423979,0.800473,1.562219,-0.692219,-0.696269


In [15]:
# training the k-NN model and look at performance on training data

knn = KNeighborsClassifier(n_neighbors=5).fit(train_X_classifier, train_y_classifier)
predicted_y_training = knn.predict(train_X_classifier)

print("Root Mean Squared Error (RMSE): ", round(mean_squared_error(train_y_classifier, predicted_y_training) ** 0.5, 4))
print("F1 Score: ", f1_score(train_y_classifier, predicted_y_training))

Root Mean Squared Error (RMSE):  0.2319
F1 Score:  0.9373134328358209


In [16]:
# performance of k-NN on test data

predicted_y_test = knn.predict(test_X_classifier)

print("Root Mean Squared Error (RMSE): ", round(mean_squared_error(test_y_classifier, predicted_y_test) ** 0.5, 4))
print("F1 Score: ",f1_score(test_y_classifier, predicted_y_test))

Root Mean Squared Error (RMSE):  0.2559
F1 Score:  0.9185185185185185


In [17]:
# training a classifier for different values of k

results = []
for k in range(1, 20):
    knn2 = KNeighborsClassifier(n_neighbors=k).fit(train_X_classifier, train_y_classifier)
    results.append({
        'k': k,
        'f1_score': f1_score(test_y_classifier, knn2.predict(test_X_classifier))
    })

# Convert results to a pandas data frame
results = pd.DataFrame(results)
print(results)

     k  f1_score
0    1  0.907749
1    2  0.870968
2    3  0.912409
3    4  0.895753
4    5  0.918519
5    6  0.909091
6    7  0.909091
7    8  0.917910
8    9  0.918519
9   10  0.920755
10  11  0.933333
11  12  0.920755
12  13  0.933333
13  14  0.920755
14  15  0.929368
15  16  0.913208
16  17  0.929368
17  18  0.921348
18  19  0.925373


## Part 3: Logistic regression and model comparison

In [18]:
# training the LR model and look at performance on train data

logistic_model = LogisticRegression()
logistic_model = logistic_model.fit(train_X_classifier, train_y_classifier)
predicted_y_training2 = logistic_model.predict(train_X_classifier)

print("Root Mean Squared Error (RMSE): ", round(mean_squared_error(train_y_classifier, predicted_y_training2) ** 0.5, 4))
print("F1 Score: ",f1_score(train_y_classifier, predicted_y_training2))

Root Mean Squared Error (RMSE):  0.3318
F1 Score:  0.8735294117647059


In [19]:
# performance of LR on test data

predicted_y_test2 = logistic_model.predict(test_X_classifier)

print("Root Mean Squared Error (RMSE): ", round(mean_squared_error(test_y_classifier, predicted_y_test2) ** 0.5, 4))
print("F1 Score: ",f1_score(test_y_classifier, predicted_y_test2))

Root Mean Squared Error (RMSE):  0.3181
F1 Score:  0.8794326241134751
