In [34]:
# import packages
import dmba
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
import matplotlib.pylab as plt

In [35]:
# load dataset
df = dmba.load_data('UniversalBank.csv')

In [36]:
df.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [37]:
# drop unnecessary columns
df = df.drop(['ID', 'ZIP Code'], axis = 1)

In [38]:
df.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [39]:
# partition the data
trainData, validData = train_test_split(df, test_size = 0.4, random_state = 1)

In [40]:
scaler = preprocessing.StandardScaler()

In [41]:
scaler.fit(trainData[['Age', 'Experience','Income','Family','CCAvg','Education','Mortgage','Securities Account','CD Account','Online','CreditCard']])

StandardScaler()

In [42]:
loanNorm = pd.concat([pd.DataFrame(scaler.transform(df[['Age', 'Experience','Income','Family','CCAvg','Education','Mortgage','Securities Account','CD Account','Online','CreditCard']]),
                                  columns = ['zAge', 'zExperience','zIncome','zFamily','zCCAvg','zEducation','zMortgage','zSecurities Account','zCD Account','zOnline','zCreditCard']),
                     df[['Personal Loan']]], axis = 1)

In [43]:
trainNorm = loanNorm.iloc[trainData.index]
validNorm = loanNorm.iloc[validData.index]

In [44]:
train_X = trainNorm[['zAge', 'zExperience','zIncome','zFamily','zCCAvg','zEducation','zMortgage','zSecurities Account','zCD Account','zOnline','zCreditCard']]
train_Y = trainNorm[['Personal Loan']]
valid_X = validNorm[['zAge', 'zExperience','zIncome','zFamily','zCCAvg','zEducation','zMortgage','zSecurities Account','zCD Account','zOnline','zCreditCard']]
valid_Y = validNorm[['Personal Loan']]

In [45]:
# finding best k
results = []
for k in [1,3,5,7,9,11,13,15,17,19]:
    knn = KNeighborsClassifier(n_neighbors = k).fit(train_X, train_Y)
    results.append({
        'k': k,
        'accuracy': accuracy_score(valid_Y, knn.predict(valid_X))
    })
    
results = pd.DataFrame(results)
results

# Q1: k = 5 seems to be the best choice here! However, k = 1, 3, 7, 9 also return similar accuracy within less than a percent difference

  knn = KNeighborsClassifier(n_neighbors = k).fit(train_X, train_Y)
  knn = KNeighborsClassifier(n_neighbors = k).fit(train_X, train_Y)
  knn = KNeighborsClassifier(n_neighbors = k).fit(train_X, train_Y)
  knn = KNeighborsClassifier(n_neighbors = k).fit(train_X, train_Y)
  knn = KNeighborsClassifier(n_neighbors = k).fit(train_X, train_Y)
  knn = KNeighborsClassifier(n_neighbors = k).fit(train_X, train_Y)
  knn = KNeighborsClassifier(n_neighbors = k).fit(train_X, train_Y)
  knn = KNeighborsClassifier(n_neighbors = k).fit(train_X, train_Y)
  knn = KNeighborsClassifier(n_neighbors = k).fit(train_X, train_Y)
  knn = KNeighborsClassifier(n_neighbors = k).fit(train_X, train_Y)


Unnamed: 0,k,accuracy
0,1,0.9555
1,3,0.9545
2,5,0.9575
3,7,0.9565
4,9,0.952
5,11,0.947
6,13,0.945
7,15,0.9445
8,17,0.942
9,19,0.9425


In [46]:
loan_X = loanNorm[['zAge', 'zExperience','zIncome','zFamily','zCCAvg','zEducation','zMortgage','zSecurities Account','zCD Account','zOnline','zCreditCard']]
loan_Y = loanNorm[['Personal Loan']]

In [48]:
knn = KNeighborsClassifier(n_neighbors = 5).fit(loan_X, loan_Y)

  knn = KNeighborsClassifier(n_neighbors = 5).fit(loan_X, loan_Y)


In [50]:
from dmba import classificationSummary
classificationSummary(valid_Y, knn.predict(valid_X))

Confusion Matrix (Accuracy 0.9695)

       Prediction
Actual    0    1
     0 1805    2
     1   59  134


In [51]:
# Q2 Since the important class in this case is those that accept the loan, the sensitivity would be calculated by dividing those that were correctly predicted to accept the loan over the total count of those that accepted the loan
# This gives us 134/(134 + 59) = 69.4%
# Specificity is calculated in the same way but for those that did not accept the loan instead
# This gives us 1805/(1805 + 2) = 99.9%

In [52]:
newCustomer = pd.DataFrame([{'Age':40, 'Experience':10, 'Income':84, 'Family':2, 'CCAvg':2, 'Education':2, 'Mortgage':0, 'Securities Account':0, 'CD Account':0, 'Online':1, 'CreditCard':1}])

In [53]:
newCustomerNorm = pd.DataFrame(scaler.transform(newCustomer),
                              columns = ['zAge', 'zExperience','zIncome','zFamily','zCCAvg','zEducation','zMortgage','zSecurities Account','zCD Account','zOnline','zCreditCard'])

In [54]:
distances, indices = knn.kneighbors(newCustomerNorm)

In [56]:
print(knn.predict(newCustomerNorm))
print('Distances', distances)
print('Indices', indices)
print(loanNorm.iloc[indices[0], :])
# Q3 under this model, the new customer is predicted to not accept a personal loan

[0]
Distances [[0.47859833 0.49507362 0.63219765 0.70542183 0.83571448]]
Indices [[4034 4407 3398 1630 4127]]
          zAge  zExperience   zIncome   zFamily    zCCAvg  zEducation  \
4034 -0.922251    -0.813928  0.177728 -0.352127 -0.136574    0.129806   
4407 -0.747929    -0.639658 -0.059674 -0.352127 -0.136574    0.129806   
3398 -0.486446    -0.552523 -0.253912 -0.352127  0.265373    0.129806   
1630 -0.399285    -0.291118  0.544622 -0.352127 -0.079153    0.129806   
4127 -0.224963    -0.116848  0.177728 -0.352127 -0.079153    0.129806   

      zMortgage  zSecurities Account  zCD Account  zOnline  zCreditCard  \
4034  -0.559242            -0.337025    -0.252646  0.83419      1.53728   
4407  -0.559242            -0.337025    -0.252646  0.83419      1.53728   
3398  -0.559242            -0.337025    -0.252646  0.83419      1.53728   
1630  -0.559242            -0.337025    -0.252646  0.83419      1.53728   
4127  -0.559242            -0.337025    -0.252646  0.83419      1.53728   

