In [2]:
# import packages
import dmba
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pylab as plt
from dmba import classificationSummary, gainsChart

In [3]:
# load dataset
# Read 'UniversalBank.csv' through dmba's load_data helper and bind the result to `df`,
# the frame that the later cells of this notebook filter, pivot and model on.
df = dmba.load_data('UniversalBank.csv')

In [4]:
# Preview the first rows of the freshly loaded data to see which columns are available.
df.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [5]:
# Drop unnecessary columns: keep only the outcome ('Personal Loan') and the two
# predictors ('Online', 'CreditCard') used in the rest of the analysis.
# `columns=` replaces the positional axis argument (`..., 1, ...`), which pandas
# deprecated in 1.3 and removed in 2.0. Rebinding `df` instead of using inplace=True
# keeps the cell safe to re-run.
columns_to_keep = ['Personal Loan', 'Online', 'CreditCard']
df = df.drop(columns=df.columns.difference(columns_to_keep))

In [6]:
# Preview again to confirm that only Personal Loan, Online and CreditCard remain.
df.head()

Unnamed: 0,Personal Loan,Online,CreditCard
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,1


In [7]:
# Treat the two 0/1 predictor columns as categorical rather than numeric
for column in ('Online', 'CreditCard'):
    df[column] = df[column].astype('category')

In [8]:
# Check dtypes: Online and CreditCard should now be reported as category.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Personal Loan  5000 non-null   int64   
 1   Online         5000 non-null   category
 2   CreditCard     5000 non-null   category
dtypes: category(2), int64(1)
memory usage: 49.1 KB


In [9]:
# Column roles for the model: one target, two categorical inputs
outcome = 'Personal Loan'
predictors = [
    'Online',
    'CreditCard',
]

In [10]:
# Design matrix: dummy-encode the predictor columns.
X = pd.get_dummies(df[predictors])
# Target as a categorical; use the `outcome` variable declared with `predictors`
# rather than repeating the column name, so the target is defined in one place.
y = df[outcome].astype('category')
# Class labels in category order; passed as class_names to classificationSummary.
classes = list(y.cat.categories)

In [11]:
# Split into training and validation partitions (40% held out for validation);
# the fixed random_state makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

In [12]:
# Customer counts with CreditCard and Personal Loan as row keys and Online as columns.
# pd.crosstab counts rows directly. pivot_table(aggfunc=len) had no value column left
# to aggregate here, because all three columns of df are used as keys, and it only
# produced counts on older pandas versions.
# NOTE(review): the counts cover the whole of df, not only the training partition.
pd.crosstab(index=[df['CreditCard'], df['Personal Loan']], columns=df['Online'])

Unnamed: 0_level_0,Online,0,1
CreditCard,Personal Loan,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,1300,1893
0,1,128,209
1,0,527,800
1,1,61,82


In [13]:
# Q2 From the pivot table, a customer who uses online banking and holds a credit card has an estimated probability of 82/(800+82) = 9.3% of accepting a personal loan offer.

In [14]:
# Customer counts with Personal Loan as rows and Online as columns.
# pd.crosstab is used instead of pivot_table(aggfunc=len): the latter silently counted
# the leftover CreditCard column, so this Online table was displayed under a
# "CreditCard" header, which makes the columns easy to misread.
pd.crosstab(index=df['Personal Loan'], columns=df['Online'])

Unnamed: 0_level_0,CreditCard,CreditCard
Online,0,1
Personal Loan,Unnamed: 1_level_2,Unnamed: 2_level_2
0,1827,2693
1,189,291


In [15]:
# Customer counts with Personal Loan as rows and CreditCard as columns.
# pd.crosstab is used instead of pivot_table(aggfunc=len): the latter silently counted
# the leftover Online column, so this CreditCard table was displayed under an
# "Online" header, which makes the columns easy to misread.
pd.crosstab(index=df['Personal Loan'], columns=df['CreditCard'])

Unnamed: 0_level_0,Online,Online
CreditCard,0,1
Personal Loan,Unnamed: 1_level_2,Unnamed: 2_level_2
0,3193,1327
1,337,143


In [16]:
# Q3 Among those that accepted the loan offer, 291/(189+291) = 60.6% were users of the online banking system and 143/(143 + 337) = 29.8% were already credit card users.
# Overall, (189 + 291)/(189 + 291 + 1827 + 2693) = 9.6% of people accepted the loan offer.
# Among those that did not accept the loan offer, 2693/(1827 + 2693) = 59.6% were users of the online banking system and 1327/(1327 + 3193) = 29.4% were already credit card users.
# Overall, (1827 + 2693)/(189 + 291 + 1827 + 2693) = 90.4% of people did not accept the loan offer.

In [17]:
# Q4 Using the formula [P(C_1)*P(x_1|C_1)*P(x_2|C_1)]/{[P(C_1)*P(x_1|C_1)*P(x_2|C_1)]+[P(C_2)*P(x_1|C_2)*P(x_2|C_2)]},
# where C_1 is Loan=1, C_2 is Loan=0, x_1 is Online=1 and x_2 is CreditCard=1,
# we get [(480/5000)(291/480)(143/480)]/{[(480/5000)(291/480)(143/480)]+[(4520/5000)(2693/4520)(1327/4520)]}
# Evaluating each term, this becomes (.096*.606*.298)/[(.096*.606*.298)+(.904*.596*.294)] = .0173/(.0173+.158)
# The Naive Bayes Probability for P(Loan=1|CC=1,Online=1) = .0988

In [18]:
# Q5 The value derived in Q4 (.0988) should be a better estimate for future data, even though the value in Q2 fits the training set perfectly.

In [19]:
# Train a multinomial naive Bayes classifier on the training partition.
# The bare fit(...) call stays last so the fitted estimator is echoed as cell output.
loan_nb = MultinomialNB(alpha=0.01)
loan_nb.fit(X_train, y_train)

MultinomialNB(alpha=0.01)

In [20]:
# Class-membership probabilities for the training and validation partitions
predProb_train, predProb_valid = (
    loan_nb.predict_proba(features) for features in (X_train, X_valid)
)

In [21]:
# Predicted class labels for the training and validation partitions
y_train_pred, y_valid_pred = (
    loan_nb.predict(features) for features in (X_train, X_valid)
)

In [22]:
# Confusion matrix and accuracy: training partition first, then validation
for actual, predicted in ((y_train, y_train_pred), (y_valid, y_valid_pred)):
    classificationSummary(actual, predicted, class_names=classes)

Confusion Matrix (Accuracy 0.9043)

       Prediction
Actual    0    1
     0 2713    0
     1  287    0
Confusion Matrix (Accuracy 0.9035)

       Prediction
Actual    0    1
     0 1807    0
     1  193    0


In [27]:
# Keep only the rows where both Online and CreditCard equal 1
uses_online = df['Online'] == 1
has_credit_card = df['CreditCard'] == 1
online_1_cc_1 = df.loc[uses_online & has_credit_card]

In [28]:
# Count the rows of the filtered subset per distinct (Personal Loan, Online, CreditCard) combination.
online_1_cc_1.value_counts()

Personal Loan  Online  CreditCard
0              1       1             800
1              1       1              82
dtype: int64

In [29]:
# Q6 Out of the data points with both credit cards and online banking usage, 82/882 accepted the loan offer. 
# This makes the estimated probability 9.3% for future data meeting the same conditions

In [None]:
# Convert string columns to categories.
# NOTE(review): 'Category', 'currency' and 'endDay' are not columns of the UniversalBank
# frame built above, so the unguarded assignments raised KeyError on a full re-run.
# This cell looks like a leftover from a different dataset; confirm whether it can be removed.
# Only columns that are actually present are converted.
for column in ('Category', 'currency', 'endDay'):
    if column in df.columns:
        df[column] = df[column].astype('category')