### NAIVE BAYES

#### Importing Modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#### Loading Dataset

In [2]:
# Raw string keeps the Windows backslashes literal: "\P" and "\C" are invalid
# escape sequences in an ordinary string literal.
# NOTE(review): absolute, machine-specific path - adjust to where the CSV lives.
df = pd.read_csv(r'F:\Project\Credit_card.csv')
df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,yes,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,yes,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5


#### Selection of feature attributes

In [3]:
# "expenditure" is left out: per the original analysis it is 0 for every row
# with card == "no", so the class-"no" mean and variance would both be 0, the
# "no" posterior would always be zero, and the prediction would be wrong.
# .copy() makes cd an independent frame instead of a slice of df, so modifying
# cd later does not raise SettingWithCopyWarning.
cd = df[['card', 'reports', 'age', 'share', 'income']].copy()
cd.head()

Unnamed: 0,card,reports,age,share,income
0,yes,0,37.66667,0.03327,4.52
1,yes,0,33.25,0.005217,2.42
2,yes,0,33.66667,0.004156,4.5
3,yes,0,30.5,0.065214,2.54
4,yes,0,32.16667,0.067051,9.7867


#### Encoding of Categorical Data

In [4]:
# Encode the target as the string labels '1' (yes) / '0' (no); the cells that
# follow compare against exactly these strings.
# assign() builds a new frame, so nothing is written in place into a slice of
# df (the cause of SettingWithCopyWarning, and a silent no-op when pandas
# copy-on-write is active).
cd = cd.assign(card=cd['card'].replace({'yes': '1', 'no': '0'}))
cd.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Unnamed: 0,card,reports,age,share,income
0,1,0,37.66667,0.03327,4.52
1,1,0,33.25,0.005217,2.42
2,1,0,33.66667,0.004156,4.5
3,1,0,30.5,0.065214,2.54
4,1,0,32.16667,0.067051,9.7867


#### Splitting of Dataset

In [5]:
# Split the data 7:3 into train:test; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    cd.drop('card', axis=1),
    cd['card'],
    test_size=0.3,
    random_state=10,
)

#### Observation of Train and Test dataset

In [6]:
# Row count of each partition.
print("No. of observation for training dataset: ", x_train.shape[0])
print("No. of observation for test dataset: ", x_test.shape[0])

No. of observation for training dataset:  923
No. of observation for test dataset:  396


In [7]:
# Attach the "card" label to the training features so the per-class mean and
# variance can be computed with a group-by. assign() aligns y_train on the row
# index and returns a new frame rather than writing into the frame handed back
# by train_test_split.
x_train = x_train.assign(card=y_train)

In [8]:
# Per-class mean of every feature: one row per "card" label.
x_train_means = x_train.groupby(by="card").mean()
x_train_means

Unnamed: 0_level_0,reports,age,share,income
card,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.62201,33.202552,0.000484,3.055508
1,0.121849,33.34127,0.091884,3.428652


In [9]:
# Per-class variance of every feature: one row per "card" label.
x_train_var = x_train.groupby(by="card").var()
x_train_var

Unnamed: 0_level_0,reports,age,share,income
card,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6.005475,100.162036,5.241269e-08,2.721236
1,0.163253,108.301849,0.01101195,3.004883


#### Counting

In [10]:
# Class frequencies among the training labels ('1' = yes, '0' = no).
card_train = x_train['card']
n_yes = (card_train == '1').sum()
n_no = (card_train == '0').sum()
total = card_train.count()
print('No. of yes = {},no = {} and total = {}'.format(n_yes, n_no, total))

No. of yes = 714,no = 209 and total = 923


#### Prior values

In [11]:
# Class priors: share of the training rows that carry each label.
p_yes, p_no = n_yes / total, n_no / total
print("Prior values are P(yes) = {} and p(no) = {}".format(p_yes, p_no))

Prior values are P(yes) = 0.7735644637053087 and p(no) = 0.22643553629469124


#### Mean

In [12]:
# Feature means for the "yes" class (card == "1"), read as scalars by row/column label.
yes_reports_mean = x_train_means.loc["1", "reports"]
yes_age_mean = x_train_means.loc["1", "age"]
yes_share_mean = x_train_means.loc["1", "share"]
yes_inc_mean = x_train_means.loc["1", "income"]
print(yes_reports_mean, yes_age_mean, yes_share_mean, yes_inc_mean)

0.12184873949579832 33.34126979915964 0.09188381567086842 3.4286516806722664


In [13]:
# Feature means for the "no" class (card == "0"), read as scalars by row/column label.
no_reports_mean = x_train_means.loc["0", "reports"]
no_age_mean = x_train_means.loc["0", "age"]
no_share_mean = x_train_means.loc["0", "share"]
no_inc_mean = x_train_means.loc["0", "income"]
print(no_reports_mean, no_age_mean, no_share_mean, no_inc_mean)

1.6220095693779903 33.20255186602873 0.0004839766985645932 3.055507655502396


#### Variance

In [14]:
# Feature variances for the "yes" class (card == "1"), read as scalars by row/column label.
yes_reports_var = x_train_var.loc["1", "reports"]
yes_age_var = x_train_var.loc["1", "age"]
yes_share_var = x_train_var.loc["1", "share"]
yes_inc_var = x_train_var.loc["1", "income"]
print(yes_reports_var, yes_age_var, yes_share_var, yes_inc_var)

0.16325267835044258 108.30184895461142 0.011011952281867491 3.0048826922622505


In [15]:
# Feature variances for the "no" class (card == "0"), read as scalars by row/column label.
no_reports_var = x_train_var.loc["0", "reports"]
no_age_var = x_train_var.loc["0", "age"]
no_share_var = x_train_var.loc["0", "share"]
no_inc_var = x_train_var.loc["0", "income"]
print(no_reports_var, no_age_var, no_share_var, no_inc_var)

6.005474788369527 100.16203565005569 5.2412691228567346e-08 2.721236012633421


#### Gaussian likelihood function

In [16]:
# Gaussian class-conditional likelihood p(x | y)
def p_x_given_y(x, mean_y, variance_y):
    """Return the normal density of ``x`` for a class with the given mean and variance.

    Evaluates ``exp(-(x - mean_y)**2 / (2 * variance_y)) / sqrt(2 * pi * variance_y)``.

    Parameters
    ----------
    x : number or NumPy array
        Feature value(s) to score.
    mean_y : number
        Mean of the feature within class ``y``.
    variance_y : number
        Variance of the feature within class ``y``; must be strictly positive.

    Returns
    -------
    The density value(s), as produced by the NumPy arithmetic.

    Raises
    ------
    ValueError
        If ``variance_y`` is zero or negative. Without this check the formula
        yields inf/NaN, and a NaN posterior compares as "not greater", which
        silently distorts the predicted class.
    """
    if np.any(np.asarray(variance_y) <= 0):
        raise ValueError(
            "variance_y must be strictly positive, got {!r}".format(variance_y)
        )
    # Normalising constant and exponent kept separate for readability; the
    # order of the floating-point operations matches the one-line formula.
    norm_const = 1 / np.sqrt(2 * np.pi * variance_y)
    exponent = -((x - mean_y) ** 2) / (2 * variance_y)
    return norm_const * np.exp(exponent)

In [17]:
# Empty frame (no rows) carrying only the four feature column names.
col = ['reports', 'age', 'share', 'income']
user1 = pd.DataFrame(columns=col)
user1

Unnamed: 0,reports,age,share,income


In [18]:
# Renumber the test rows 0..n-1 so they can be addressed by position.
# DataFrame.append was removed in pandas 2.0; reset_index(drop=True) produces
# the same renumbering directly and leaves the column dtypes untouched.
x_test = x_test.reset_index(drop=True)
x_test.head()

Unnamed: 0,reports,age,share,income
0,0,34.33333,0.474508,4.8
1,0,36.41667,0.011455,2.2
2,0,29.33333,0.000343,3.5
3,1,28.5,0.0006,2.0
4,1,20.91667,0.000457,2.625


In [19]:
# Empty list that will collect the predicted labels.
preds = list()
preds

[]

#### Calculation of Posterior values

In [20]:
# Classify every test row: score it under both classes with the naive Bayes
# product  prior * p(reports|c) * p(age|c) * p(share|c) * p(income|c)
# and keep the class with the larger score.
feature_cols = ['reports', 'age', 'share', 'income']
yes_params = [
    (yes_reports_mean, yes_reports_var),
    (yes_age_mean, yes_age_var),
    (yes_share_mean, yes_share_var),
    (yes_inc_mean, yes_inc_var),
]
no_params = [
    (no_reports_mean, no_reports_var),
    (no_age_mean, no_age_var),
    (no_share_mean, no_share_var),
    (no_inc_mean, no_inc_var),
]

# Start from an empty list so re-running this cell never appends a second
# batch of predictions (or fails because preds is already an array).
preds = []

# Iterate over the rows themselves instead of a hard-coded range(396): this
# handles any test-set size and does not rely on the index labels being 0..n-1.
for row in x_test[feature_cols].itertuples(index=False, name=None):
    yes_posterior = p_yes
    no_posterior = p_no
    # Factors are multiplied in column order, prior first.
    for value, (yes_mean, yes_var), (no_mean, no_var) in zip(row, yes_params, no_params):
        yes_posterior = yes_posterior * p_x_given_y(value, yes_mean, yes_var)
        no_posterior = no_posterior * p_x_given_y(value, no_mean, no_var)

    # '0' only when the "no" score is strictly larger; ties go to '1'.
    if no_posterior > yes_posterior:
        preds.append('0')
    else:
        preds.append('1')

#### Predicted Array

In [21]:
# Turn the list of predicted labels into a NumPy array.
preds = np.asarray(preds)
preds

array(['1', '1', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '0',
       '1', '1', '1', '1', '0', '1', '1', '1', '0', '0', '1', '1', '1',
       '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '0',
       '1', '1', '1', '0', '1', '0', '1', '1', '1', '0', '1', '1', '0',
       '1', '1', '1', '1', '1', '0', '0', '1', '1', '1', '1', '1', '1',
       '1', '1', '0', '0', '0', '1', '1', '0', '1', '1', '1', '1', '0',
       '0', '1', '1', '0', '1', '0', '0', '1', '1', '0', '1', '1', '1',
       '0', '0', '1', '1', '0', '1', '0', '1', '1', '1', '1', '1', '0',
       '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '0', '1', '1', '1', '1', '0', '1', '0', '1',
       '1', '1', '0', '0', '1', '1', '1', '0', '1', '1', '1', '0', '1',
       '1', '1', '0', '1', '0', '0', '0', '0', '1', '1', '1', '1', '1',
       '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '0', '1

#### Confusion Matrix

In [22]:
# Confusion matrix: actual labels on the rows, predicted labels on the columns.
pd.crosstab(
    y_test,
    preds,
    rownames=["Actual Result"],
    colnames=["Prediction Result"],
)

Prediction Result,0,1
Actual Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,87,0
1,9,300


#### Basic Evaluation Measures from the Confusion Matrix

In [23]:
#A confusion matrix is formed from the four outcomes produced by a binary classification.
#The four outcomes of classification:
#True positive (TP): correct positive prediction
#False positive (FP): incorrect positive prediction
#True negative (TN): correct negative prediction
#False negative (FN): incorrect negative prediction
#
#                              Predicted
#                              Positive    Negative
#       Observed  Positive       TP          FN
#                 Negative       FP          TN
#
#Note: the measures below read the crosstab in this same layout, i.e. label "0"
#(first row/column) is treated as the positive class and label "1" as the negative class.

In [24]:
#Error rate
#Error rate (ERR) is calculated as the number of all incorrect predictions divided by the total number of the dataset.
#The best error rate is 0.0, whereas the worst is 1.0.
#Error rate: (FP + FN)/(P + N)
#Counted from y_test/preds instead of typing the confusion-matrix cells by hand,
#so the value follows the data if the split or the model changes.
actual = np.asarray(y_test, dtype=str)
predicted = np.asarray(preds, dtype=str)
Err_rate = int(np.sum(actual != predicted)) / len(actual)
print("Error rate: {}".format(Err_rate) )

Error rate: 0.022727272727272728


In [25]:
#Accuracy
#Accuracy (ACC) is calculated as the number of all correct predictions divided by the total number of the dataset.
#The best accuracy is 1.0, whereas the worst is 0.0. It can also be calculated by 1 - ERR.
#Accuracy : (TP + TN)/(P + N)
#Counted from y_test/preds instead of typing the confusion-matrix cells by hand.
actual = np.asarray(y_test, dtype=str)
predicted = np.asarray(preds, dtype=str)
Acc = int(np.sum(actual == predicted)) / len(actual)
print("Accuracy : {}".format(Acc))

Accuracy : 0.9772727272727273


In [26]:
#Sensitivity (Recall or True positive rate)
#Sensitivity (SN) is calculated as the number of correct positive predictions divided by the total number of positives. It is also called recall (REC) or true positive rate (TPR).
#The best sensitivity is 1.0, whereas the worst is 0.0
#Sensitivity = (TP)/(P)
#Label '0' is the positive class here (first row of the confusion matrix).
#Counted from y_test/preds instead of typing the confusion-matrix cells by hand.
actual = np.asarray(y_test, dtype=str)
predicted = np.asarray(preds, dtype=str)
n_true_pos = int(np.sum((actual == '0') & (predicted == '0')))
n_pos = int(np.sum(actual == '0'))
Sens = n_true_pos / n_pos
print("Sensitivity: {}".format(Sens))

Sensitivity: 1.0


In [27]:
#Specificity (True negative rate)
#Specificity (SP) is calculated as the number of correct negative predictions divided by the total number of negatives.
#It is also called true negative rate (TNR).
#The best specificity is 1.0, whereas the worst is 0.0.
#Specificity = (TN)/(N)
#Label '1' is the negative class here (second row of the confusion matrix).
#Counted from y_test/preds instead of typing the confusion-matrix cells by hand.
actual = np.asarray(y_test, dtype=str)
predicted = np.asarray(preds, dtype=str)
n_true_neg = int(np.sum((actual == '1') & (predicted == '1')))
n_neg = int(np.sum(actual == '1'))
Spec = n_true_neg / n_neg
print("Specificity: {}".format(Spec))

Specificity: 0.970873786407767


In [28]:
#Precision (Positive predictive value)
#Precision (PREC) is calculated as the number of correct positive predictions divided by the total number of positive predictions.
#It is also called positive predictive value (PPV).
#The best precision is 1.0, whereas the worst is 0.0.
#Precision = (TP)/(TP + FP)
#Label '0' is the positive class here, so TP + FP is the number of rows predicted '0'.
#Counted from y_test/preds instead of typing the confusion-matrix cells by hand.
actual = np.asarray(y_test, dtype=str)
predicted = np.asarray(preds, dtype=str)
n_true_pos = int(np.sum((actual == '0') & (predicted == '0')))
n_pred_pos = int(np.sum(predicted == '0'))
Precs = n_true_pos / n_pred_pos
print("Precision: {}".format(Precs))

Precision: 0.90625


In [29]:
#False positive rate
#False positive rate (FPR) is calculated as the number of incorrect positive predictions divided by the total number of negatives.
#The best false positive rate is 0.0 whereas the worst is 1.0.
#It can also be calculated as 1 - specificity.
#FPR = (FP)/(N)
#Label '0' is the positive class here, so a false positive is an actual '1' predicted as '0'.
#Counted from y_test/preds instead of typing the confusion-matrix cells by hand.
actual = np.asarray(y_test, dtype=str)
predicted = np.asarray(preds, dtype=str)
n_false_pos = int(np.sum((actual == '1') & (predicted == '0')))
n_neg = int(np.sum(actual == '1'))
FPR = n_false_pos / n_neg
print("FPR : {}".format(FPR))

FPR : 0.02912621359223301
