In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import scale
import pandas as pd
from numpy import *
from scipy.stats import norm

In [2]:
random.seed(123456)

df = pd.read_csv('creditcard.csv')
y = df["Class"].values
df2 = df.drop("Class", axis=1).values
df2 = scale( df2, axis=0, with_mean=True, with_std=True, copy=True )
xtrain, xtest, ytrain, ytest = train_test_split(df2, y, test_size = 0.2, random_state=1)

In [3]:
clf = GaussianNB()
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
print("Correct classification", sum(y_pred==ytest)/len(ytest)*100, "%")

Correct classification 97.8266212563 %


In [4]:
print("F1 score: ", f1_score(ytest, y_pred,average='binary'))

F1 score:  0.0950292397661


In [5]:
# with tau = 1
Y =ytrain
K = len(xtrain[0,:]) +1
N = len(Y)
Z=Y
X = ones((N,K))
X[:,1:K] = xtrain
# inverse of prior variance for beta
nrep = 10000
burnin = 1000
# inverse of prior variance for beta
B0 = identity(K)*.01
xi = mean(X, axis = 0)
XX = matmul(transpose(X),X)
XX_inv = linalg.inv(XX)
pos = Y>0
neg = Y<=0
no_pos = sum(pos)
no_neg = sum(neg)
tau = 1
sig = linalg.inv(XX+B0)
beta = zeros((nrep,K))

# Gibb sampling
for i in range(1,nrep):
    b_mean = matmul(sig, tau*matmul(transpose(X),Z))
    beta[i,:] = random.multivariate_normal(b_mean, sig)
    mu = matmul(X,beta[i,:])
    # A and B are just some operations to simplify the operations pf Z
    A = ones(no_pos) - norm.cdf(-mu[pos])
    B = random.rand(no_pos)
    Z[pos] = norm.ppf(multiply(A,B) +norm.cdf(-mu[pos])) +mu[pos]
    A = random.rand(no_neg)
    B = norm.cdf(-mu[neg])
    Z[neg] = norm.ppf(multiply(A,B))+mu[neg]

In [6]:
b = mean(beta[burnin:nrep,:], axis = 0)
bayes_xtest = ones((len(ytest),K))
bayes_xtest[:,1:K] = xtest
z_pred = matmul(bayes_xtest,b)
y_pred2 = (z_pred>0).astype(float)
print("Correct classification", sum(y_pred2== ytest)/len(ytest)*100, "%")

Correct classification 99.8911555072 %


In [7]:
print("F1 score: ", f1_score(ytest, y_pred2,average='binary'))

F1 score:  0.515625
