In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import scale
import pandas as pd
from numpy import *
from scipy.stats import norm
from multiprocessing import Pool

In [2]:
random.seed(123456)
df = pd.read_csv('creditcard.csv')
y = df["Class"].values
df2 = df.drop("Class", axis=1).values
df2= scale( df2, axis=0, with_mean=True, with_std=True, copy=True )

In [3]:
def Naive_Bayes(xtrain, xtest, ytrain, ytest):
    clf = GaussianNB()
    y_pred = clf.fit(xtrain, ytrain).predict(xtest)
    return(f1_score(ytest, y_pred,average='binary'))

In [4]:
def Bayesian_Probit(xtrain, xtest, ytrain, ytest):  
    Y =ytrain
    K = len(xtrain[0,:]) +1
    N = len(Y)
    Z=Y
    X = ones((N,K))
    X[:,1:K] = xtrain
    # inverse of prior variance for beta
    nrep = 10000
    burnin = 1000
    # inverse of prior variance for beta
    B0 = identity(K)*.01
    xi = mean(X, axis = 0)
    XX = matmul(transpose(X),X)
    XX_inv = linalg.inv(XX)
    pos = Y>0
    neg = Y<=0
    no_pos = sum(pos)
    no_neg = sum(neg)
    tau = 1
    sig = linalg.inv(XX+B0)
    beta = zeros((nrep,K))

    # Gibb sampling
    for i in range(1,nrep):
        b_mean = matmul(sig, tau*matmul(transpose(X),Z))
        beta[i,:] = random.multivariate_normal(b_mean, sig)
        mu = matmul(X,beta[i,:])
        # A and B are just some operations to simplify the operations pf Z
        A = ones(no_pos) - norm.cdf(-mu[pos])
        B = random.rand(no_pos)
        Z[pos] = norm.ppf(multiply(A,B) +norm.cdf(-mu[pos])) +mu[pos]
        A = random.rand(no_neg)
        B = norm.cdf(-mu[neg])
        Z[neg] = norm.ppf(multiply(A,B))+mu[neg]
        
    b = mean(beta[burnin:nrep,:], axis = 0)
    bayes_xtest = ones((len(ytest),K))
    bayes_xtest[:,1:K] = xtest
    z_pred = matmul(bayes_xtest,b)
    y_pred2 = (z_pred>0).astype(float)
    return(f1_score(ytest, y_pred2,average='binary'))

In [5]:
# I assigned each core 10 samples
def worker(random_state):
    for i in range(10):
        xtrain, xtest, ytrain, ytest = train_test_split(df2, y, 
                                                test_size = 0.2, random_state=(random_state+i))
        return(Naive_Bayes(xtrain, xtest, ytrain, ytest), Bayesian_Probit(xtrain, xtest, ytrain, ytest))

In [6]:
if __name__ == '__main__':
    p = Pool(12)
    result = []
    random_state = random.randint(100, size=10)
    result.append(p.map(worker, random_state))
    print("F1 scores for Naive Bayes and Bayesian Probit are ", mean(result, axis = 1), " respectively.")

F1 scores for Naive Bayes and Bayesian Probit are  [[ 0.12277299  0.55810718]]  respectively.
