# Gaussian Bayes Classifier

### Mastery Checkpoint 1  
CSC 466, Winter 2022  
Samay Nathani

In [11]:
from pathlib import Path
home = str(Path.home())

In [12]:
%load_ext autoreload
%autoreload 2

import Lab2_helper

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Import our dataset  
I chose the [Palantir Stock Price Dataset from Kaggle](https://www.kaggle.com/kalilurrahman/palantir-stock-data-latest-and-updated)

In [13]:
import pandas as pd
import numpy as np
import math
from scipy.stats import norm
palantir = pd.read_csv(
    f"Palantir_stock_history.csv"
)

Do some preprocessing like Lab 2

In [44]:
features = ['Open', 'High', 'Low', 'Close']
palantir = palantir.loc[:,features]
palantir['Open'] =  round(palantir['Open'])
palantir['High'] = round(palantir['High'])
palantir['Low'] =  round(palantir['Low'])
palantir['Close'] = round(palantir['Close'])
palantir.head()

Unnamed: 0,Open,High,Low,Close
0,10.0,11.0,9.0,10.0
1,10.0,10.0,9.0,9.0
2,9.0,9.0,9.0,9.0
3,9.0,9.0,9.0,9.0
4,9.0,10.0,9.0,10.0


Defining the Gaussian Bayesian Classifier

In [45]:
class GaussianBayes:
    
    def compute_priors(self, y):
        value_counts = y.value_counts().sort_index()
        total_values = len(y)
        indexes = [y.name+"="+str(v) for v in value_counts.index.tolist()]
        priors = dict(zip(indexes, value_counts / total_values))
        return priors
    
    def compute_gaussian(self, mean, stddev, x):
        if stddev == 0:
            return 1
        exp = (((x-mean) / stddev)**2) * -0.5
        denom = ((stddev) * math.sqrt(2 * math.pi))
        return (math.e ** exp) / denom
    
    
    # Since we are assuming a normal distribution, this is where we use the mean, std, and variance to calculate the class conditional probability. 
    def specific_class_conditional(self,x,xv,y,yv):

        likelihoods = {}
        priors = {}
        denom = 0
        
        for uy in y.unique():
            count = y.value_counts()[uy]
            if count <= 1:
                likelihoods[uy] = 0
            else:
                likelihoods[uy] = self.compute_gaussian(x.loc[y==uy].mean(), x.loc[y==uy].std(), xv)
            
            priors[uy] = count / sum(y.value_counts())
            denom += (likelihoods[uy] * priors[uy])
        if denom != 0:
            return likelihoods[yv]*priors[yv] / denom
        else:
            return 0


    def class_conditional(self,X,y):
        probs = {}

        for eachy in y.unique():
            for col in X.columns:
                for eachx in X[col].unique():
                    probs[col + "=" + str(eachx) + "|" + y.name + "=" + str(eachy)] = self.specific_class_conditional(X[col], eachx, y, eachy)
        return probs

    def posteriors(self, probs,priors,x):
        post_probs = {}
        denom = 0
        for k in priors.keys():
            numerator = 1
            postkey = ""
            for idx in x.index:
                postkey += idx + "=" + str(x[idx]) + ","
                probkey = idx + "=" + str(x[idx]) + "|" + k
                if probkey not in probs:
                    numerator *= 0
                else:
                    numerator *= probs[probkey]
            numerator*=priors[k]
            post_probs[k + "|" + postkey[:-1]] = numerator
            denom+=numerator
        for k, v in post_probs.items():
            if denom != 0:
                post_probs[k] = v / denom
            else:
                post_probs[k] = 1 / len(list(priors.keys()))
        return post_probs
    
    
    def train_test_split(self, X,y,test_frac=0.5):
        inxs = list(range(len(y)))
        np.random.shuffle(inxs)
        X = X.iloc[inxs,:]
        y = y.iloc[inxs]
        xsplit = round(len(X)*test_frac)
        ysplit = round(len(y)*test_frac)
        Xtrain,ytrain,Xtest,ytest = X.iloc[:xsplit, :], y.iloc[:ysplit], X.iloc[xsplit:, :], y.iloc[ysplit:]
        return Xtrain,ytrain,Xtest,ytest

    def get_accuracy(self, Xtrain,ytrain,Xtest,ytest):
        probs = self.class_conditional(Xtrain, ytrain)
        priors = self.compute_priors(ytrain)
        ypred = []
        for idx in range(len(Xtest)):
            posterior = self.posteriors(probs, priors, Xtest.iloc[idx])
            k = max(posterior, key=posterior.get)
            pred = float((''.join(k.split("|")[0])).split("=")[-1])
            ypred.append(pred)
        ypred = np.array(ypred)
        correct = np.sum(ypred == ytest.to_numpy())
        accuracy = correct / len(ytest)
        return accuracy


In [46]:
gb = GaussianBayes()

In [47]:
X = palantir.drop("Close", axis=1)
y = palantir["Close"]

Accuracy of Gaussian-Bayes Classifier

In [49]:
np.random.seed(2)
Xtrain,ytrain,Xtest,ytest = gb.train_test_split(X,y)
accuracy = gb.get_accuracy(Xtrain,ytrain,Xtest,ytest)
accuracy

0.475177304964539

Accuracy of Lab 2 classifier

In [None]:
np.random.seed(2)
Xtrain,ytrain,Xtest,ytest = Lab2_helper.train_test_split(X,y)
accuracy = Lab2_helper.exercise_6(Xtrain,ytrain,Xtest,ytest)
accuracy