# Gaussian Bayes Classifier

### Mastery Checkpoint 1  
CSC 466, Winter 2022  
Samay Nathani

In [1]:
from pathlib import Path
home = str(Path.home())

In [2]:
%load_ext autoreload
%autoreload 2

import Lab2_helper

Import our dataset  
I chose the [Palantir Stock Price Dataset from Kaggle](https://www.kaggle.com/kalilurrahman/palantir-stock-data-latest-and-updated)

In [3]:
import pandas as pd
import numpy as np
import math
from scipy.stats import norm
palantir = pd.read_csv(
    f"Palantir_stock_history.csv"
)

Do some preprocessing like Lab 2

In [4]:
features = ['Open', 'High', 'Low', 'Close']
palantir = palantir.loc[:,features]
palantir['Open'] =  round(palantir['Open'])
palantir['High'] = round(palantir['High'])
palantir['Low'] =  round(palantir['Low'])
palantir['Close'] = round(palantir['Close'])
palantir.head()

Unnamed: 0,Open,High,Low,Close
0,10.0,11.0,9.0,10.0
1,10.0,10.0,9.0,9.0
2,9.0,9.0,9.0,9.0
3,9.0,9.0,9.0,9.0
4,9.0,10.0,9.0,10.0


Defining the Gaussian Bayesian Classifier

In [13]:
class GaussianBayes:
    
    def compute_gaussian(self, mean, stddev, x):
        if stddev == 0:
            return 1
        exp = (((x-mean) / stddev)**2) * -0.5
        denom = ((stddev) * math.sqrt(2 * math.pi))
        return (math.e ** exp) / denom
    
    
    # Since we are assuming a normal distribution, this is where we use the mean, std, and variance to calculate the class conditional probability. 
    def specific_class_conditional(self,x,xv,y,yv):
        likelihoods = {}
        priors = {}
        denom = 0
        for uy in y.unique():
            count = y.value_counts()[uy]
            if count <= 1:
                likelihoods[uy] = 0
            else:
                likelihoods[uy] = self.compute_gaussian(x.loc[y==uy].mean(), x.loc[y==uy].std(), xv)
            
            priors[uy] = count / sum(y.value_counts())
            denom += (likelihoods[uy] * priors[uy])
        if denom != 0:
            return likelihoods[yv]*priors[yv] / denom
        else:
            return 0

    def class_conditional(self,X,y):
        probs = {}

        for eachy in y.unique():
            for col in X.columns:
                for eachx in X[col].unique():
                    probs[col + "=" + str(eachx) + "|" + y.name + "=" + str(eachy)] = self.specific_class_conditional(X[col], eachx, y, eachy)
        return probs

    def get_accuracy(self, Xtrain,ytrain,Xtest,ytest):
        probs = self.class_conditional(Xtrain, ytrain)
        priors = Lab2_helper.compute_priors(ytrain)
        ypred = []
        for idx in range(len(Xtest)):
            posterior = Lab2_helper.posteriors(probs, priors, Xtest.iloc[idx])
            k = max(posterior, key=posterior.get)
            pred = float((''.join(k.split("|")[0])).split("=")[-1])
            ypred.append(pred)
        ypred = np.array(ypred)
        correct = np.sum(ypred == ytest.to_numpy())
        accuracy = correct / len(ytest)
        return accuracy


In [14]:
gb = GaussianBayes()

In [15]:
X = palantir.drop("Close", axis=1)
y = palantir["Close"]

### Palantir Stock Dataset Accuracy

Accuracy of Gaussian-Bayes Classifier

In [23]:
np.random.seed(2)
Xtrain,ytrain,Xtest,ytest = Lab2_helper.train_test_split(X,y)
accuracy = gb.get_accuracy(Xtrain,ytrain,Xtest,ytest)
accuracy

0.475177304964539

Accuracy of Lab 2 classifier

In [28]:
Xtrain,ytrain,Xtest,ytest = Lab2_helper.train_test_split(X,y)
accuracy = Lab2_helper.exercise_6(Xtrain,ytrain,Xtest,ytest)
accuracy

0.46808510638297873

### Fruit Dataset Accuracy

Test our classifer and compare it to the Lab 2 classifier on the fruit dataset

In [29]:
fruits = pd.read_csv(f'{home}/csc-466-student/data/fruit_data_with_colours.csv')
fruit_features = ['fruit_label', 'mass', 'width', 'height', 'color_score']
fruits = fruits.loc[:, fruit_features]
Xf = fruits.drop("fruit_label", axis=1)
yf = fruits["fruit_label"]

Accuracy of Gaussian Bayes classifier on fruit dataset

In [30]:
Xftrain,yftrain,Xftest,yftest = Lab2_helper.train_test_split(Xf,yf)
faccuracy = gb.get_accuracy(Xftrain,yftrain,Xftest,yftest)
faccuracy

0.3448275862068966

Accuracy of Lab 2 classifier

In [31]:
Xftrain,yftrain,Xftest,yftest = Lab2_helper.train_test_split(Xf,yf)
faccuracy = Lab2_helper.exercise_6(Xftrain,yftrain,Xftest,yftest)
faccuracy

0.3448275862068966

### Permutation Feature Importance

Feature importance on Palantir Stock dataset with Lab 2 classifier

In [35]:
Xtrain,ytrain,Xtest,ytest = Lab2_helper.train_test_split(X[:len(X) // 10], y[:len(y) // 10])
Lab2_helper.test_based_feature_importance(Xtrain,ytrain,Xtest,ytest)

{'Open': 0.02857142857142857,
 'High': 0.22142857142857145,
 'Low': 0.06428571428571431}

In [36]:
Lab2_helper.train_based_feature_importance(Xtrain,ytrain,Xtest,ytest)

{'Open': -0.0392857142857143,
 'High': 0.0642857142857143,
 'Low': 0.15714285714285717}

Feature importance on Palantir Stock data with Gaussian Bayes classifier

feature importance on fruit dataset with Lab 2 classifier

In [None]:
np.random.seed(1)
Xftrain,yftrain,Xftest,yftest = Lab2_helper.train_test_split(Xf[:len(Xf) // 2],yf[:len(yf) // 2])
Lab2_helper.test_based_feature_importance(Xftrain,yftrain,Xftest,yftest)

In [None]:
Lab2_helper.train_based_feature_importance(Xftrain,yftrain,Xftest,yftest)