# Gaussian Bayesian Classifier

### Mastery Checkpoint 1  
CSC 466, Winter 2022  
Samay Nathani

In [2]:
from pathlib import Path

# Home directory of the current user, kept as a plain string for building paths.
home = str(Path("~").expanduser())

In [97]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (such as Lab2_helper.py) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Lab 2 helpers; train_test_split and exercise_6 are used for the baseline
# comparison at the end of the notebook.
import Lab2_helper

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Import our dataset  
I chose the [Airbnb Listing Dataset from Kaggle](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data/version/3)

In [115]:
import pandas as pd
import numpy as np
import math
from scipy.stats import norm
# Load the NYC Airbnb listings; the CSV is read from the current working directory.
airbnb = pd.read_csv("AB_NYC_2019.csv")

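Preprocess the data as in Lab 2: keep only the columns we need (`price`, `room_type`, `neighbourhood_group`) and round `price` down to the nearest multiple of 10 so that it becomes a discrete class label.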

In [125]:
# Columns used by the classifier: the target (price) and two categorical features.
features = ['price', 'room_type', 'neighbourhood_group']
# Width of the price bins; every price is rounded down to a multiple of this.
PRICE_BIN_WIDTH = 10

# Take an explicit copy of the column subset so the assignment below writes to
# an independent frame (no SettingWithCopyWarning / view-vs-copy ambiguity).
airbnb = airbnb.loc[:, features].copy()
airbnb['price'] = (airbnb['price'] // PRICE_BIN_WIDTH) * PRICE_BIN_WIDTH
display(airbnb.head())

Unnamed: 0,price,room_type,neighbourhood_group
0,140,Private room,Brooklyn
1,220,Entire home/apt,Manhattan
2,150,Private room,Manhattan
3,80,Entire home/apt,Brooklyn
4,80,Entire home/apt,Manhattan


Defining the Gaussian Bayesian Classifier

In [166]:
class GaussianBayes:
    """Bayes classifier for a numeric target and categorical features.

    For every feature column the target values are grouped by feature value
    and each group is summarised by a normal distribution (mean and sample
    standard deviation).  Bayes' rule turns those densities into
    ``P(feature=value | target=label)``; these are multiplied together with
    the label priors to score every candidate label.

    All probability tables are plain dicts keyed by strings such as
    ``"room_type=Private room|price=140"``.
    """

    # Lower bound applied to every standard deviation.  A group with a single
    # row has an undefined (NaN) sample std and a constant group has std 0;
    # without a floor both turn every downstream probability into NaN.
    MIN_STD = 1e-9

    def compute_priors(self, y):
        """Return the relative frequency of every label in ``y``.

        Parameters
        ----------
        y : pandas.Series
            Target column; its ``name`` becomes the key prefix.

        Returns
        -------
        dict
            ``"<y.name>=<label>" -> count / len(y)``, labels in sorted order.
        """
        counts = y.value_counts().sort_index()
        total = len(y)
        keys = [f"{y.name!s}={label!s}" for label in counts.index.tolist()]
        return dict(zip(keys, counts / total))

    def compute_normal_distribution(self, mean, stddev, x):
        """Evaluate the normal density with the given mean/stddev at ``x``.

        A ``stddev`` that is NaN or smaller than ``MIN_STD`` is replaced by
        ``MIN_STD`` so the result is always a finite, non-NaN number for
        finite ``mean`` and ``x``.
        """
        # np.fmax ignores NaN operands, so NaN and tiny/zero values both
        # collapse to the floor.
        stddev = np.fmax(stddev, self.MIN_STD)
        exponent = (((x - mean) / stddev) ** 2) * -0.5
        scale = stddev * math.sqrt(2 * math.pi)
        return (math.e ** exponent) / scale

    def _gaussian_groups(self, x, y):
        """Summarise ``y`` within each distinct value of feature ``x``.

        Returns a dict ``value -> (mean, std, prior)`` where ``prior`` is the
        share of rows carrying that feature value.
        """
        groups = {}
        total = len(x)
        for ux in x.unique():
            y_vals = y.loc[x == ux]
            if len(y_vals) == 0:
                # e.g. a missing value: NaN never compares equal to itself,
                # so it selects no rows and has no distribution to fit.
                continue
            groups[ux] = (y_vals.mean(), y_vals.std(), len(y_vals) / total)
        return groups

    def specific_class_conditional(self, x, xv, y, yv, groups=None):
        """Return ``P(x = xv | y = yv)`` under the per-group normal model.

        Parameters
        ----------
        x : pandas.Series
            One categorical feature column.
        xv : object
            Feature value whose probability is requested.
        y : pandas.Series
            Numeric target, aligned with ``x``.
        yv : number
            Target value to condition on.
        groups : dict, optional
            Pre-computed result of ``_gaussian_groups(x, y)``; lets callers
            that query many ``(xv, yv)`` pairs fit the groups only once.

        Returns
        -------
        float
            ``N(yv; group xv) * P(xv) / sum_u N(yv; group u) * P(u)``.
            ``0.0`` when ``xv`` was never observed in ``x``; the plain
            ``P(xv)`` when every density underflows to zero (``yv`` carries
            no information about the feature).
        """
        if groups is None:
            groups = self._gaussian_groups(x, y)
        if xv not in groups:
            return 0.0

        joint = {}
        for ux, (mean, stddev, prior) in groups.items():
            joint[ux] = prior * self.compute_normal_distribution(mean, stddev, yv)

        evidence = sum(joint.values())
        if not evidence > 0:
            # Zero (or NaN) evidence: fall back to the feature prior instead
            # of dividing by zero.
            return groups[xv][2]
        return joint[xv] / evidence

    def class_conditional(self, X, y):
        """Build the table of ``P(feature=value | target=label)``.

        Returns a dict keyed ``"<col>=<value>|<y.name>=<label>"`` covering
        every label in ``y`` and every distinct value of every column of
        ``X``.
        """
        # The group statistics depend only on the column, not on the label or
        # value being queried, so fit them once per column.
        groups_by_col = {col: self._gaussian_groups(X[col], y) for col in X.columns}
        values_by_col = {col: X[col].unique() for col in X.columns}

        probs = {}
        for eachy in y.unique():
            for col in X.columns:
                for eachx in values_by_col[col]:
                    key = f"{col!s}={eachx!s}|{y.name!s}={eachy!s}"
                    probs[key] = self.specific_class_conditional(
                        X[col], eachx, y, eachy, groups=groups_by_col[col]
                    )
        return probs

    def posteriors(self, probs, priors, x):
        """Return the normalised posterior of every label for one row ``x``.

        Parameters
        ----------
        probs : dict
            Table produced by ``class_conditional``.
        priors : dict
            Table produced by ``compute_priors``.
        x : pandas.Series
            One observation, indexed by feature name.

        Returns
        -------
        dict
            ``"<label key>|<f1>=<v1>,<f2>=<v2>,..." -> probability``.  A
            feature value missing from ``probs`` contributes probability 0.
            If no label has positive mass the result is uniform.
        """
        # The "feature=value" terms are the same for every label.
        terms = [f"{idx!s}={x[idx]!s}" for idx in x.index]
        observed = ",".join(terms)

        scores = {}
        denom = 0
        for k, prior in priors.items():
            numerator = 1
            for term in terms:
                numerator *= probs.get(f"{term}|{k}", 0)
            numerator *= prior
            scores[f"{k}|{observed}"] = numerator
            denom += numerator

        if denom > 0:
            return {key: value / denom for key, value in scores.items()}
        uniform = 1 / len(priors) if priors else 0
        return {key: uniform for key in scores}

    def train_test_split(self, X, y, test_frac=0.5):
        """Shuffle the rows and split them into train and test parts.

        Parameters
        ----------
        X : pandas.DataFrame
        y : pandas.Series
        test_frac : float, default 0.5
            Share of rows placed in the *test* part; must lie in ``[0, 1]``.

        Returns
        -------
        tuple
            ``(Xtrain, ytrain, Xtest, ytest)``.

        Raises
        ------
        ValueError
            If ``test_frac`` is outside ``[0, 1]``.

        Notes
        -----
        Uses NumPy's global random state, so call ``np.random.seed`` first
        for a repeatable split.
        """
        if not 0 <= test_frac <= 1:
            raise ValueError(f"test_frac must be between 0 and 1, got {test_frac!r}")

        inxs = list(range(len(y)))
        np.random.shuffle(inxs)
        X = X.iloc[inxs, :]
        y = y.iloc[inxs]

        # Size the training part from 1 - test_frac so that test_frac really
        # is the held-out share.
        n_train = round(len(y) * (1 - test_frac))
        return X.iloc[:n_train, :], y.iloc[:n_train], X.iloc[n_train:, :], y.iloc[n_train:]

    def get_accuracy(self, Xtrain, ytrain, Xtest, ytest):
        """Fit on the training part and return the accuracy on the test part.

        The predicted label for a row is the one with the highest posterior
        (first one wins on ties); accuracy is the share of exact matches.
        Returns ``nan`` when the test part is empty.
        """
        if len(ytest) == 0:
            return float("nan")

        probs = self.class_conditional(Xtrain, ytrain)
        priors = self.compute_priors(ytrain)

        ypred = []
        for row in range(len(Xtest)):
            posterior = self.posteriors(probs, priors, Xtest.iloc[row])
            best = max(posterior, key=posterior.get)
            # Posterior keys look like "<name>=<label>|<features>"; recover
            # the numeric label from the part before the first "|".
            label_key = best.split("|", 1)[0]
            ypred.append(float(label_key.split("=")[-1]))

        ypred = np.array(ypred)
        correct = np.sum(ypred == ytest.to_numpy())
        return correct / len(ytest)


In [167]:
# Classifier instance shared by the cells below; the class keeps no fitted
# state, every method works only on the arguments it is given.
gb = GaussianBayes()

In [175]:
# Use only the first tenth of the listings (taken in file order, not sampled):
# X holds the categorical features, y the binned price to predict.
subset_rows = len(airbnb) // 10
X = airbnb.drop(columns="price").iloc[:subset_rows]
y = airbnb["price"].iloc[:subset_rows]

In [176]:
#fruits = pd.read_csv(f'{home}/csc-466-student/data/fruit_data_with_colours.csv')
#Xf = fruits.drop("width", axis=1)
#yf = fruits["width"]
#gb.class_conditional(Xf, yf)
#Xtrain,ytrain,Xtest,ytest = gb.train_test_split(Xf,yf)
#accuracy = gb.get_accuracy(Xtrain,ytrain,Xtest,ytest)
#accuracy

Accuracy of Gaussian-Bayes Classifier

In [177]:
# Seed NumPy's global RNG so the shuffle inside gb.train_test_split is repeatable.
# NOTE(review): the Lab 2 cell below seeds with 0, not 2 — if both splitters shuffle
# with the global RNG the two accuracies are measured on different splits.
np.random.seed(2)
Xtrain,ytrain,Xtest,ytest = gb.train_test_split(X,y)
# Share of test rows whose binned price is predicted exactly.
accuracy = gb.get_accuracy(Xtrain,ytrain,Xtest,ytest)
accuracy

0.09038854805725971

Accuracy of Lab 2 classifier

In [153]:
# Baseline: the same X / y scored with the Lab 2 helper functions.
# NOTE(review): seeded with 0 while the Gaussian cell seeds with 2 — presumably
# Lab2_helper.train_test_split also uses NumPy's global RNG, in which case the
# two classifiers are not compared on the same split; confirm and align seeds.
np.random.seed(0)
Xtrain,ytrain,Xtest,ytest = Lab2_helper.train_test_split(X,y)
# NOTE(review): the recorded result is exactly 0.0; verify that exercise_6
# returns an accuracy for this data before drawing conclusions from it.
accuracy = Lab2_helper.exercise_6(Xtrain,ytrain,Xtest,ytest)
accuracy

0.0