# Gaussian Bayes Classifier

### Mastery Checkpoint 1  
CSC 466, Winter 2022  
Samay Nathani

In [1]:
from pathlib import Path
# Current user's home directory as a plain string; it is interpolated into the
# fruit-dataset path in a later cell.
home = str(Path.home())

In [2]:
%load_ext autoreload
%autoreload 2

import Lab2_helper
import gaussian_helper

Import our dataset  
I chose the [Palantir Stock Price Dataset from Kaggle](https://www.kaggle.com/kalilurrahman/palantir-stock-data-latest-and-updated)

In [3]:
import pandas as pd
import numpy as np
import math
from scipy.stats import norm
# Load the Palantir price history. The path is relative, so it is resolved
# against the kernel's current working directory.
# (The original used an f-string with no placeholders; a plain literal is equivalent.)
palantir = pd.read_csv("Palantir_stock_history.csv")

Do some preprocessing like Lab 2

In [4]:
# Keep only the four price columns and round every value to the nearest integer
# ('Close' is used as the class label further down, so it needs discrete values).
# Selecting and rounding in one chained expression produces a fresh frame: it
# replaces four copy-pasted per-column round() assignments and avoids writing
# into the result of a .loc selection.
features = ['Open', 'High', 'Low', 'Close']
palantir = palantir.loc[:, features].round()
palantir.head()

Unnamed: 0,Open,High,Low,Close
0,10.0,11.0,9.0,10.0
1,10.0,10.0,9.0,9.0
2,9.0,9.0,9.0,9.0
3,9.0,9.0,9.0,9.0
4,9.0,10.0,9.0,10.0


Defining the Gaussian Bayesian Classifier

In [5]:
def compute_gaussian(mean, stddev, x):
    """Evaluate the normal probability density with the given parameters at ``x``.

    Parameters
    ----------
    mean : centre of the distribution.
    stddev : standard deviation of the distribution.
    x : point at which the density is evaluated.

    Returns
    -------
    The density value, or the constant ``1`` when ``stddev`` is exactly zero
    (the formula would otherwise divide by zero).
    """
    # Zero spread: short-circuit with 1 regardless of x instead of dividing by 0.
    if stddev == 0:
        return 1
    z_squared = ((x - mean) / stddev) ** 2
    normaliser = stddev * math.sqrt(2 * math.pi)
    return (math.e ** (z_squared * -0.5)) / normaliser

def specific_class_conditional(x,xv,y,yv):
    """Posterior probability of class ``yv`` given a single feature value ``xv``.

    For every class label in ``y`` a Gaussian is fitted (via ``compute_gaussian``)
    to the values of ``x`` belonging to that class; the likelihood at ``xv`` is
    weighted by the class prior and normalised over all classes.

    Parameters
    ----------
    x : pandas Series holding one feature, aligned with ``y``.
    xv : feature value at which to evaluate the posterior.
    y : pandas Series of class labels.
    yv : class label whose posterior is requested.

    Returns
    -------
    ``likelihood[yv] * prior[yv] / sum_c(likelihood[c] * prior[c])``, or ``0``
    when the normaliser is zero or ``yv`` never occurs in ``y``.
    """
    # Count once up front; the original recomputed value_counts() twice per class.
    counts = y.value_counts()
    # A label that never occurs has prior 0, so its posterior is 0
    # (previously this surfaced as a KeyError).
    if yv not in counts.index:
        return 0
    total = counts.sum()

    likelihoods = {}
    priors = {}
    denom = 0
    # value_counts() ignores missing labels, so skip them here as well instead
    # of failing on the lookup.
    for uy in y.dropna().unique():
        count = counts.loc[uy]
        if count <= 1:
            # A single sample has no defined sample standard deviation, so the
            # class contributes no likelihood.
            likelihoods[uy] = 0
        else:
            in_class = x.loc[y == uy]
            likelihoods[uy] = compute_gaussian(in_class.mean(), in_class.std(), xv)
        priors[uy] = count / total
        denom += likelihoods[uy] * priors[uy]

    if denom != 0:
        return likelihoods[yv] * priors[yv] / denom
    return 0

def class_conditional(X,y):
    """Tabulate the per-feature posterior for every (class, column, value) triple.

    Parameters
    ----------
    X : pandas DataFrame of features (column labels must be strings).
    y : pandas Series of class labels; ``y.name`` must be a string.

    Returns
    -------
    dict mapping ``"<col>=<value>|<y.name>=<label>"`` to the result of
    ``specific_class_conditional(X[col], value, y, label)``, for each unique
    label in ``y``, each column of ``X`` and each unique value in that column.
    """
    return {
        # str.join keeps the original's requirement that col and y.name are strings.
        "".join((col, "=", str(value), "|", y.name, "=", str(label))):
            specific_class_conditional(X[col], value, y, label)
        for label in y.unique()
        for col in X.columns
        for value in X[col].unique()
    }

In [6]:
# Features: every remaining price column; target: the (rounded) closing price.
X = palantir.drop(columns="Close")
y = palantir["Close"]

### Palantir Stock Dataset Accuracy

Accuracy of Gaussian-Bayes Classifier

In [7]:
# Fix NumPy's global seed before splitting — presumably Lab2_helper.train_test_split
# draws from np.random, which would make this split repeatable; confirm in the helper.
np.random.seed(2)
Xtrain,ytrain,Xtest,ytest = Lab2_helper.train_test_split(X,y)
# Score returned by gaussian_helper.get_accuracy for this split.
accuracy = gaussian_helper.get_accuracy(Xtrain,ytrain,Xtest,ytest)
accuracy

0.524822695035461

Accuracy of Lab 2 classifier

In [8]:
# NOTE(review): a new split is drawn here without reseeding, so this score is
# presumably measured on a different train/test partition than the Gaussian
# result in the previous cell — confirm the two numbers are meant to be comparable.
Xtrain,ytrain,Xtest,ytest = Lab2_helper.train_test_split(X,y)
# Score returned by Lab2_helper.exercise_6 for this split.
accuracy = Lab2_helper.exercise_6(Xtrain,ytrain,Xtest,ytest)
accuracy

0.45390070921985815

### Fruit Dataset Accuracy

Test our classifier and compare it to the Lab 2 classifier on the fruit dataset

In [9]:
# Columns to keep: the label plus the four remaining measurement columns.
fruit_features = ['fruit_label', 'mass', 'width', 'height', 'color_score']
# Read the fruit data from the course directory under the user's home and
# restrict it to those columns.
fruits = pd.read_csv(f'{home}/csc-466-student/data/fruit_data_with_colours.csv')[fruit_features]
# Features are everything except the label; the label is the target.
Xf = fruits.drop(columns="fruit_label")
yf = fruits["fruit_label"]

Accuracy of Gaussian Bayes classifier on fruit dataset

In [10]:
# Split the fruit data. NOTE(review): no seed is set in this cell, so the split
# depends on whatever random state earlier cells left behind — confirm.
Xftrain,yftrain,Xftest,yftest = Lab2_helper.train_test_split(Xf,yf)
# Score returned by gaussian_helper.get_accuracy for this split.
faccuracy = gaussian_helper.get_accuracy(Xftrain,yftrain,Xftest,yftest)
faccuracy

0.3793103448275862

Accuracy of Lab 2 classifier

In [11]:
# NOTE(review): the data is split again here, so this score is presumably
# computed on a different partition than the Gaussian score in the previous cell.
Xftrain,yftrain,Xftest,yftest = Lab2_helper.train_test_split(Xf,yf)
# Score returned by Lab2_helper.exercise_6 for this split.
faccuracy = Lab2_helper.exercise_6(Xftrain,yftrain,Xftest,yftest)
faccuracy

0.3103448275862069

### Permutation Feature Importance

Feature importance on Palantir Stock dataset with Lab 2 classifier

In [12]:
# Only the first tenth of the rows is used (a positional slice, not a random
# sample) — presumably to keep the permutation runs fast; confirm.
Xtrain,ytrain,Xtest,ytest = Lab2_helper.train_test_split(X[:len(X) // 10], y[:len(y) // 10])
# Test-based importance from Lab2_helper; the displayed dict has one entry per feature column.
Lab2_helper.test_based_feature_importance(Xtrain,ytrain,Xtest,ytest)

{'Open': -0.014285714285714258,
 'High': 0.30000000000000004,
 'Low': 0.1071428571428572}

In [13]:
# Train-based variant from Lab2_helper, run on the same split as the test-based cell above.
Lab2_helper.train_based_feature_importance(Xtrain,ytrain,Xtest,ytest)

{'Open': 0.028571428571428602,
 'High': -0.039285714285714264,
 'Low': 0.16071428571428575}

Feature importance on Palantir Stock data with Gaussian Bayes classifier

In [14]:
# Reuses the Xtrain/ytrain/Xtest/ytest split created for the Lab 2 importance
# cells, so both classifiers are scored on the same 10% subset.
gaussian_helper.test_based_feature_importance(Xtrain,ytrain,Xtest,ytest)

{'Open': -0.014285714285714268,
 'High': 0.3285714285714286,
 'Low': 0.050000000000000044}

In [15]:
# Train-based variant from gaussian_helper, run on the same split as the cell above.
gaussian_helper.train_based_feature_importance(Xtrain,ytrain,Xtest,ytest)

{'Open': 0.0035714285714286004,
 'High': -0.01785714285714283,
 'Low': 0.1535714285714286}

Feature importance on Fruit dataset with Lab 2 classifier

In [16]:
# Reseed NumPy's global RNG before splitting (presumably what the helper's split relies on — confirm).
np.random.seed(1)
# Only the first half of the rows is used (positional slice, not a random sample).
Xftrain,yftrain,Xftest,yftest = Lab2_helper.train_test_split(Xf[:len(Xf) // 2],yf[:len(yf) // 2])
Lab2_helper.test_based_feature_importance(Xftrain,yftrain,Xftest,yftest)

{'mass': 0.0, 'width': 0.0, 'height': 0.0, 'color_score': 0.0}

In [17]:
# Train-based variant from Lab2_helper, run on the same fruit split as the cell above.
Lab2_helper.train_based_feature_importance(Xftrain,yftrain,Xftest,yftest)

{'mass': 0.0, 'width': 0.0, 'height': 0.0, 'color_score': 0.0}

Feature importance on Fruit dataset with Gaussian Bayes classifier

In [18]:
# NOTE(review): the half-size subset is split again without reseeding, so the
# Gaussian importances are presumably computed on a different partition than
# the Lab 2 importances above — confirm.
Xftrain,yftrain,Xftest,yftest = Lab2_helper.train_test_split(Xf[:len(Xf) // 2],yf[:len(yf) // 2])
gaussian_helper.test_based_feature_importance(Xftrain,yftrain,Xftest,yftest)

{'mass': 0.0, 'width': 0.0, 'height': 0.0, 'color_score': 0.0}

In [19]:
# Train-based variant from gaussian_helper, run on the same fruit split as the cell above.
gaussian_helper.train_based_feature_importance(Xftrain,yftrain,Xftest,yftest)

{'mass': 0.0, 'width': 0.0, 'height': 0.0, 'color_score': 0.0}

### Ethical Analysis on another dataset

I chose the [AirBnb NYC Listing](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data) dataset from Kaggle

In [13]:
# Load the Airbnb NYC listings. The path is relative, so it is resolved
# against the kernel's current working directory.
# (The original used an f-string with no placeholders; a plain literal is equivalent.)
airbnb = pd.read_csv("AB_NYC_2019.csv")

In [15]:
# Rebinds `features` (previously the Palantir column list) to the Airbnb columns used below.
features = ['neighbourhood_group', 'room_type', 'price']
# Keep only those three columns.
airbnb = airbnb.loc[:,features]

Label encoding our features

In [22]:
# Convert both columns to pandas categoricals so integer codes can be read from them afterwards.
airbnb['neighbourhood_group'] = airbnb['neighbourhood_group'].astype('category')
airbnb['room_type'] = airbnb['room_type'].astype('category')
# Disabled: 'latitude' is not among the columns selected above, so this line cannot run as-is.
#airbnb['latitude'] = round(airbnb['latitude'], 1)
# Floor each price to the multiple of 50 at or below it, turning price into bucketed labels.
airbnb['price'] = (airbnb['price'] // 50) * 50

In [23]:
# Store the integer code of each category (from the .cat accessor) in new columns.
# Note the new column is spelled 'neighborhood_cat' while the source column is 'neighbourhood_group'.
airbnb['neighborhood_cat'] = airbnb['neighbourhood_group'].cat.codes
airbnb['room_cat'] = airbnb['room_type'].cat.codes

In [24]:
# NOTE: X and y are rebound here — from this point on they refer to the Airbnb
# data rather than the Palantir data.
# Features: the two encoded categorical columns; target: the bucketed price.
X = airbnb.loc[:, ['neighborhood_cat', 'room_cat']]
y = airbnb['price']

In [26]:
#class_conditional(X[:len(X) // 10],y[:len(y) // 10])

In [28]:
# X[:len(X)] and y[:len(y)] select every row — the slices are a no-op, presumably
# left over from experimenting with a smaller subset (see the commented-out 10% slice above).
Xtrain,ytrain,Xtest,ytest = Lab2_helper.train_test_split(X[:len(X)],y[:len(y)])
# Score returned by gaussian_helper.get_accuracy on the Airbnb split.
accuracy = gaussian_helper.get_accuracy(Xtrain,ytrain,Xtest,ytest)
accuracy

0.10369370474904896

In [None]:
# NOTE(review): `facebook` is not defined in any visible cell — load that dataset
# before running this one. "Rating" is presumably the target column; confirm.
# The original unpacked the result of drop("Rating") into two names. With the
# default axis=0 that call targets a *row* labelled "Rating", and it never
# produces a feature/target pair. Split explicitly, as done for the other datasets.
X_facebook = facebook.drop(columns="Rating")
y_facebook = facebook["Rating"]