In [1]:
# imports
import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io.arff import loadarff
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from naivebayes import NaiveBayes

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

%matplotlib inline

In [2]:
# Load learn data
raw_data = loadarff("data/dataset_194_eucalyptus.arff");
df = pd.DataFrame(raw_data[0])

In [3]:
df.head()

Unnamed: 0,Abbrev,Rep,Locality,Map_Ref,Latitude,Altitude,Rainfall,Frosts,Year,Sp,PMCno,DBH,Ht,Surv,Vig,Ins_res,Stem_Fm,Crown_Fm,Brnch_Fm,Utility
0,b'Cra',1.0,b'Central_Hawkes_Bay',b'N135_382/137',b'39__38',100.0,850.0,-2.0,1980.0,b'co',1520.0,18.45,9.96,40.0,4.0,3.0,3.5,4.0,3.5,b'good'
1,b'Cra',1.0,b'Central_Hawkes_Bay',b'N135_382/137',b'39__38',100.0,850.0,-2.0,1980.0,b'fr',1487.0,13.15,9.65,90.0,4.5,4.0,3.5,3.5,3.0,b'best'
2,b'Cra',1.0,b'Central_Hawkes_Bay',b'N135_382/137',b'39__38',100.0,850.0,-2.0,1980.0,b'ma',1362.0,10.32,6.5,50.0,2.3,2.5,3.0,3.5,3.0,b'low'
3,b'Cra',1.0,b'Central_Hawkes_Bay',b'N135_382/137',b'39__38',100.0,850.0,-2.0,1980.0,b'nd',1596.0,14.8,9.48,70.0,3.7,3.0,3.3,4.0,3.5,b'good'
4,b'Cra',1.0,b'Central_Hawkes_Bay',b'N135_382/137',b'39__38',100.0,850.0,-2.0,1980.0,b'ni',2088.0,14.5,10.78,90.0,4.0,2.7,3.3,3.0,3.0,b'good'


In [4]:
df = df.dropna()

In [5]:
df.shape

(641, 20)

## Custom Implementation

In [11]:
train, test = train_test_split(df, test_size=0.3, random_state=42, shuffle=True)
X_test = test.drop(['Utility'], axis=1)
y_test = list(test['Utility'])

In [12]:
nb = NaiveBayes()
nb.fit(train, 'Utility')

{'Abbrev': {b'Wak': {'general': 0.11607142857142858, b'low': 0.024553571428571428, b'good': 0.022321428571428572, b'average': 0.015625, b'none': 0.03794642857142857, b'best': 0.015625}, b'Cra': {'general': 0.049107142857142856, b'low': 0.004464285714285714, b'good': 0.017857142857142856, b'average': 0.015625, b'none': 0.0, b'best': 0.011160714285714286}, b'Puk': {'general': 0.13839285714285715, b'low': 0.017857142857142856, b'good': 0.03794642857142857, b'average': 0.03571428571428571, b'none': 0.03794642857142857, b'best': 0.008928571428571428}, b'K83': {'general': 0.08928571428571429, b'low': 0.017857142857142856, b'good': 0.015625, b'average': 0.029017857142857144, b'none': 0.020089285714285716, b'best': 0.006696428571428571}, b'WSp': {'general': 0.08035714285714286, b'low': 0.002232142857142857, b'good': 0.04017857142857143, b'average': 0.024553571428571428, b'none': 0.004464285714285714, b'best': 0.008928571428571428}, b'K81a': {'general': 0.05357142857142857, b'low': 0.0066964285

In [13]:
predictions = nb.predict(X_test)

In [14]:
accuracy_score(y_test, predictions)

0.5492227979274611

## SKLearn

In [46]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder  

In [47]:
le = LabelEncoder()
nominal_features = ["Abbrev", "Locality", "Map_Ref", "Latitude", "Sp"]

X = df.drop(columns=['Utility'])
y = df['Utility']

for feature in nominal_features:
    X[feature] = le.fit_transform(X[feature])

y = le.fit_transform (y)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [48]:
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

In [49]:
accuracy_score(y_test, y_pred)

0.5595854922279793