In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw, comma-separated list of column names.
# An explicit encoding keeps the read independent of the platform default, and
# strip() prevents a trailing newline in the file from ending up inside the
# last column name once the string is split.
with open("column_names.txt", encoding="utf-8") as f:
    names = f.read().strip()

In [3]:
# Turn the comma-separated string into a list of column names.
# The isinstance guard makes the cell safe to re-run: once `names` is already a
# list there is nothing left to split (a list has no .split()).
# Surrounding whitespace/newlines are stripped from every name.
if isinstance(names, str):
    names = [name.strip() for name in names.split(',')]

In [4]:
# Display the parsed column names to verify the split.
names

['label',
 'cap-shape',
 'cap-surface',
 'cap-color',
 'bruises',
 'odor',
 'gill-attachment',
 'gill-spacing',
 'gill-size',
 'gill-color',
 'stalk-shape',
 'stalk-root',
 'stalk-surface-above-ring',
 'stalk-surface-below-ring',
 'stalk-color-above-ring',
 'stalk-color-below-ring',
 'veil-type',
 'veil-color',
 'ring-number',
 'ring-type',
 'spore-print-color',
 'population',
 'habitat']

In [5]:
# Load the raw data file. It has no header row, and '?' entries are parsed as
# missing values (NaN).
df = pd.read_csv(
    'agaricus-lepiota.data',
    header=None,
    na_values='?',
)

In [6]:
# Label the columns with the names read from column_names.txt
# (pandas raises if the number of names does not match the number of columns).
df.columns = names

In [7]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [8]:
# Sanity check: how many column names were parsed.
print(len(names))

23


In [9]:
# Show the distinct values observed in each column; columns with a single
# value carry no information for a classifier.
for col in names:
    distinct_values = df[col].unique()
    print(col, distinct_values)

label ['p' 'e']
cap-shape ['x' 'b' 's' 'f' 'k' 'c']
cap-surface ['s' 'y' 'f' 'g']
cap-color ['n' 'y' 'w' 'g' 'e' 'p' 'b' 'c']
bruises ['t' 'f']
odor ['p' 'a' 'l' 'n' 'f' 'c' 'm']
gill-attachment ['f' 'a']
gill-spacing ['c' 'w']
gill-size ['n' 'b']
gill-color ['k' 'n' 'g' 'p' 'w' 'h' 'u' 'r' 'y']
stalk-shape ['e' 't']
stalk-root ['e' 'c' 'b' 'r']
stalk-surface-above-ring ['s' 'f' 'k' 'y']
stalk-surface-below-ring ['s' 'f' 'y' 'k']
stalk-color-above-ring ['w' 'g' 'p' 'n' 'b' 'c' 'y']
stalk-color-below-ring ['w' 'p' 'g' 'b' 'n' 'c' 'y']
veil-type ['p']
veil-color ['w' 'y']
ring-number ['o' 't' 'n']
ring-type ['p' 'e' 'l' 'n']
spore-print-color ['k' 'n' 'u' 'h' 'r' 'w']
population ['s' 'n' 'a' 'v' 'y' 'c']
habitat ['u' 'g' 'm' 'd' 'p' 'l']


In [10]:
# Remove the 'veil-type' column from both the frame and the name list.
# Written so the cell can be re-run safely: errors='ignore' skips the drop when
# the column is already gone, and the list is rebuilt instead of calling
# names.remove(), which raises ValueError on a second run.
df = df.drop(columns='veil-type', errors='ignore')
names = [name for name in names if name != 'veil-type']

In [11]:
# The first name is the target column; every remaining name is a feature.
feature_columns, label_columns = names[1:], names[:1]
X_df = df[feature_columns]
# Selecting with a one-element list keeps y_df a single-column DataFrame
# rather than a Series.
y_df = df[label_columns]

In [12]:
# Extract the underlying NumPy arrays; y is 2-D because y_df is a DataFrame.
X, y = X_df.values, y_df.values

In [13]:
# (rows, feature columns) of the feature matrix.
X.shape

(5644, 21)

In [14]:
from naive_bayes_categorical import NBCategorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [15]:
# Hold out 25% of the rows for evaluation.
# random_state pins the shuffle so the split is identical on every
# Restart & Run All; without it each run evaluates on different rows and the
# reported accuracies cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Naive Bayes - one model

In [16]:
# Single categorical naive Bayes model with the smoothing parameter set to 0.
# NOTE(review): the RandomBayes model below is built with smooth=1.0, so the two
# models are not compared under the same smoothing setting — confirm this is intended.
model = NBCategorical(smoothing_parameter=0.0)

In [17]:
# Fit the single model on the training split.
model.fit(X_train, y_train)

In [18]:
# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

In [19]:
# Accuracy of the single model on the held-out rows.
accuracy_score(y_test, y_pred)

1.0

# Random Bayes - many models

In [20]:
from random_bayes import RandomBayes

In [21]:
# Ensemble model from the local random_bayes module.
# NOTE(review): presumably 30 sub-models, each using 5 of the 21 features, with
# smoothing 1.0 — parameter meanings are inferred from their names; confirm
# against random_bayes.RandomBayes.
model_rb = RandomBayes(n_submodels=30, n_features=5, smooth=1.0)

In [22]:
# Fit the ensemble on the same training split as the single model.
model_rb.fit(X_train, y_train)

In [23]:
# Predict labels for the held-out rows with the ensemble.
y_pred_rb = model_rb.predict(X_test)



In [24]:
# Accuracy of the ensemble on the held-out rows.
accuracy_score(y_test, y_pred_rb)

0.9199149539333806

In [25]:
# Single-model accuracy again, shown next to the ensemble score for comparison.
accuracy_score(y_test, y_pred)

1.0


# Experiments

In [26]:
import itertools

# Hyper-parameter values to sweep in the experiments.
n_submodels_vals = [5, 10, 20, 50, 100]
n_features_vals = [5, 10, 15, 20]
smooth_vals = [0.0, 0.5, 1.0, 5.0]
bootstrap_vals = [True, False]

# Full Cartesian product of the four value lists, as a list of
# (n_submodels, n_features, smooth, bootstrap) tuples; the last list varies fastest.
combinations = [
    (n_submodels, n_features, smooth, bootstrap)
    for n_submodels in n_submodels_vals
    for n_features in n_features_vals
    for smooth in smooth_vals
    for bootstrap in bootstrap_vals
]

# NOTE(review): presumably the number of repetitions per combination — not used
# in the visible cells; confirm against the code that consumes it.
num_iterations = 10

In [27]:
# Disabled: parallel run of the experiment grid, kept for reference.
# When enabled it maps `process` (from the local Process module) over every
# entry of `combinations` using a pool of 8 worker processes.
# NOTE(review): while this stays commented out, `results` is never defined in
# the visible cells.
# import multiprocessing as mp
# from Process import process

# pool = mp.Pool(8)
# results = pool.map(process, combinations)

In [28]:
# Size of the training split (rows, features).
X_train.shape

(4233, 21)

In [29]:
# Size of the held-out split (rows, features).
X_test.shape

(1411, 21)