In [1]:
import pandas as pd
import numpy as np

In [2]:
with open("column_names.txt") as f:
    names = f.read()

In [3]:
names = names.split(',')

In [4]:
names

['label',
 'cap-shape',
 'cap-surface',
 'cap-color',
 'bruises',
 'odor',
 'gill-attachment',
 'gill-spacing',
 'gill-size',
 'gill-color',
 'stalk-shape',
 'stalk-root',
 'stalk-surface-above-ring',
 'stalk-surface-below-ring',
 'stalk-color-above-ring',
 'stalk-color-below-ring',
 'veil-type',
 'veil-color',
 'ring-number',
 'ring-type',
 'spore-print-color',
 'population',
 'habitat']

In [5]:
df = pd.read_csv('agaricus-lepiota.data', header=None, na_values='?')

In [6]:
df.columns = names

In [7]:
df = df.dropna()

In [8]:
for col in names:
    print(col, df[col].unique())

label ['p' 'e']
cap-shape ['x' 'b' 's' 'f' 'k' 'c']
cap-surface ['s' 'y' 'f' 'g']
cap-color ['n' 'y' 'w' 'g' 'e' 'p' 'b' 'c']
bruises ['t' 'f']
odor ['p' 'a' 'l' 'n' 'f' 'c' 'm']
gill-attachment ['f' 'a']
gill-spacing ['c' 'w']
gill-size ['n' 'b']
gill-color ['k' 'n' 'g' 'p' 'w' 'h' 'u' 'r' 'y']
stalk-shape ['e' 't']
stalk-root ['e' 'c' 'b' 'r']
stalk-surface-above-ring ['s' 'f' 'k' 'y']
stalk-surface-below-ring ['s' 'f' 'y' 'k']
stalk-color-above-ring ['w' 'g' 'p' 'n' 'b' 'c' 'y']
stalk-color-below-ring ['w' 'p' 'g' 'b' 'n' 'c' 'y']
veil-type ['p']
veil-color ['w' 'y']
ring-number ['o' 't' 'n']
ring-type ['p' 'e' 'l' 'n']
spore-print-color ['k' 'n' 'u' 'h' 'r' 'w']
population ['s' 'n' 'a' 'v' 'y' 'c']
habitat ['u' 'g' 'm' 'd' 'p' 'l']


In [9]:
df = df.drop('veil-type', axis=1)
names.remove('veil-type')

In [10]:
X_df = df[names[1:]]
y_df = df[names[:1]]

In [11]:
X = X_df.values
y = y_df.values

In [12]:
from naive_bayes_categorical import NBCategorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Naive Bayes - one model

In [14]:
model = NBCategorical(smoothing_parameter=0.0)

In [15]:
model.fit(X_train, y_train)

In [16]:
y_pred = model.predict(X_test)

In [17]:
accuracy_score(y_test, y_pred)

0.9964564138908576

# Random Bayes - many models

In [18]:
from random_bayes import RandomBayes

In [19]:
model_rb = RandomBayes(n_submodels=30, n_features=5, smooth=1.0)

In [20]:
model_rb.fit(X_train, y_train)

In [21]:
y_pred_rb = model_rb.predict(X_test)



In [22]:
accuracy_score(y_test, y_pred_rb)

0.9475549255846917

In [23]:
accuracy_score(y_test, y_pred)

0.9964564138908576


# Experiments

In [36]:
import itertools

n_submodels_vals = [5, 10, 20, 50, 100]
n_features_vals = [5, 10, 15, 20]
smooth_vals = [0.0, 0.5, 1.0, 5.0]
bootstrap_vals = [True, False]

combinations = list(itertools.product(n_submodels_vals, n_features_vals, smooth_vals, bootstrap_vals))
num_iterations = 10

In [35]:
random_bayes_results = {}
single_bayes_results = {}

In [46]:
for c in combinations:
    rb_list = []
    sb_list = []
    
    for _ in range(num_iterations):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
        
        model_rb = RandomBayes(n_submodels=c[0], n_features=c[1], smooth=c[2], bootstrap=c[3])
        model_rb.fit(X_train, y_train)
        rb_list.append(accuracy_score(y_test, model_rb.predict(X_test)))
        
        if c[2] in single_bayes_results.keys():
            continue
        else:
            model_sb = NBCategorical(smoothing_parameter=c[2])
            model_sb.fit(X_train, y_train)
            sb_list.append(accuracy_score(y_test, model_sb.predict(X_test)))
        
    rb_mean = np.mean(np.array(rb_list))
    rb_std = np.std(np.array(rb_list))
    random_bayes_results[c] = (rb_mean, rb_std)
    if c[2] in single_bayes_results.keys():
        continue
    else:
        sb_mean = np.mean(np.array(sb_list))
        sb_std = np.std(np.array(sb_list))
        single_bayes_results[c] = (sb_mean, sb_std)

IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (4233,) (5,) 

In [47]:
a = np.ones((10, 10))

In [48]:
a[[1, 3, 4], [5, 6, 7]]

array([1., 1., 1.])

In [49]:
a[np.array([1, 3, 4])[:, None], np.array[5, 6, 7]]

TypeError: 'builtin_function_or_method' object is not subscriptable