In [1]:
import sys

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.naive_bayes import BaseNB

sys.path.append('../naive-bayes')

from naive_bayes import *

In [6]:
# Load scikit-learn's bundled wine classification dataset.
wine = datasets.load_wine()

In [7]:
# Features as a DataFrame with named columns; class labels as a Series.
X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = pd.Series(wine.target)

# Sanity-check the feature matrix dimensions (rows, columns).
X.shape

(178, 13)

In [8]:
# Fit the naive Bayes model on the full dataset.
# NOTE(review): GeneralNB is presumably provided by `from naive_bayes import *` — confirm.
gnb = GeneralNB()
gnb.fit(X, y)

In [21]:
# Gotcha: np.array() does not unpack a dict_values view; the result is an
# object-dtype array wrapping the view itself (see the output below).
# Materialize the values first, e.g. list(gnb.class_probs.values()), to get a
# numeric array.
np.array(gnb.class_probs.values())

array(dict_values([0.33146067415730335, 0.398876404494382, 0.2696629213483146]),
      dtype=object)

In [28]:
# Inspect the fitted parameters: a nested dict keyed by class label, then by
# feature name, holding a 'mean' and 'std' for each feature shown.
gnb.cond_probs

{0: {'alcohol': {'mean': 13.744745762711865, 'std': 0.45819230635257235},
  'malic_acid': {'mean': 2.010677966101695, 'std': 0.6826887630111},
  'ash': {'mean': 2.4555932203389834, 'std': 0.22523261938580774},
  'alcalinity_of_ash': {'mean': 17.037288135593222, 'std': 2.524651229820095},
  'magnesium': {'mean': 106.33898305084746, 'std': 10.409594937480737},
  'total_phenols': {'mean': 2.8401694915254234, 'std': 0.3360765223870026},
  'flavanoids': {'mean': 2.982372881355932, 'std': 0.3941106227867916},
  'nonflavanoid_phenols': {'mean': 0.29000000000000004,
   'std': 0.06945306914427413},
  'proanthocyanins': {'mean': 1.8993220338983048, 'std': 0.4086018509078915},
  'color_intensity': {'mean': 5.528305084745763, 'std': 1.2280315688295402},
  'hue': {'mean': 1.0620338983050848, 'std': 0.11549128165814294},
  'od280/od315_of_diluted_wines': {'mean': 3.1577966101694916,
   'std': 0.3540375724416456},
  'proline': {'mean': 1115.7118644067796, 'std': 219.63544918159718}},
 1: {'alcohol': 

In [39]:
def _vectorized_get(key, dict_):
    return np.vectorize(dict__getitem__)(key)

# Start the joint log-likelihood from the log class priors: one row holding a
# value per entry of gnb.class_probs, copied gnb.n_samples times along axis 0
# so per-feature terms can be added to it afterwards.
class_probs_arr = np.atleast_2d(list(gnb.class_probs.values()))
log_priors = np.log(class_probs_arr)
jll = np.repeat(log_priors, gnb.n_samples, axis=0)

# Add each categorical feature's log conditional probabilities to the joint
# log-likelihood `jll`, with one column per class in gnb.classes_.
for feat in gnb.categorical_columns:
    # Fixed: the original expression used `self.cond_probs`, `self.classes_`
    # and `X[col]`. There is no `self` in a notebook cell and this loop's
    # variable is `feat`, so it failed as soon as a categorical column existed.
    # NOTE(review): assumes gnb.cond_probs[label][feat] maps a category value
    # to its conditional probability — confirm against GeneralNB.fit.
    per_class_probs = [
        _vectorized_get(X[feat].values, gnb.cond_probs[label][feat])
        for label in gnb.classes_
    ]
    # log(0) = -inf is expected here and handled right below, so suppress the
    # divide-by-zero RuntimeWarning numpy would otherwise emit.
    with np.errstate(divide='ignore'):
        log_probs = np.log(np.stack(per_class_probs, axis=1))
    # impute log cond prob to 0 in case of 0% cond prob in training set
    # TODO: this should probably be a toggle option in case user wants to throw an error on unseen values
    log_probs[log_probs == -np.inf] = 0

    jll += log_probs
    

# Add each numerical feature's log-density, evaluated under every class's
# fitted mean/std, to the joint log-likelihood and display the result.
# NOTE(review): gaussian_pdf is presumably provided by `from naive_bayes import *` — confirm.
for col in gnb.numerical_columns:
    feature_values = X[col].values
    # One Python-level iteration per (feature, class) pair; accepted for now.
    per_class_log_probs = [
        # NaN log-densities (e.g. from missing inputs) are imputed to 0.
        # Note that np.nan_to_num also replaces -inf with the most negative
        # finite float.
        np.nan_to_num(
            np.log(
                gaussian_pdf(
                    feature_values,
                    gnb.cond_probs[label][col]['mean'],
                    gnb.cond_probs[label][col]['std'],
                )
            )
        )
        for label in gnb.classes_
    ]
    # Stack to (n_samples, n_classes) so it lines up with jll.
    jll += np.stack(per_class_log_probs, axis=1)
jll

array([[ -16.13977307,  -38.86047164, -108.64310897],
       [ -17.36976848,  -28.27790401,  -85.23142225],
       [ -15.01604426,  -34.08143524,  -95.05934416],
       [ -22.44296814,  -64.07144757, -135.11431529],
       [ -18.55127559,  -21.871907  ,  -60.65813409],
       [ -15.12061381,  -51.97139539, -112.09222493],
       [ -14.80380948,  -40.51635072,  -89.26591822],
       [ -15.39047203,  -38.62619499,  -86.68630159],
       [ -15.86996702,  -37.42553527,  -83.14005468],
       [ -14.13594558,  -39.38469492,  -95.01032172],
       [ -15.70026383,  -49.4025116 , -123.65846774],
       [ -16.3785873 ,  -35.4712836 ,  -70.4569399 ],
       [ -13.48786562,  -36.91787518,  -82.25197239],
       [ -25.09373277,  -46.29311103, -124.61697005],
       [ -23.32654789,  -65.71242339, -148.14335805],
       [ -15.42295775,  -44.85838535,  -90.85654801],
       [ -15.1933934 ,  -43.15414427,  -86.67467257],
       [ -15.90972466,  -37.70307374,  -85.99166735],
       [ -23.25969242,  -74.

In [27]:
# Recompute the model parameters by hand: per-class priors plus, for every
# numerical column, the NaN-ignoring mean and standard deviation of the rows
# belonging to that class.
class_probs = {}
cond_probs = {}
for label in gnb.classes_:
    # Prior = share of training labels equal to this class.
    class_probs[label] = sum(gnb.y == label) / gnb.n_samples

    per_feature = {}
    for col in gnb.numerical_columns:
        class_values = X.loc[y == label, col].values
        # np.nanstd uses its default ddof=0 (population standard deviation).
        per_feature[col] = {
            'mean': np.nanmean(class_values),
            'std': np.nanstd(class_values),
        }
    cond_probs[label] = per_feature

In [28]:
# Display the hand-computed parameters (same nested layout as gnb.cond_probs)
# so they can be compared with the fitted model's values.
cond_probs

{0: {'alcohol': {'mean': 13.744745762711865, 'std': 0.45819230635257235},
  'malic_acid': {'mean': 2.010677966101695, 'std': 0.6826887630111},
  'ash': {'mean': 2.4555932203389834, 'std': 0.22523261938580774},
  'alcalinity_of_ash': {'mean': 17.037288135593222, 'std': 2.524651229820095},
  'magnesium': {'mean': 106.33898305084746, 'std': 10.409594937480737},
  'total_phenols': {'mean': 2.8401694915254234, 'std': 0.3360765223870026},
  'flavanoids': {'mean': 2.982372881355932, 'std': 0.3941106227867916},
  'nonflavanoid_phenols': {'mean': 0.29000000000000004,
   'std': 0.06945306914427413},
  'proanthocyanins': {'mean': 1.8993220338983048, 'std': 0.4086018509078915},
  'color_intensity': {'mean': 5.528305084745763, 'std': 1.2280315688295402},
  'hue': {'mean': 1.0620338983050848, 'std': 0.11549128165814294},
  'od280/od315_of_diluted_wines': {'mean': 3.1577966101694916,
   'std': 0.3540375724416456},
  'proline': {'mean': 1115.7118644067796, 'std': 219.63544918159718}},
 1: {'alcohol': 

In [31]:
# Spot-check the standard deviation of 'proline' within class 2 using pandas.
# Note: Series.std() defaults to ddof=1 (sample std), whereas the np.nanstd
# call used for cond_probs defaults to ddof=0, so the two can differ slightly.
X.loc[y == 2, 'proline'].std()

115.09704315911696

In [35]:
# int() called with no arguments returns 0, so `int` can serve as a
# zero-argument factory.
a = int()
a

0

In [39]:
# A zero-argument callable that always returns 1.
one = lambda: 1
one()

1

In [33]:
# Import here so this cell runs on a fresh kernel: in file order, the only
# explicit `from collections import defaultdict` appears in a later cell.
from collections import defaultdict

# Print the built-in documentation for defaultdict.
help(defaultdict)

Help on class defaultdict in module collections:

class defaultdict(builtins.dict)
 |  defaultdict(default_factory[, ...]) --> dict with default factory
 |  
 |  The default factory is called without arguments to produce
 |  a new value when a key is not present, in __getitem__ only.
 |  A defaultdict compares equal to a dict with the same items.
 |  All remaining arguments are treated the same as if they were
 |  passed to the dict constructor, including keyword arguments.
 |  
 |  Method resolution order:
 |      defaultdict
 |      builtins.dict
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __copy__(...)
 |      D.copy() -> a shallow copy of D.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __init__(self, /, *args, **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __missing__(...)
 |      __missing__(key) # Called by __getitem__ for missing key; pseudo-code:
 |      if self.default_facto

In [42]:
from collections import defaultdict

# The factory (a lambda returning 1) supplies values for missing keys only;
# explicitly assigned keys keep their own value.
t = defaultdict(lambda: 1)
t[13] = 5

# An assigned key returns the stored value, not the default.
t[13]

5

In [43]:
# Looking up an absent key returns the factory's value and also stores it in
# the dict.
t[4]

1

In [44]:
# Show the dict contents, including any keys inserted by earlier lookups.
t

defaultdict(<function __main__.<lambda>()>, {13: 5, 4: 1})

In [46]:
# log(0) evaluates to -inf; under numpy's default error settings this emits a
# RuntimeWarning rather than raising an exception.
np.log(0)

  """Entry point for launching an IPython kernel.


-inf