In [1]:
### Naive Bayes : Movie Recommendation Program

import numpy as np

X_train = np.array([
    [0, 1, 1],
    [0, 0, 1],
    [0, 0, 0],
    [1, 1, 0]])
Y_train = ['Y', 'N', 'Y', 'Y']
X_test = np.array([[1, 1, 0]])

In [10]:
def get_label_indices(labels):
    """
    Group samples based on their labels and return indices
    @param labels: list of labels
    @return: dict, {class1: [indices], class2: [indices]}
    """
    from collections import defaultdict
    label_indices = defaultdict(list)
    for index, label in enumerate(labels):
        label_indices[label].append(index)
    return label_indices

In [24]:
def get_prior(label_indices):
    """
    Calculate prior probabilities for each label
    @param label_indices: dict, {class1: [indices], class2: [indices]}
    @return: dict, {class1: prior1, class2: prior2}
    """
    prior = {label: len(indices) for label, indices in
                                      label_indices.items()} 
    total_count = sum(prior.values())
    for label in prior:
        prior[label] /= total_count
    return prior

In [26]:
label_indices = get_label_indices(Y_train)

likelihood  = {}

for label, indices in label_indices.items():
    likelihood[label] = X_train[indices, :].sum(axis=0)

for label, indices in label_indices.items():
    print(f"Label: {label}, Indices: {indices}")

Label: Y, Indices: [0, 2, 3]
Label: N, Indices: [1]


In [28]:
X_train[0, :].sum(axis=0)

2

In [29]:
def get_likelihood(features, label_indices, smoothing=0):
    """
    Compute likelihood based on training samples
    @param features: matrix of features
    @param label_indices: grouped sample indices by class
    @param smoothing: integer, additive smoothing parameter
    @return: dictionary, with class as key, corresponding
              conditional probability P(feature|class) vector 
              as value
    """
    likelihood = {}
    for label, indices in label_indices.items():
        likelihood[label] = features[indices, :].sum(axis=0)
        total_count = len(indices)
        likelihood[label] = likelihood[label] / (total_count + 2 * smoothing)
    return likelihood


In [30]:
get_likelihood(X_train, label_indices, smoothing=1)

{'Y': array([0.2, 0.4, 0.2]), 'N': array([0.        , 0.        , 0.33333333])}