#### Naive Bayes Classifier

In [1]:
import numpy as np
import pandas as pd

In [2]:
# A sample dataset to decide whether we watch a movie or not
# based on the columns: genre, outlook, subtitles, language
dataset = pd.read_csv('data.csv', header = 0)
features = np.array(dataset.columns[:-1])

In [3]:
print('Dataset:', dataset, sep = '\n')

Dataset:
      Genre   Outlook Subtitles Language Watch
0    Action     Sunny       Yes  English    No
1   Romance  Overcast        No    Hindi    No
2   Romance  Overcast        No  English   Yes
3    Comedy     Sunny       Yes    Hindi    No
4    Comedy  Overcast       Yes  English    No
5    Action     Rainy        No    Hindi    No
6   Romance     Sunny       Yes  English   Yes
7    Action     Rainy       Yes    Hindi   Yes
8    Action  Overcast        No  English    No
9   Romance     Sunny       Yes    Hindi    No
10   Comedy     Rainy        No    Hindi   Yes
11  Romance  Overcast       Yes    Hindi    No
12   Comedy     Rainy        No  English   Yes
13   Comedy  Overcast        No    Hindi   Yes


In [4]:
print('Features:', features)

Features: ['Genre' 'Outlook' 'Subtitles' 'Language']


In [5]:
def frequency_table(dataset: pd.DataFrame, feature: str) -> pd.DataFrame:
    F = np.append(dataset[feature].unique(), 'Total')
    Y = np.append(dataset.iloc[:, -1].unique(), 'Total')
    table = pd.DataFrame(data = np.zeros((len(F), len(Y))), index = F, columns = Y, dtype = int)
    for _, row in dataset.iterrows():
        table[row[dataset.columns[-1]]][row[feature]] += 1
        table[row[dataset.columns[-1]]]['Total'] += 1
        table['Total'][row[feature]] += 1
        table['Total']['Total'] += 1
    return table

In [6]:
tables = dict(zip(features, [frequency_table(dataset, f) for f in features]))

In [7]:
print('Frequency Tables: \n')
for feature in tables:
    print(f'{feature}: \n{tables[feature]}\n')

Frequency Tables: 

Genre: 
         No  Yes  Total
Action    3    1      4
Romance   3    2      5
Comedy    2    3      5
Total     8    6     14

Outlook: 
          No  Yes  Total
Sunny      3    1      4
Overcast   4    2      6
Rainy      1    3      4
Total      8    6     14

Subtitles: 
       No  Yes  Total
Yes     5    2      7
No      3    4      7
Total   8    6     14

Language: 
         No  Yes  Total
English   3    3      6
Hindi     5    3      8
Total     8    6     14



In [8]:
def predict(test: pd.DataFrame) -> pd.DataFrame:
    total, Y = 0, dataset.iloc[:, -1].unique()
    result = pd.DataFrame(np.ones((len(Y), 1)), index = Y, columns = ['Probability'], dtype = float)
    for y in Y:
        for f in features:
            result['Probability'][y] *= tables[f][y][test['Test'][f]] / tables[f][y]['Total']
        total += result['Probability'][y]
    for _, ry in result.iterrows():
        ry['Probability'] /= total
    return result

In [9]:
test = pd.DataFrame(data = ['Action', 'Sunny', 'No', 'Hindi'], index = features, columns = ['Test'])
pred = predict(test)

In [10]:
print(pred)
print('\nPrediction:', pred['Probability'].idxmax())

     Probability
No      0.780681
Yes     0.219319

Prediction: No
