In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the sample records into a DataFrame.
df = pd.read_csv('~/Documents/sample.csv')
# Display every row: slicing up to the row count is what head(n=<row count>) does.
df.iloc[:df.shape[0]]

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,hot,high,yes,no
1,sunny,hot,high,no,no
2,overcast,hot,high,no,yes
3,rainy,mild,high,no,yes
4,rainy,cool,normal,no,yes
5,rainy,cool,normal,yes,no
6,overcast,cool,normal,yes,yes
7,sunny,mild,high,no,no
8,sunny,cool,normal,no,yes
9,rainy,mild,normal,no,yes


In [3]:
# Unnormalised posterior score of one class: prior * likelihood.
def posterior_likelihood(df, X, label):
    """Return the unnormalised posterior score of ``label`` for sample ``X``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; the class is read from the 'Play' column.
    X : sequence
        Feature values of the sample, forwarded unchanged to ``likelihood``.
    label : object
        Class value to score.

    Returns
    -------
    float
        (fraction of rows whose 'Play' equals ``label``) multiplied by
        ``likelihood(df, X, label)``. The value is not normalised across
        classes.
    """
    n_rows_with_label = len(df[df['Play'] == label])
    class_prior = n_rows_with_label / len(df)
    return class_prior * likelihood(df, X, label)

def likelihood(df, X, label):
    """Return the naive-Bayes likelihood of sample ``X`` given class ``label``.

    The result is the product, over the features in ``X``, of the fraction of
    rows with 'Play' == ``label`` whose feature value equals the sample's.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows. Feature ``i`` of ``X`` is matched against the column at
        position ``i`` (``df.columns[i]``); the class is read from 'Play'.
    X : sequence
        Feature values in column order. Any iterable works (NumPy array,
        list, tuple).
    label : object
        Class value to condition on.

    Returns
    -------
    float
        Product of the per-feature conditional probabilities. Returns 0.0
        when no row of ``df`` has 'Play' == ``label`` (there is nothing to
        estimate from, and the class prior is zero as well).

    Notes
    -----
    The zero-count fallback is a Laplace-style correction, but it is applied
    only when a conditional probability is exactly zero; non-zero estimates
    are left unsmoothed, so the per-feature estimates need not sum to one.
    """
    # The class subset does not depend on the feature, so compute it once.
    class_rows = df[df['Play'] == label]
    n_class = class_rows.shape[0]
    if n_class == 0:
        # Avoid a ZeroDivisionError for a class that never occurs in df.
        return 0.0

    l = 1.0
    for ix, value in enumerate(X):
        column = df.columns[ix]
        matches = class_rows[class_rows[column] == value].shape[0]
        conditional_prob = matches / n_class
        # Laplace-style correction so an unseen (value, class) pair does not
        # zero out the whole product.
        if conditional_prob == 0.0:
            n_categories = np.unique(df[column]).shape[0]
            conditional_prob = 1.0 / (n_class + n_categories)
        l *= conditional_prob
    return l

In [4]:
# Naive Bayes prediction: score every class and pick the highest-scoring one.
def predict(df, X):
    """Classify sample ``X`` with naive Bayes.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; classes are the unique values of the 'Play' column.
    X : sequence
        Feature values of the sample, forwarded to ``posterior_likelihood``.

    Returns
    -------
    tuple
        ``(pred, likelihoods)`` where ``likelihoods`` is a list holding the
        posterior score of each class, in the order returned by
        ``np.unique(df['Play'])``, and ``pred`` is the class with the highest
        score (the first such class on ties, per ``argmax``).
    """
    classes = np.unique(df['Play'])
    scores = [posterior_likelihood(df, X, cls) for cls in classes]
    best_ix = np.argmax(np.array(scores))
    return classes[best_ix], scores

In [5]:
# Classify the first test sample and report the score of each class.
# The scores follow the order of np.unique(df['Play']) used inside predict,
# so index 0 is printed as 'no' and index 1 as 'yes'.
X_test_01 = np.array(['sunny', 'cool', 'high', 'yes'])
pred, likelihoods = predict(df, X_test_01)
print('X_test = ', X_test_01, end='\n\n')
print("Likelihood of 'no' = ", likelihoods[0])
print("Likelihood of 'yes' = ", likelihoods[1], end='\n\n')
print('Prediction = ', pred)

X_test =  ['sunny' 'cool' 'high' 'yes']

Likelihood of 'no' =  0.02057142857142857
Likelihood of 'yes' =  0.005291005291005291

Prediction =  no


In [6]:
# Classify the second test sample and report the score of each class.
# The scores follow the order of np.unique(df['Play']) used inside predict,
# so index 0 is printed as 'no' and index 1 as 'yes'.
X_test_02 = np.array(['rainy', 'mild', 'normal', 'no'])
pred, likelihoods = predict(df, X_test_02)
print('X_test = ', X_test_02, end='\n\n')
print("Likelihood of 'no' = ", likelihoods[0])
print("Likelihood of 'yes' = ", likelihoods[1], end='\n\n')
print('Prediction = ', pred)

X_test =  ['rainy' 'mild' 'normal' 'no']

Likelihood of 'no' =  0.004571428571428573
Likelihood of 'yes' =  0.042328042328042326

Prediction =  yes


In [7]:
# Classify the third test sample and report the score of each class.
# The scores follow the order of np.unique(df['Play']) used inside predict,
# so index 0 is printed as 'no' and index 1 as 'yes'.
X_test_03 = np.array(['overcast', 'mild', 'normal', 'no'])
pred, likelihoods = predict(df, X_test_03)
print('X_test = ', X_test_03, end='\n\n')
print("Likelihood of 'no' = ", likelihoods[0])
print("Likelihood of 'yes' = ", likelihoods[1], end='\n\n')
print('Prediction = ', pred)

X_test =  ['overcast' 'mild' 'normal' 'no']

Likelihood of 'no' =  0.001428571428571429
Likelihood of 'yes' =  0.056437389770723094

Prediction =  yes
