<a href="https://colab.research.google.com/github/sobhanshukueian/Machine-Learning-Projects-from-scratch/blob/main/Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Naive Bayes

## Bayes Theorem
Bayes’ Theorem finds the probability of an event occurring given the probability of another event that has already occurred. Bayes’ theorem is stated mathematically as the following equation:

bayes.svg

$$P(A \mid B) = \frac{P(B \mid A)\,P(A)}{P(B)}$$

## Naive assumption

If any two events A and B are independent, then:

#### P(A,B) = P(A)P(B)

We assume our features are independent, so we split the evidence into its independent parts.

bayes1.svg

In [3]:
import numpy as np 
import pandas as pd 	
import matplotlib.pyplot as plt 
import math

# Dataset
### The dataset is divided into two parts, namely the feature matrix and the response vector.

The feature matrix contains all the vectors (rows) of the dataset, where each vector consists of the values of the features. In the dataset below, the features are ‘Outlook’, ‘Temp’, ‘Humidity’ and ‘Windy’.
The response vector contains the value of the class variable (prediction or output) for each row of the feature matrix. In the dataset below, the class variable is named ‘Play’.

In [4]:
# Load the weather table (read_table is tab-delimited by default).
df = pd.read_table("/content/weather.txt")

# The last column is the class label; every other column is a feature.
target_column = df.columns[-1]
X = df.drop(columns=[target_column])
y = df[target_column]

print("---------------------Weather Dataset-----------------------\n{}\n-------------------X------------------\n{}\n-----------------------y-----------------------\n{}\n----------------------------------------------".format(df, X, y))


# Split data: the first `train_size` rows are used for training, the rest for testing.
train_size = 9
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

---------------------Weather Dataset-----------------------
     Outlook  Temp Humidity Windy Play
0      Rainy   Hot     High     f   no
1      Rainy   Hot     High     t   no
2   Overcast   Hot     High     f  yes
3      Sunny  Mild     High     f  yes
4      Sunny  Cool   Normal     f  yes
5      Sunny  Cool   Normal     t   no
6   Overcast  Cool   Normal     t  yes
7      Rainy  Mild     High     f   no
8      Rainy  Cool   Normal     f  yes
9      Sunny  Mild   Normal     f  yes
10     Rainy  Mild   Normal     t  yes
11  Overcast  Mild     High     t  yes
12  Overcast   Hot   Normal     f  yes
13     Sunny  Mild     High     t   no
-------------------X------------------
     Outlook  Temp Humidity Windy
0      Rainy   Hot     High     f
1      Rainy   Hot     High     t
2   Overcast   Hot     High     f
3      Sunny  Mild     High     f
4      Sunny  Cool   Normal     f
5      Sunny  Cool   Normal     t
6   Overcast  Cool   Normal     t
7      Rainy  Mild     High     f
8      Rai

# Calculation of each label's probability

In [5]:
# Column names double as the feature identifiers that key every lookup table.
features = list(X_train.columns)
train_size, num_feats = X_train.shape

# Zero-initialised lookup tables, filled in by the cells that follow:
#   likelihoods[feature]["<value>_<label>"]
#   label_probabilities[label]
#   pred_priors[feature][value]
likelihoods, label_probabilities, pred_priors = {}, {}, {}

outcomes = np.unique(y_train)
for feature in features:
    feature_values = np.unique(X_train[feature])
    pred_priors[feature] = {value: 0 for value in feature_values}
    likelihoods[feature] = {}

    for feat_val in feature_values:
        for outcome in outcomes:
            # Keys are "<feature value>_<label>", e.g. "Rainy_no".
            likelihoods[feature][feat_val + '_' + outcome] = 0
            label_probabilities[outcome] = 0

print(likelihoods)
print(label_probabilities)
print(pred_priors)

{'Outlook': {'Overcast_no': 0, 'Overcast_yes': 0, 'Rainy_no': 0, 'Rainy_yes': 0, 'Sunny_no': 0, 'Sunny_yes': 0}, 'Temp': {'Cool_no': 0, 'Cool_yes': 0, 'Hot_no': 0, 'Hot_yes': 0, 'Mild_no': 0, 'Mild_yes': 0}, 'Humidity': {'High_no': 0, 'High_yes': 0, 'Normal_no': 0, 'Normal_yes': 0}, 'Windy': {'f_no': 0, 'f_yes': 0, 't_no': 0, 't_yes': 0}}
{'no': 0, 'yes': 0}
{'Outlook': {'Overcast': 0, 'Rainy': 0, 'Sunny': 0}, 'Temp': {'Cool': 0, 'Hot': 0, 'Mild': 0}, 'Humidity': {'High': 0, 'Normal': 0}, 'Windy': {'f': 0, 't': 0}}


In [6]:
print("train_size: {}".format(train_size))

# Class prior for each label: the share of training rows that carry it.
for label in np.unique(y_train):
    # Cast to a plain int so the stored probability stays a built-in float.
    label_count = int((y_train == label).sum())
    label_probabilities[label] = label_count / train_size
    print("outcome_count: {}\tlabel_probability: {}".format(label_count, label_probabilities[label]))

train_size: 9
outcome_count: 4	label_probability: 0.4444444444444444
outcome_count: 5	label_probability: 0.5555555555555556


In [7]:
# Conditional likelihood of every feature value given a label, estimated as
# its relative frequency among the training rows that carry that label.
for feature in features:
    for label in np.unique(y_train):
        # Index labels of the training rows whose class equals `label`.
        label_rows = y_train.index[(y_train == label).values]
        label_count = len(label_rows)

        # value -> number of occurrences within those rows
        feat_likelihood = X_train.loc[label_rows, feature].value_counts().to_dict()
        likelihoods[feature].update(
            {feat_val + '_' + label: count / label_count for feat_val, count in feat_likelihood.items()}
        )

        print("feature : {}\n\tfeatures_label: {}\n\tlabel: {}\n\tlabel_count: {}\n\tlikelihood_probabilities: {}".format(feature, feat_likelihood, label, label_count, likelihoods[feature]))


print(likelihoods)

feature : Outlook
	features_label: {'Rainy': 3, 'Sunny': 1}
	label: no
	label_count: 4
	likelihood_probabilities: {'Overcast_no': 0, 'Overcast_yes': 0, 'Rainy_no': 0.75, 'Rainy_yes': 0, 'Sunny_no': 0.25, 'Sunny_yes': 0}
feature : Outlook
	features_label: {'Overcast': 2, 'Sunny': 2, 'Rainy': 1}
	label: yes
	label_count: 5
	likelihood_probabilities: {'Overcast_no': 0, 'Overcast_yes': 0.4, 'Rainy_no': 0.75, 'Rainy_yes': 0.2, 'Sunny_no': 0.25, 'Sunny_yes': 0.4}
feature : Temp
	features_label: {'Hot': 2, 'Cool': 1, 'Mild': 1}
	label: no
	label_count: 4
	likelihood_probabilities: {'Cool_no': 0.25, 'Cool_yes': 0, 'Hot_no': 0.5, 'Hot_yes': 0, 'Mild_no': 0.25, 'Mild_yes': 0}
feature : Temp
	features_label: {'Cool': 3, 'Hot': 1, 'Mild': 1}
	label: yes
	label_count: 5
	likelihood_probabilities: {'Cool_no': 0.25, 'Cool_yes': 0.6, 'Hot_no': 0.5, 'Hot_yes': 0.2, 'Mild_no': 0.25, 'Mild_yes': 0.2}
feature : Humidity
	features_label: {'High': 3, 'Normal': 1}
	label: no
	label_count: 4
	likelihood_proba

In [8]:
# Marginal probability of each feature value: its relative frequency over
# all training rows, regardless of label.
for feature in features:
    feat_vals = X_train[feature].value_counts()
    pred_priors[feature].update(
        {feat_val: count / train_size for feat_val, count in feat_vals.items()}
    )
    print("feature: {}\n\ttrain_size: {}\n\tvalues: {}\n\tprobabilities: {}".format(feature, train_size, feat_vals.to_dict(), pred_priors))


feature: Outlook
	train_size: 9
	values: {'Rainy': 4, 'Sunny': 3, 'Overcast': 2}
	probabilities: {'Outlook': {'Overcast': 0.2222222222222222, 'Rainy': 0.4444444444444444, 'Sunny': 0.3333333333333333}, 'Temp': {'Cool': 0, 'Hot': 0, 'Mild': 0}, 'Humidity': {'High': 0, 'Normal': 0}, 'Windy': {'f': 0, 't': 0}}
feature: Temp
	train_size: 9
	values: {'Cool': 4, 'Hot': 3, 'Mild': 2}
	probabilities: {'Outlook': {'Overcast': 0.2222222222222222, 'Rainy': 0.4444444444444444, 'Sunny': 0.3333333333333333}, 'Temp': {'Cool': 0.4444444444444444, 'Hot': 0.3333333333333333, 'Mild': 0.2222222222222222}, 'Humidity': {'High': 0, 'Normal': 0}, 'Windy': {'f': 0, 't': 0}}
feature: Humidity
	train_size: 9
	values: {'High': 5, 'Normal': 4}
	probabilities: {'Outlook': {'Overcast': 0.2222222222222222, 'Rainy': 0.4444444444444444, 'Sunny': 0.3333333333333333}, 'Temp': {'Cool': 0.4444444444444444, 'Hot': 0.3333333333333333, 'Mild': 0.2222222222222222}, 'Humidity': {'High': 0.5555555555555556, 'Normal': 0.4444444444

# Calculate probabilities

bayes3.svg

In [9]:
# Score every test row against each label and keep the best-scoring label.
results = []
X = np.array(X_test)
labels = np.unique(y_train)

for index, query in enumerate(X):
    labels_prob = {}
    for label in labels:
        prior = label_probabilities[label]

        # Running products over the query's features, multiplied in feature order.
        likelihood, evidence = 1, 1
        for feat, feat_val in zip(features, query):
            likelihood = likelihood * likelihoods[feat][feat_val + '_' + label]
            evidence = evidence * pred_priors[feat][feat_val]

        # `evidence` is the product of per-feature marginals, identical for every
        # label of a query, so it rescales the scores without changing their order.
        posterior = likelihood * prior / evidence

        labels_prob[label] = posterior
        print("Query: {}\noutcome: {}\nlikelihood: {}\nevidence: {}\nresult: {}\n".format(query, label, likelihood, evidence, labels_prob))
    print("y_test: {}\n\n--------------------------------------------------------".format(y_test.iloc[index]))

    # Predicted label = the one with the highest score (first one wins on ties).
    result = max(labels_prob, key=labels_prob.get)
    results.append(result)

# Percentage of test rows whose predicted label equals the true label.
matches = sum(predicted == actual for predicted, actual in zip(results, y_test))
accuracy = matches / len(y_test) * 100
print("Test Accuracy: {}".format(accuracy))

Query: ['Sunny' 'Mild' 'Normal' 'f']
outcome: no
likelihood: 0.0078125
evidence: 0.021947873799725647
result: {'no': 0.15820312500000003}

Query: ['Sunny' 'Mild' 'Normal' 'f']
outcome: yes
likelihood: 0.03840000000000001
evidence: 0.021947873799725647
result: {'no': 0.15820312500000003, 'yes': 0.9720000000000005}

y_test: yes

--------------------------------------------------------
Query: ['Rainy' 'Mild' 'Normal' 't']
outcome: no
likelihood: 0.0234375
evidence: 0.014631915866483765
result: {'no': 0.7119140625000001}

Query: ['Rainy' 'Mild' 'Normal' 't']
outcome: yes
likelihood: 0.004800000000000001
evidence: 0.014631915866483765
result: {'no': 0.7119140625000001, 'yes': 0.18225000000000008}

y_test: yes

--------------------------------------------------------
Query: ['Overcast' 'Mild' 'High' 't']
outcome: no
likelihood: 0.0
evidence: 0.009144947416552354
result: {'no': 0.0}

Query: ['Overcast' 'Mild' 'High' 't']
outcome: yes
likelihood: 0.006400000000000002
evidence: 0.00914494741655