## Gaussian Naive Bayes

In [66]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB

In [67]:
data = pd.read_csv('C://MyFolder//iris.csv')
data.head(5)


Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,4.7,3.2,1.3,0.2,1
1,4.6,3.1,1.5,0.2,1
2,5.0,3.6,1.4,0.2,1
3,5.4,3.9,1.7,0.4,1
4,4.6,3.4,1.4,0.3,1


In [68]:
X = data[['sepallength', 'sepalwidth','petallength','petalwidth']]

In [69]:
y = data[['class']]

#### WAY#1: Not mentioning priors. Priors is calculated by simply counting the number of different labels in your training sample.

In [70]:
gnb = GaussianNB()
gnb.fit(X, y.values.ravel())
new_observation = [[ 4,  4,  4,  0.4]]
gnb.predict(new_observation)

array([2], dtype=int64)

To know the Priors:

In [71]:
print(gnb.class_prior_)

[0.32432432 0.33783784 0.33783784]


#### WAY#2: Mentioning priors. Priors is calculated by simply counting the number of different labels in your training sample.

In [72]:
gnb = GaussianNB(priors=[0.25, 0.25, 0.5])
gnb.fit(X, y.values.ravel())
new_observation = [[ 4,  4,  4,  0.4]]
gnb.predict(new_observation)

array([2], dtype=int64)

**P.S** Below in this notebook, we will see how to manually calculate the **priors**.

## To demistify the above computation, lets see how Gaussian Naive Bayes works with Numericals.

In [73]:
data = pd.DataFrame()

# Create our target variable
data['Gender'] = ['male','male','male','male','female','female','female','female']

# Create our feature variables
data['Height'] = [6,5.92,5.58,5.92,5,5.5,5.42,5.75]
data['Weight'] = [180,190,170,165,100,150,130,150]
data['Foot_Size'] = [12,11,12,10,6,8,7,9]

# View the data
data.head(8)

Unnamed: 0,Gender,Height,Weight,Foot_Size
0,male,6.0,180,12
1,male,5.92,190,11
2,male,5.58,170,12
3,male,5.92,165,10
4,female,5.0,100,6
5,female,5.5,150,8
6,female,5.42,130,7
7,female,5.75,150,9


In [74]:
# Create an empty dataframe for new_observation
new_observation = pd.DataFrame()

# Create some feature values for this single row
new_observation['Height'] = [6]
new_observation['Weight'] = [130]
new_observation['Foot_Size'] = [8]

# View the data 
new_observation.head()

Unnamed: 0,Height,Weight,Foot_Size
0,6,130,8


#### The formula for Gaussian Naive Bayes:

![image](https://user-images.githubusercontent.com/45539698/52260838-5530e880-294d-11e9-8221-a0eddc041051.png)


Understanding the meaning of each components with respect to the example:

![image](https://user-images.githubusercontent.com/45539698/52270470-1e67cc00-2967-11e9-9fe1-05108aca28b4.png)


In the above diagram:

- **GREEN** and **BLUE** are the **LIKELIHOOD**. Similarl computation needs to be done for females. 
- We can ignore the marginal probability (denominator).

**NOTE:** For IRIS, replace height, weight, foot size with sepallength, sepalwidth, petallength, petalwidth, and male, female with IRIS' classes.

In [75]:
#Calculate Priors

# Number of males
n_male = data['Gender'][data['Gender'] == 'male'].count()

# Number of males
n_female = data['Gender'][data['Gender'] == 'female'].count()

# Total rows
total_ppl = data['Gender'].count()


# Number of males divided by the total rows
P_male = n_male/total_ppl

# Number of females divided by the total rows
P_female = n_female/total_ppl

In [76]:
# Group the data by gender and calculate the means of each feature
data_means = data.groupby('Gender').mean()

# View the values
data_means


Unnamed: 0_level_0,Height,Weight,Foot_Size
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,5.4175,132.5,7.5
male,5.855,176.25,11.25


In [77]:

# Group the data by gender and calculate the variance of each feature
data_variance = data.groupby('Gender').var()

# View the values
data_variance

Unnamed: 0_level_0,Height,Weight,Foot_Size
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.097225,558.333333,1.666667
male,0.035033,122.916667,0.916667


In [78]:
# Means for male
male_height_mean = data_means['Height'][data_variance.index == 'male'].values[0]
male_weight_mean = data_means['Weight'][data_variance.index == 'male'].values[0]
male_footsize_mean = data_means['Foot_Size'][data_variance.index == 'male'].values[0]

# Variance for male
male_height_variance = data_variance['Height'][data_variance.index == 'male'].values[0]
male_weight_variance = data_variance['Weight'][data_variance.index == 'male'].values[0]
male_footsize_variance = data_variance['Foot_Size'][data_variance.index == 'male'].values[0]

# Means for female
female_height_mean = data_means['Height'][data_variance.index == 'female'].values[0]
female_weight_mean = data_means['Weight'][data_variance.index == 'female'].values[0]
female_footsize_mean = data_means['Foot_Size'][data_variance.index == 'female'].values[0]

# Variance for female
female_height_variance = data_variance['Height'][data_variance.index == 'female'].values[0]
female_weight_variance = data_variance['Weight'][data_variance.index == 'female'].values[0]
female_footsize_variance = data_variance['Foot_Size'][data_variance.index == 'female'].values[0]

In [79]:
# Create a function that calculates p(x | y):
def p_x_given_y(x, mean_y, variance_y):

    # Input the arguments into a probability density function
    p = 1/(np.sqrt(2*np.pi*variance_y)) * np.exp((-(x-mean_y)**2)/(2*variance_y))
    
    # return p
    return p

In [80]:
# Numerator of the posterior if the unclassified observation is a male
P_male * p_x_given_y(new_observation['Height'][0], male_height_mean, male_height_variance) * p_x_given_y(new_observation['Weight'][0], male_weight_mean, male_weight_variance) * p_x_given_y(new_observation['Foot_Size'][0], male_footsize_mean, male_footsize_variance)

6.197071843878078e-09

In [82]:
# Numerator of the posterior if the unclassified observation is a female
P_female * p_x_given_y(new_observation['Height'][0], female_height_mean, female_height_variance) * p_x_given_y(new_observation['Weight'][0], female_weight_mean, female_weight_variance) * p_x_given_y(new_observation['Foot_Size'][0], female_footsize_mean, female_footsize_variance)

0.0005377909183630018

Calculating Percentages

In [85]:
(6.1970/(6.1970 + 0.000537))*100

99.99133526754258

In [86]:
(0.000537/(6.1970 + 0.000537))*100

0.008664732457426234