# Gaussian Naive Bayes For Single Random Variable

In [1]:
import numpy as np

import pandas as pd

import scipy.stats as s

import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
data = pd.read_csv("data.csv")

data.drop([data.columns[0], data.columns[32]], axis =1, inplace = True)

data = data.iloc[:,0:2]

data.head()

Unnamed: 0,diagnosis,radius_mean
0,M,17.99
1,M,20.57
2,M,19.69
3,M,11.42
4,M,20.29


In [3]:
training_data_len = int(0.7*data.shape[0])

benign_training_data = data[data['diagnosis'] == 'B'].iloc[0:training_data_len//2]

malignant_training_data = data[data['diagnosis'] == 'M'].iloc[0:training_data_len//2]

training_data = pd.concat([benign_training_data, malignant_training_data])

In [4]:
cv_data_len = int(0.2 * data.shape[0])

begning_remaining_data = data[data['diagnosis'] == 'B'].iloc[training_data_len//2:]

malignant_remaining_data = data[data['diagnosis'] == 'M'].iloc[training_data_len//2:]

remaining_data = pd.concat([begning_remaining_data,malignant_remaining_data])

cv_data = remaining_data.iloc[0:cv_data_len]

testing_data = remaining_data.iloc[cv_data_len:]

So now we need to evaluate the following Probability:

\begin{equation}
P(Diagnosis = M | radius mean = x) = P(radius mean = x | Diagnosis = M)\cdot P(Diagnosis = M)
\end{equation}

Now in order to evaluate the likelihood probability **P(radiusmean = x | Diagnosis = M)** which is given by :

\begin{equation}
P(radiusmean = x | Diagnosis = M)= \left(\frac{1}{\sqrt{2\pi}\hat{\sigma_\text{rM}}}e^{-\frac{(x-\mu_\text{rM})^2}{2\sigma_\text{rM}^2}}\right)
\end{equation}

 Now, for this PDF, we need to find out **the best estimate of the parameters of Normal Distribution** because we are assuming that our malignant tumor training data is being sampled from a Normal (gaussian) Distribution. The two parameters will be namely: *mu_hat_rM & sigma_hat_rM*

In [5]:
mu_hat_rm = training_data[training_data['diagnosis'] == 'M']['radius_mean'].mean()

sigma_hat_rm = training_data[training_data['diagnosis'] == 'M']['radius_mean'].std()

malignant_prior = training_data[training_data['diagnosis'] == 'M'].shape[0]/training_data.shape[0]

So now we need to evaluate the following Probability:

\begin{equation}
P(Diagnosis = B | radius mean = x) = P(radius mean = x | Diagnosis = B)\cdot P(Diagnosis = M)
\end{equation}

Now in order to evaluate the likelihood probability **P(radiusmean = x | Diagnosis = B)** which is given by:



\begin{equation}
P(radiusmean = x | Diagnosis = B)= \left(\frac{1}{\sqrt{2\pi}\hat{\sigma_\text{rB}}}e^{-\frac{(x-\mu_\text{rB})^2}{2\sigma_\text{rB}^2}}\right)
\end{equation}


Now, for this PDF, we need to find out **the best estimate of the parameters of Normal Distribution** because we are assuming that our malignant tumor training data is being sampled from a Normal (gaussian) Distribution. The two parameters will be namely: *mu_hat_rB & sigma_hat_rB*

In [6]:
mu_hat_rb = training_data[training_data['diagnosis'] == 'B']['radius_mean'].mean()

sigma_hat_rb = training_data[training_data['diagnosis'] == 'B']['radius_mean'].std()

benign_prior = training_data[training_data['diagnosis'] == 'B'].shape[0]/training_data.shape[0]

In [7]:
def cv_data_testing(data):
    
    inputs = np.array(data['radius_mean'])
    
    posterior_m = s.norm.pdf(inputs, mu_hat_rm, sigma_hat_rm) * malignant_prior
    
    posterior_b = s.norm.pdf(inputs, mu_hat_rb, sigma_hat_rb) * benign_prior
    
    boolean_mask = posterior_m > posterior_b
    
    predicted_category = pd.Series(boolean_mask)
    
    predicted_category.replace(to_replace=[False, True], value = ['B', 'M'], inplace=True)
    
    return np.array(predicted_category)

In [8]:
cv_results = cv_data_testing(cv_data)

cv_results

array(['B', 'B', 'B', 'M', 'B', 'B', 'M', 'B', 'M', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B',
       'M', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M',
       'M', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'M', 'B', 'M', 'B', 'B', 'M', 'B', 'B', 'M',
       'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'M'], dtype=object)

In [9]:
actual_results = np.array(cv_data['diagnosis'])

boolean_mask = cv_results == actual_results

boolean_mask

array([ True,  True,  True, False,  True,  True, False,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True, False,  True,
        True, False,  True,  True, False,  True,  True, False,  True,
        True,  True,  True,  True, False])

In [10]:
cv_accuracy = np.count_nonzero(boolean_mask)/boolean_mask.shape[0]

cv_accuracy

0.8495575221238938

In [11]:
testing_results = cv_data_testing(testing_data)

testing_results

array(['B', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'M', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'M', 'B', 'B', 'B', 'B', 'M', 'M', 'M', 'M', 'M', 'M', 'B',
       'M', 'M', 'M', 'M', 'M', 'M'], dtype=object)

In [12]:
actual_testing_results = np.array(testing_data['diagnosis'])

actual_testing_results

array(['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M'], dtype=object)

In [13]:
testing_accuracy = np.count_nonzero(testing_results == actual_testing_results)/actual_testing_results.shape[0]

testing_accuracy

0.896551724137931