In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.stats as s

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw CSV from the Kaggle input directory into a DataFrame.
data=pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')

In [None]:
data

In [None]:
# Remove two columns by position: the first one and the one at index 32
# (presumably a row identifier and an empty trailing column -- verify against data.csv).
#
# The original in-place drop could not be re-run: after one execution the frame no
# longer has a column at position 32, so a second run raised IndexError. Guarding on
# the column count and rebinding `data` makes this cell safe to execute repeatedly.
if data.shape[1] > 32:
    data = data.drop(columns=data.columns[[0, 32]])

In [None]:
data

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe(include='all').T

# PCA (Eigenvalue Decomposition)

In [None]:
# Diagnosis labels as a column vector of shape (n_samples, 1), set aside before the
# feature columns are transformed.
labels = np.array(data['diagnosis']).reshape(-1, 1)

In [None]:
labels

In [None]:
# Feature matrix: every column after the first one, converted to a NumPy array.
# NOTE(review): assumes 'diagnosis' is column 0 at this point, so the slice keeps
# only feature columns -- confirm against the frame shown above.
x=np.array(data.iloc[:,1:])

In [None]:
x.shape

In [None]:
# Per-feature mean (one value per column of x).
mu = x.mean(axis=0)

In [None]:
# Turn the per-feature means into a single row vector of shape (1, n_features) so it
# broadcasts against every row of x.
#
# reshape(1, -1) is idempotent. The previous reshape(-1, mu.shape[0]) depended on mu's
# current shape: on a second run mu.shape[0] is already 1, which flipped mu into a
# column vector and broke the subtraction x - mu.
mu = mu.reshape(1, -1)

In [None]:
mu.shape

In [None]:
# Centre the data: subtract the feature means from every row (mu broadcasts over rows).
x_dash = np.subtract(x, mu)

In [None]:
x_dash.shape

In [None]:
# Covariance estimate of the centred features: X^T X scaled by 1/n (divides by the
# number of samples, not n - 1). x_dash has one row per row of `data`.
sigma_hat = (1 / x_dash.shape[0]) * (x_dash.T @ x_dash)

In [None]:
sigma_hat.shape

In [None]:
# Factorise the covariance matrix with SVD; the cells below read elements [0] and [1]
# of the result.
# NOTE(review): sigma_hat is built as a scaled X^T X, so the SVD presumably stands in
# for the eigendecomposition named in the section title. np.linalg.eigh would be the
# direct routine, but its ordering/sign conventions differ -- confirm before swapping.
sigma_hat_decompose=np.linalg.svd(sigma_hat)

In [None]:
len(sigma_hat_decompose)

In [None]:
# First element of the SVD result (the left singular vectors, per the NumPy docs);
# its leading columns are used below as the projection basis.
Q=sigma_hat_decompose[0]

In [None]:
# Second element of the SVD result: the singular values (per the NumPy docs).
lmda=sigma_hat_decompose[1]

In [None]:
lmda

In [None]:
# Keep only the first 15 columns of Q as the reduced projection basis.
Q_tilda = Q[:, :15]

In [None]:
Q_tilda.shape

In [None]:
# Project the centred samples onto the retained basis vectors.
x_new = x_dash @ Q_tilda

In [None]:
x_new.shape

In [None]:
# Wrap the projected samples in a DataFrame (columns get the default integer labels).
new_data=pd.DataFrame(data=x_new)

In [None]:
# Re-attach the diagnosis labels as an extra column after the projected features.
# `labels` is the single-column 2-D array built earlier from data['diagnosis'].
new_data['diagnosis']=labels

In [None]:
new_data

In [None]:
new_data[new_data['diagnosis']=='B'].shape

In [None]:
new_data[new_data['diagnosis']=='M'].shape

# Obtain Training data

In [None]:
# Target size of the training set: 70% of all rows, rounded down.
training_data_len = int(0.7 * len(new_data))

In [None]:
training_data_len

In [None]:
# Take up to training_data_len // 2 rows of each diagnosis, in their existing order,
# so both classes contribute the same number of rows when enough are available.
benign_training_data = new_data[new_data['diagnosis'].eq('B')].head(training_data_len // 2)
malingnant_training_data = new_data[new_data['diagnosis'].eq('M')].head(training_data_len // 2)

In [None]:
benign_training_data.shape

In [None]:
malingnant_training_data.shape

In [None]:
# Stack the two per-class subsets: benign rows first, then malignant rows.
training_data=pd.concat([benign_training_data,malingnant_training_data])

In [None]:
training_data

# Obtain CV data

In [None]:
# Target size of the cross-validation set: 20% of all rows, rounded down.
cv_data_len = int(0.2 * len(new_data))

In [None]:
cv_data_len

In [None]:
# Everything that was not taken for training: per class, the rows after the first
# training_data_len // 2.
benign_remaining_data = new_data[new_data['diagnosis'].eq('B')].iloc[training_data_len // 2:]
malingnant_remaining_data = new_data[new_data['diagnosis'].eq('M')].iloc[training_data_len // 2:]

In [None]:
benign_remaining_data.shape

In [None]:
malingnant_remaining_data.shape

In [None]:
# Stack the leftover rows: benign first, then malignant. Rows stay grouped by class
# (nothing is shuffled here).
remaining_data=pd.concat([benign_remaining_data,malingnant_remaining_data])

In [None]:
remaining_data.shape

In [None]:
# remaining_data is the benign rows followed by the malignant rows, so slicing it
# positionally would fill the CV set with whatever class comes first and push the
# other class entirely into the test set. Shuffle once with a fixed seed (so the
# split is reproducible), then cut the shuffled rows into CV and test parts.
remaining_shuffled = remaining_data.sample(frac=1, random_state=42)
cv_data = remaining_shuffled.iloc[:cv_data_len]
testing_data = remaining_shuffled.iloc[cv_data_len:]

In [None]:
testing_data.shape

# Gaussian Naive Bayes

**To evaluate the class-conditional likelihood** P(radiusmean = x | Diagnosis = M) P(texturemean = y | Diagnosis = M), we use a multivariate (joint) Gaussian; under the naive Bayes independence assumption it factorises into a product of univariate Gaussians:
\begin{equation}
P(radiusmean = x | Diagnosis = M)P(texturemean = y | Diagnosis = M) = \left(\frac{1}{\sqrt{2\pi}\hat{\sigma_\text{rM}}}e^{-\frac{(x-\hat{\mu_\text{rM}})^2}{2\hat{\sigma_\text{rM}^2}}}\right)\left(\frac{1}{\sqrt{2\pi}\hat{\sigma_\text{tM}}}e^{-\frac{(y-\hat{\mu_\text{tM}})^2}{2\hat{\sigma_\text{tM}^2}}}\right)
\end{equation}

In [None]:
# Mean vector and covariance matrix of the first 15 columns, estimated from the
# malignant training rows only.
# NOTE(review): .cov() yields a full covariance matrix, whereas the markdown above
# describes a product of independent univariate Gaussians (which would correspond to a
# diagonal covariance) -- confirm which model is intended.
mu_hat_m = np.array(
    training_data.loc[training_data['diagnosis'].eq('M')].iloc[:, :15].mean()
)
sigma_hat_m = np.array(
    training_data.loc[training_data['diagnosis'].eq('M')].iloc[:, :15].cov()
)

**Prior Probability of Malignant**
\begin{equation}
P(Diagnosis = M | radiusmean = x, texturemean = y) \propto
P(radiusmean = x | Diagnosis = M) P(texturemean = y | Diagnosis = M) P(Diagnosis = M)
\end{equation}

In [None]:
# Prior P(Diagnosis = M): share of malignant rows in the training set.
malingnant_prior = int(training_data['diagnosis'].eq('M').sum()) / len(training_data)

 \begin{equation}
P(radiusmean = x | Diagnosis = B)P(texturemean = y | Diagnosis = B) = \left(\frac{1}{\sqrt{2\pi}\hat{\sigma_\text{rB}}}e^{-\frac{(x-\hat{\mu_\text{rB}})^2}{2\hat{\sigma_\text{rB}^2}}}\right)\left(\frac{1}{\sqrt{2\pi}\hat{\sigma_\text{tB}}}e^{-\frac{(y-\hat{\mu_\text{tB}})^2}{2\hat{\sigma_\text{tB}^2}}}\right)
\end{equation}

In [None]:
# Mean vector and covariance matrix of the first 15 columns, estimated from the
# benign training rows only.
# NOTE(review): .cov() yields a full covariance matrix, whereas the markdown above
# describes a product of independent univariate Gaussians (which would correspond to a
# diagonal covariance) -- confirm which model is intended.
mu_hat_b = np.array(
    training_data.loc[training_data['diagnosis'].eq('B')].iloc[:, :15].mean()
)
sigma_hat_b = np.array(
    training_data.loc[training_data['diagnosis'].eq('B')].iloc[:, :15].cov()
)

**Prior Probability of Benign**
\begin{equation}
P(Diagnosis = B | radiusmean = x, texturemean = y) \propto P(radiusmean = x | Diagnosis = B) P(texturemean = y | Diagnosis = B) P(Diagnosis = B)
\end{equation}

In [None]:
# Prior P(Diagnosis = B): share of benign rows in the training set.
benign_prior = int(training_data['diagnosis'].eq('B').sum()) / len(training_data)

# CV data evaluate

In [None]:
def mock_test(data):
    """Predict 'B' (benign) or 'M' (malignant) for every row of ``data``.

    The class statistics are read from the notebook-level variables
    ``mu_hat_m`` / ``sigma_hat_m`` / ``malingnant_prior`` and
    ``mu_hat_b`` / ``sigma_hat_b`` / ``benign_prior``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose leading columns are the features, in the same order used to
        estimate the class means; any further columns (e.g. 'diagnosis') are ignored.

    Returns
    -------
    numpy.ndarray
        1-D object array with one entry per row: 'M' where the malignant posterior
        is strictly larger than the benign one, otherwise 'B'.
    """
    # Use as many leading columns as the fitted mean vector has entries, instead of
    # repeating the number of retained components as a literal.
    n_features = np.shape(mu_hat_m)[0]
    inputs = np.array(data.iloc[:, :n_features])

    # Compare posteriors in log space. The raw densities underflow to exactly 0.0 for
    # rows far from both class means, which made the comparison 0 > 0 and silently
    # labelled such rows 'B' regardless of which class was closer.
    log_posterior_m = (
        s.multivariate_normal.logpdf(inputs, mu_hat_m, sigma_hat_m)
        + np.log(malingnant_prior)
    )
    log_posterior_b = (
        s.multivariate_normal.logpdf(inputs, mu_hat_b, sigma_hat_b)
        + np.log(benign_prior)
    )

    # logpdf collapses a single-row input to a scalar; keep the result 1-D.
    is_malignant = np.atleast_1d(log_posterior_m > log_posterior_b)
    return np.where(is_malignant, 'M', 'B').astype(object)

In [None]:
# Predict a label for every row of the cross-validation set.
cv_results=mock_test(cv_data)

In [None]:
# Ground-truth labels of the cross-validation rows, in the same row order as cv_data.
actual_results=np.array(cv_data['diagnosis'])

In [None]:
actual_results

In [None]:
# Element-wise agreement between predictions and ground truth (True = correct prediction).
boolean_mask=cv_results==actual_results

In [None]:
boolean_mask

In [None]:
np.count_nonzero(boolean_mask)

# CV Accuracy

In [None]:
# Fraction of cross-validation rows that were predicted correctly.
cv_accuracy = np.count_nonzero(boolean_mask) / len(boolean_mask)

In [None]:
cv_accuracy

# Test data evaluate

In [None]:
# Predict a label for every row of the held-out test set.
testing_results=mock_test(testing_data)

In [None]:
testing_results.shape

In [None]:
testing_results

In [None]:
# Ground-truth labels of the test rows, in the same row order as testing_data.
# NOTE: this rebinds actual_results, which previously held the CV labels; re-running
# the CV comparison cell after this one would compare CV predictions with test labels.
actual_results=np.array(testing_data['diagnosis'])

In [None]:
actual_results

# Test Accuracy

In [None]:
# Fraction of test rows that were predicted correctly.
testing_accuracy = np.count_nonzero(testing_results == actual_results) / len(actual_results)

In [None]:
testing_accuracy

# Finish