# Naive Bayes using math calculations

In [1]:
import pandas as pd  # pandas module for data manipulation
import matplotlib.pyplot as plt # module for plotting
import seaborn as sns # another module for plotting
import numpy as np
from sklearn.naive_bayes import GaussianNB #Import Library of Gaussian Naive Bayes model
from sklearn.metrics import confusion_matrix #import confusion_matrix
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler #fot standardidazation
from sklearn.model_selection import GridSearchCV #import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

plt.style.use('ggplot')

# to display the total number columns present in the dataset
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
# let's load the dataset
# data = pd.read_csv('liver_disease_1.csv')

data = pd.read_csv('Cancer_Dataset.csv')
# let's inspect the first 5 rows
data.head()

Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,0.7,0.1,187,16,18,6.8,3.3,0.9,Yes
1,62,10.9,5.5,699,64,100,7.5,3.2,0.74,Yes
2,62,7.3,4.1,490,60,68,7.0,3.3,0.89,Yes
3,58,1.0,0.4,182,14,20,6.8,3.4,1.0,Yes
4,72,3.9,2.0,195,27,59,7.3,2.4,0.4,Yes


In [3]:
data.shape

(583, 10)

In [4]:
data.dtypes

Age                             int64
Total_Bilirubin               float64
Direct_Bilirubin              float64
Alkaline_Phosphotase            int64
Alamine_Aminotransferase        int64
Aspartate_Aminotransferase      int64
Total_Protiens                float64
Albumin                       float64
Albumin_and_Globulin_Ratio    float64
Dataset                        object
dtype: object

In [5]:
# Can Suger be 0, blood pressure be 0 ? Similarly for Skin Thickness, Insulin,BMI and age.
# So, replacing those attributes which has O(zero) values to the mean of that column.

# for i, feature in enumerate(data.columns):
#     if feature not in ("Pregnancies","Cancer_Markers","Outcome"):
#         data[feature].replace(0,data[feature].median(axis=0),inplace=True)
        
# data

# For the better evaluation we are replacing categorical values to numerical values
# data["Dataset"] = data["Dataset"].replace("Yes",1)
# data["Dataset"] = data["Dataset"].replace("No",0)

In [6]:
labels = ['low', 'medium', 'high']
for j in data.columns[:-1]:
    mean = data[j].mean()
    data[j]=data[j].replace(0,mean) #Replacing 0 with mean
    data[j] = pd.cut(data[j],bins=len(labels), labels=labels)

In [7]:
def count(data, colname, label, target):
    condition = (data[colname] == label) & (data['Outcome'] == target)
    return len(data[condition])

In [8]:
probabilities = {0:{}, 1:{}}

In [9]:
train_percent = 75
train_len = int((train_percent*len(data))/100)
train_X = data.iloc[:train_len,:]
test_X = data.iloc[train_len+1:,:-1]
test_y = data.iloc[train_len+1:,-1]

count_0 = count(train_X,'Outcome', 0,0)
count_1 = count(train_X,'Outcome', 1,1)

prob_0 = count_0/len(train_X)
prob_1 = count_1/len(train_X)

### Calculate Likelihood probabilities

In [10]:
for col in train_X.columns[:-1]:
    probabilities[0][col] = {}
    probabilities[1][col] = {}    
    
    for category in labels:
        count_ct_0 = count(train_X, col, category, 0)
        count_ct_1 = count(train_X, col, category, 1)
        
        probabilities[0][col][category] = count_ct_0/count_0
        probabilities[1][col][category] = count_ct_1/count_1



In [11]:
predict = []
for row in range(0,len(test_X)):
    prod_0 = prob_1
    prod_1 = prob_1
    for feature in test_X.columns:
        prod_0 *= probabilities[0][feature][test_X[feature].iloc[row]]
        prod_1 *= probabilities[1][feature][test_X[feature].iloc[row]]
        
        
    #Predict the outcome
    if prod_0 > prod_1:
        predict.append(0)
    else:
        predict.append(1)


In [14]:
tp,tn,fp,fn = 0,0,0,0
for j in range(0,len(predict)):
    if predict[j] == 0:
        if test_y.iloc[j] == 0:
            tp+=1
        else:
            fp+=1
    else:
        if test_y.iloc[j] == 1:
            tn+=1
        else:
            fn+=1
            
print("tp,tn,fp,fn", tp,tn,fp,fn)

tp,tn,fp,fn 19 85 22 19


In [13]:
print('Accuracy for training length: '+str(train_percent)+'%  :',((tp+tn)/len(test_y))*100)

Accuracy for training length: 75%  : 71.72413793103448
