<a href="https://colab.research.google.com/github/ridwanmahendra/Gaussian-Naive-Bayes/blob/main/Gaussian_NB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
class NB:
    """Naive Bayes classifier for a pandas DataFrame with mixed feature types.

    Columns whose first value is a ``str`` (or an ``int`` in a column that
    contains repeated values) are treated as discrete and modelled with
    Laplace-smoothed frequency tables. Every other column is treated as
    continuous and modelled with a per-class Gaussian.

    Attributes:
        df: the training DataFrame (features plus target column).
        c_n: name of the target (class) column.
        cols: list of feature column names (all columns except ``c_n``).
        rv: maps each feature name to ``'discrete'`` or ``'continuous'``.
        store: nested dict ``store[feature][value][class] -> likelihood`` for
            every discrete feature/value/class combination seen in training.
    """

    def __init__(self, target, dataframe):
        """Fit the model.

        Args:
            target: name of the class column in ``dataframe``.
            dataframe: training data; must contain ``target``.

        Raises:
            ValueError: if ``target`` is not one of the DataFrame's columns
                (raised by ``list.remove``).
        """
        self.df = dataframe
        self.c_n = target
        self.cols = list(self.df.columns)
        self.cols.remove(self.c_n)
        self.rv = {}
        self.determine_rv_for_all()
        # (feature, class) -> (mean, std); filled lazily so the per-class
        # Gaussian is estimated once instead of once per predicted row.
        self._gauss_params = {}
        self.store = {}
        self.discrete_likelihood_for_all()

    def discrete_likelihood_cal(self, x, y, z):
        """Return the Laplace-smoothed estimate of P(x = y | class = z).

        Args:
            x: feature column name.
            y: feature value.
            z: class value.

        Raises:
            KeyError: if ``x`` is not a training feature.
        """
        df = self.df

        if x not in self.cols:
            raise KeyError("Fitur (kolom) tidak ada di Dataset Pelatihan")
        in_class = df[self.c_n] == z
        matches = int(((df[x] == y) & in_class).sum())
        class_size = int(in_class.sum())
        distinct_values = len(df[x].unique())
        # Add-one smoothing: the estimate is never zero, even for unseen values.
        return (1 + matches) / (class_size + distinct_values)

    def discrete_likelihood_for_all(self):
        """Precompute ``self.store`` for every discrete feature, value and class."""
        df = self.df
        classes = df[self.c_n].unique()
        discrete_cols = [x for x in self.cols if self.rv[x] == 'discrete']
        self.store = {
            x: {
                y: {z: self.discrete_likelihood_cal(x, y, z) for z in classes}
                for y in df[x].unique()
            }
            for x in discrete_cols
        }

    def determine_rv(self, x):
        """Classify column ``x`` as ``'discrete'`` or ``'continuous'``.

        The decision is based on the type of the column's first value: strings
        are discrete; plain integers are discrete only when the column holds
        repeated values; everything else is continuous.
        """
        column = self.df[x]
        # Iterating a Series yields native Python scalars for numeric dtypes,
        # which is what the int check below relies on.
        val = list(column)[0]
        if isinstance(val, str):
            return 'discrete'
        is_plain_int = isinstance(val, int) and not isinstance(val, bool)
        if is_plain_int and len(column.unique()) < len(column):
            return 'discrete'
        return 'continuous'

    def determine_rv_for_all(self):
        """Populate ``self.rv`` for every feature column."""
        self.rv = {x: self.determine_rv(x) for x in self.cols}

    @staticmethod
    def _fit_gaussian(sample):
        """Return ``(mean, std)`` of ``sample`` with a non-zero std."""
        mu = np.mean(sample)
        sigma = np.std(sample)
        if sigma == 0:
            # A constant feature would otherwise divide by zero below.
            sigma = np.finfo(float).eps
        return mu, sigma

    @staticmethod
    def _gaussian_density(x, mu, sigma):
        """Evaluate the N(mu, sigma^2) density at ``x``."""
        # The normalising constant is sigma * sqrt(2*pi), i.e. sqrt(2*pi*sigma^2).
        return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))

    def _gaussian_params(self, column_name, class_val):
        """Return the cached ``(mean, std)`` of a feature within one class."""
        key = (column_name, class_val)
        if key not in self._gauss_params:
            df = self.df
            sample = df[df[self.c_n] == class_val][column_name]
            self._gauss_params[key] = self._fit_gaussian(sample)
        return self._gauss_params[key]

    def normal_pdf(self, sample, x=None):
        """Evaluate the Gaussian fitted to ``sample`` at ``x``.

        Args:
            sample: values used to estimate the mean and standard deviation.
            x: point(s) at which to evaluate the density; defaults to the
                sample itself.

        Returns:
            The density value(s), with the same shape as ``x``.
        """
        mu, sigma = self._fit_gaussian(sample)
        if x is None:
            if isinstance(sample, (pd.Series, np.ndarray)):
                x = sample
            else:
                # Plain sequences do not support arithmetic with scalars.
                x = np.asarray(sample, dtype=float)
        return self._gaussian_density(x, mu, sigma)

    def continuous_likelihood_cal(self, column_name, column_val, class_val):
        """Return the Gaussian likelihood p(column_name = column_val | class_val)."""
        mu, sigma = self._gaussian_params(column_name, class_val)
        return self._gaussian_density(column_val, mu, sigma)

    def _discrete_likelihood(self, k, v, class_val):
        """Look up a discrete likelihood, computing it when not precomputed."""
        try:
            return self.store[k][v][class_val]
        except (KeyError, TypeError):
            # Value or class not seen during training (or not hashable):
            # fall back to the smoothed estimate computed on demand.
            return self.discrete_likelihood_cal(k, v, class_val)

    def likelihood_expr(self, class_val, expr):
        """Return the product of per-feature likelihoods for one sample.

        Args:
            class_val: class value to condition on.
            expr: iterable of ``(feature_name, feature_value)`` pairs.

        Raises:
            KeyError: if a feature name is not a training feature.
        """
        val = 1
        for k, v in expr:
            if k not in self.cols:
                raise KeyError("Fitur (kolom) tidak ada di Dataset Pelatihan")
            if self.rv[k] == 'discrete':
                val *= self._discrete_likelihood(k, v, class_val)
            else:
                val *= self.continuous_likelihood_cal(k, v, class_val)
        return val

    def prior(self, class_val):
        """Return the fraction of training rows whose class is ``class_val``."""
        df = self.df
        return len(df[df[self.c_n] == class_val]) / df.shape[0]

    def _log_space_posteriors(self, classes, priors, expr):
        """Compute posteriors from log-scores; used when the product underflows.

        Returns a list of ``(probability, class)`` tuples in ``classes`` order.
        """
        log_scores = []
        for class_val in classes:
            score = np.log(priors[class_val])
            for k, v in expr:
                if self.rv[k] == 'discrete':
                    score += np.log(self._discrete_likelihood(k, v, class_val))
                else:
                    mu, sigma = self._gaussian_params(k, class_val)
                    # Analytic log-density: avoids exp() underflowing to zero.
                    score += (-0.5 * ((v - mu) / sigma) ** 2
                              - np.log(sigma) - 0.5 * np.log(2 * np.pi))
            log_scores.append(score)
        log_scores = np.array(log_scores, dtype=float)
        # Subtract the maximum before exponentiating (log-sum-exp trick).
        weights = np.exp(log_scores - np.max(log_scores))
        posteriors = weights / np.sum(weights)
        return list(zip(posteriors, classes))

    def predict(self, X, verbose=True):
        """Predict the class of one sample (Series) or many (DataFrame).

        Args:
            X: a ``pandas.Series`` (one sample) or ``pandas.DataFrame`` (one
                sample per row) containing only feature columns.
            verbose: when true (the default), print the intermediate scores
                for every sample.

        Returns:
            A list with one predicted class value per sample.

        Raises:
            TypeError: if ``X`` is neither a Series nor a DataFrame.
            KeyError: if ``X`` contains a column that is not a training feature.
        """
        if isinstance(X, pd.Series):
            values_list = [list(X.items())]
        elif isinstance(X, pd.DataFrame):
            values_list = [list(row.items()) for _, row in X.iterrows()]
        else:
            raise TypeError('{} is not supported type'.format(type(X)))

        classes = self.df[self.c_n].unique()
        priors = {class_val: self.prior(class_val) for class_val in classes}

        predictions_list = []
        for values in values_list:
            likelihood_priors = {
                class_val: priors[class_val] * self.likelihood_expr(class_val, values)
                for class_val in classes
            }
            if verbose:
                print("likelihood priors : ", likelihood_priors)

            normalizing_prob = np.sum(list(likelihood_priors.values()))
            if np.isfinite(normalizing_prob) and normalizing_prob > 0:
                probabilities = [(y / normalizing_prob, x)
                                 for x, y in likelihood_priors.items()]
            else:
                # Every joint score underflowed (or overflowed); normalising
                # would give NaN, so redo the comparison in log space.
                probabilities = self._log_space_posteriors(classes, priors, values)
            if verbose:
                print("Probability : ", probabilities)

            if verbose and len(probabilities) != 2:
                # Softmax is monotonic, so it never changes which class wins;
                # it is only reported for inspection.
                exp_1 = np.array([np.exp(p) for p, _ in probabilities])
                softmax = exp_1 / np.sum(exp_1)
                print(softmax)
                print(list(zip(softmax, [name for _, name in probabilities])))

            # Tuples compare by probability first, then by class value.
            predictions_list.append(max(probabilities)[1])
        return predictions_list

    def accuracy_score(self, X, Y):
        """Return the percentage of positions where ``X`` and ``Y`` agree.

        Raises:
            AssertionError: if the two sequences differ in length.
        """
        if len(X) != len(Y):
            raise AssertionError('Nilai yang diberikan tidak sama dalam ukuran')
        total_matching_values = [x == y for x, y in zip(X, Y)]
        return (np.sum(total_matching_values) / len(total_matching_values)) * 100

    def calculate_confusion_matrix(self, X, Y):
        """Return the confusion matrix of label sequences ``X`` and ``Y``.

        Rows are indexed by the labels in ``X`` and columns by the labels in
        ``Y``. Class order follows ``df[target].unique()``.

        Raises:
            KeyError: if a label does not occur in the training target column.
        """
        unique_class_values = self.df[self.c_n].unique()
        numerical = {label: idx for idx, label in enumerate(unique_class_values)}

        n = len(unique_class_values)
        confusion_matrix = np.zeros((n, n))
        for x_label, y_label in zip(X, Y):
            confusion_matrix[numerical[x_label]][numerical[y_label]] += 1
        return confusion_matrix

    def precision_score(self, X, Y):
        """Return precision for the first class in ``df[target].unique()``.

        ``X`` is treated as the reference labels and ``Y`` as the predictions:
        the result is the share of ``Y == first class`` positions where ``X``
        is also the first class. The result is NaN when the first class never
        appears in ``Y``.

        Raises:
            AssertionError: if the two sequences differ in length.
        """
        if len(X) != len(Y):
            raise AssertionError('Nilai yang diberikan tidak sama dalam ukuran')
        confusion_matrix = self.calculate_confusion_matrix(X, Y)
        tp = confusion_matrix[0][0]
        # Everything else in column 0 was predicted as the first class but
        # belongs to another one (also correct for more than two classes).
        fp = confusion_matrix[:, 0].sum() - tp
        return tp / (tp + fp)

    def recall_score(self, X, Y):
        """Return recall for the first class in ``df[target].unique()``.

        ``X`` is treated as the reference labels and ``Y`` as the predictions:
        the result is the share of ``X == first class`` positions where ``Y``
        is also the first class. The result is NaN when the first class never
        appears in ``X``.

        Raises:
            AssertionError: if the two sequences differ in length.
        """
        if len(X) != len(Y):
            raise AssertionError('Nilai yang diberikan tidak sama dalam ukuran')
        confusion_matrix = self.calculate_confusion_matrix(X, Y)
        tp = confusion_matrix[0][0]
        # Everything else in row 0 belongs to the first class but was
        # predicted as another one (also correct for more than two classes).
        fn = confusion_matrix[0, :].sum() - tp
        return tp / (tp + fn)

In [None]:
# Load the tumor dataset from 'tumor.csv' (path is relative to the working directory).
data = pd.read_csv('tumor.csv')

In [None]:
# Preview the first rows of the loaded DataFrame.
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [None]:
# Collect the DataFrame's index labels and shuffle them in place; the shuffled
# order drives the train/test split. No random seed is set in this cell, so the
# resulting split is not reproducible between runs.
ind = list(data.index)
np.random.shuffle(ind)

In [None]:
# Split the shuffled indices 75 % / 25 % into training and testing rows.
train_len = int(data.shape[0] * 0.75)
train_ind, test_ind = ind[:train_len], ind[train_len:]
training_data = data.iloc[train_ind, :]
testing_data = data.iloc[test_ind, :]

print('Training_data size -> {}'.format(training_data.shape))
print('Testing_data size -> {}'.format(testing_data.shape))

# Sanity check: the two index lists together account for every row.
assert len(train_ind) + len(test_ind) == data.shape[0], 'Not equal distribution'

Training_data size -> (426, 33)
Testing_data size -> (143, 33)


In [None]:
# Remove the first and last columns of the raw data (per the preview: the 'id'
# column and the empty 'Unnamed: 32' column) from both splits.
# The splits were derived from `data` by indexing, so dropping in place
# triggers pandas' SettingWithCopyWarning; rebinding to the frame returned by
# drop() gives the same result without mutating a possible view.
unused_columns = [data.columns[0], data.columns[-1]]
training_data = training_data.drop(columns=unused_columns)
testing_data = testing_data.drop(columns=unused_columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# Preview the training split after the column drop.
training_data.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
160,B,11.75,20.18,76.1,419.8,0.1089,0.1141,0.06843,0.03738,0.1993,...,13.32,26.21,88.91,543.9,0.1358,0.1892,0.1956,0.07909,0.3168,0.07987
225,B,14.34,13.47,92.51,641.2,0.09906,0.07624,0.05724,0.04603,0.2075,...,16.77,16.9,110.4,873.2,0.1297,0.1525,0.1632,0.1087,0.3062,0.06072
547,B,10.26,16.58,65.85,320.8,0.08877,0.08066,0.04358,0.02438,0.1669,...,10.83,22.04,71.08,357.4,0.1461,0.2246,0.1783,0.08333,0.2691,0.09479
495,B,14.87,20.21,96.12,680.9,0.09587,0.08345,0.06824,0.04951,0.1487,...,16.01,28.48,103.9,783.6,0.1216,0.1388,0.17,0.1017,0.2369,0.06599
528,B,13.94,13.17,90.31,594.2,0.1248,0.09755,0.101,0.06615,0.1976,...,14.62,15.38,94.52,653.3,0.1394,0.1364,0.1559,0.1015,0.216,0.07253


In [None]:
# Preview the testing split after the column drop.
testing_data.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
387,B,13.88,16.16,88.37,596.6,0.07026,0.04831,0.02045,0.008507,0.1607,...,15.51,19.97,99.66,745.3,0.08484,0.1233,0.1091,0.04537,0.2542,0.06623
549,B,10.82,24.21,68.89,361.6,0.08192,0.06602,0.01548,0.00816,0.1976,...,13.03,31.45,83.9,505.6,0.1204,0.1633,0.06194,0.03264,0.3059,0.07626
386,B,12.21,14.09,78.78,462.0,0.08108,0.07823,0.06839,0.02534,0.1646,...,13.13,19.29,87.65,529.9,0.1026,0.2431,0.3076,0.0914,0.2677,0.08824
288,B,11.26,19.96,73.72,394.1,0.0802,0.1181,0.09274,0.05588,0.2595,...,11.86,22.33,78.27,437.6,0.1028,0.1843,0.1546,0.09314,0.2955,0.07009
49,B,13.49,22.3,86.91,561.0,0.08752,0.07698,0.04751,0.03384,0.1809,...,15.15,31.82,99.0,698.8,0.1162,0.1711,0.2282,0.1282,0.2871,0.06917


In [None]:
# Fit the Naive Bayes model on the training split, with 'diagnosis' as the class column.
genx = NB(target='diagnosis',dataframe=training_data)

In [None]:
# The first column of the testing split holds the labels (it is 'diagnosis' in
# the preview above); the remaining columns are the features passed to predict().
y_test = list(testing_data.iloc[:,0])
y_pred = genx.predict(testing_data.iloc[:,1:])

likelihood priors :  {'B': 3.4153316656236993e-06, 'M': 1.6992277672143714e-19}
Probability :  [(0.9999999999999503, 'B'), (4.97529356904779e-14, 'M')]
likelihood priors :  {'B': 5.289575873672117e-07, 'M': 2.2071470298147007e-18}
Probability :  [(0.9999999999958273, 'B'), (4.172635164931004e-12, 'M')]
likelihood priors :  {'B': 0.004647536581340481, 'M': 1.3889689734534075e-14}
Probability :  [(0.9999999999970113, 'B'), (2.9886133204972823e-12, 'M')]
likelihood priors :  {'B': 2.8716552999948023e-19, 'M': 1.3222608963203513e-23}
Probability :  [(0.9999539568688731, 'B'), (4.6043131126876934e-05, 'M')]
likelihood priors :  {'B': 0.00022960024862847712, 'M': 1.775049348576494e-11}
Probability :  [(0.999999922689578, 'B'), (7.731042200302399e-08, 'M')]
likelihood priors :  {'B': 0.0060800976809967985, 'M': 2.4248744217369927e-15}
Probability :  [(0.9999999999996011, 'B'), (3.988216224411843e-13, 'M')]
likelihood priors :  {'B': 0.010465745688964764, 'M': 1.0863546404198464e-14}
Probabili

In [None]:
# Report each metric rounded to three decimals, computing and printing one at a time.
for template, metric in (
    ('Accuracy Score -> {} %', genx.accuracy_score),
    ('Precison Score -> {}', genx.precision_score),
    ('Recall Score -> {}', genx.recall_score),
):
    print(template.format(round(metric(y_test, y_pred), 3)))

Accuracy Score -> 93.007 %
Precison Score -> 0.953
Recall Score -> 0.931
