In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
    #to split the dataset into training set an testing set
from sklearn import metrics
    #to calculate accuracy and precision of classification

In [2]:
df = pd.read_csv("car.csv")
df
    #reads data from file and creates a pandas dataframe

Unnamed: 0,buying,maint,doors,persons,lug-boot,safety,label
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [3]:
df.dtypes    #datatypes of the variables/features

buying      object
maint       object
doors       object
persons     object
lug-boot    object
safety      object
label       object
dtype: object

In [4]:
df = df.drop(df[df['label'] == 'good'].index, axis=0)
df = df.drop(df[df['label'] == 'vgood'].index, axis=0)
df.shape
    #removing outliers for binary classification

(1594, 7)

In [5]:
labels = []
for i in df["label"]:
    if i == 'unacc':
        labels.append(0)
    else: labels.append(1)
    #convert labels datatype into binary(0,1) format
    #this is done to easily calculate accuracy and precision

labels = pd.DataFrame(labels)
    #converting the numpy array to a dataframe

In [6]:
features = df.drop(["label"], axis=1)
features
    #dropping label column to make input argument for the splitting function

Unnamed: 0,buying,maint,doors,persons,lug-boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med
...,...,...,...,...,...,...
1716,low,low,5more,4,big,low
1719,low,low,5more,more,small,low
1720,low,low,5more,more,small,med
1722,low,low,5more,more,med,low


In [42]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.7, random_state=0)

train_features.reset_index(drop=True, inplace=True)
test_features.reset_index(drop=True, inplace=True)
train_labels.reset_index(drop=True, inplace=True)
test_labels.reset_index(drop=True, inplace=True)

    #splitting our dataset into testing and training
    #resetting index

In [43]:
train = pd.concat([train_features, train_labels], axis=1)
train

Unnamed: 0,buying,maint,doors,persons,lug-boot,safety,0
0,high,vhigh,3,2,small,med,0
1,med,high,4,more,small,high,1
2,high,high,3,more,small,low,0
3,med,med,3,more,big,low,0
4,low,high,4,2,big,low,0
...,...,...,...,...,...,...,...
473,high,low,2,2,big,med,0
474,high,low,4,more,big,med,1
475,med,low,4,2,med,low,0
476,high,high,2,more,small,med,0


In [44]:
n_train = train.shape[0]
n_train 
    #number of rows in testing dataset

478

In [45]:
class Naive_Bayes():
    def __init__(self):
        self.result =[]  #initialize result array
        
    def priors(self, train):
        newdf = train[0].value_counts()   #finds category counts
        pr=[]
        pr.append(newdf[0]/train.shape[0])
        pr.append(newdf[1]/train.shape[0])  #calculating prior probabilities from training set
        return pr
    
    def prob_data(self, train, y):
        post=1
        for key, values in y.iteritems():
            newy = y.loc[:, [key]]
            common_cols = list(set(train.columns) & set(newy.columns))
            newdf = pd.merge(train, newy, on=common_cols, how='inner')  #creates new database with common feature as the input data
            p = (newdf.shape[0])/train.shape[0]  #calculate probability
            post *= p
        return post
    
    def prob_0(self, train, y):
        post_0=1
        for key, values in y.iteritems():
            newy = y.loc[:, [key]]
            common_cols = list(set(train.columns) & set(newy.columns))
            newdf = pd.merge(train, newy, on=common_cols, how='inner')   #creates new database with common feature as the input data
            count = newdf[(newdf[0]==0)]   #finds trainig sets with 0 label
            p=count.shape[0]/train.shape[0]  #calculate probability
            post_0 *= p
        return post_0
    
    def prob_1(self, train, y):
        post_1=1
        for key, values in y.iteritems():
            newy = y.loc[:, [key]]
            common_cols = list(set(train.columns) & set(newy.columns))
            newdf = pd.merge(train, newy, on=common_cols, how='inner')   #creates new database with common feature as the input data
            count = newdf[(newdf[0]==1)]   #finds trainig sets with 1 label
            p=count.shape[0]/train.shape[0]  #calculate probability
            post_1 *= p
        return post_1
    
    def bayes(self, pr, post, post_0, post_1):
        bayes0 = pr[0]*post_0/post
        bayes1 = pr[1]*post_1/post  #bayes formula
        if (bayes0>bayes1):
            return 0
        else: return 1   #returns prediction with higher probability
        
    def predict(self, train, test_features):
        for i in range(test_features.shape[0]):
            y = test_features.iloc[[i]]
            y.reset_index(drop=True, inplace=True)  #iterating all rows in test dataset
            
            prior = self.priors(train)
            post = self.prob_data(train, y)
            post_0 = self.prob_0(train, y)
            post_1 = self.prob_1(train, y)
            
            pred = self.bayes(prior, post, post_0, post_1)
            self.result.append(pred)   #add prediction label to results list
        

In [46]:
naive_bayes = Naive_Bayes()  #initialize class

In [47]:
naive_bayes.predict(train, test_features)  #initialize method

In [48]:
predict_label = pd.DataFrame(naive_bayes.result)
predict_label   #converting output array to a dataframe

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
1111,0
1112,0
1113,0
1114,0


In [49]:
predict_label[0].value_counts()  #prediction

0    1116
Name: 0, dtype: int64

In [50]:
acc = metrics.accuracy_score(test_labels, predict_label)
acc  #accuracy measure = true cases/total cases

0.7598566308243727

In [51]:
prec = metrics.precision_score(test_labels, predict_label)
prec   #precision measure = true positive/ total positive

  _warn_prf(average, modifier, msg_start, len(result))


0.0