**Gaussian Naive Bayes Classification**

In [44]:
from csv import reader
from math import sqrt,pi,exp 

#read CSV file
def read_csv(filename):
  """Load every non-empty row of a CSV file.

  Args:
    filename: Path of the CSV file to read.

  Returns:
    A list of rows, each row being the list of raw string fields produced
    by csv.reader. Blank lines (which csv.reader yields as empty lists)
    are skipped.
  """
  # newline='' is what the csv module documentation asks for: the reader
  # then handles line endings itself, so CR/LF sequences embedded in quoted
  # fields reach the caller unchanged instead of being translated first.
  with open(filename,mode='r',newline='') as file:
    return [row for row in reader(file) if row]

#preprocess data - convert every column except the last one from text to float
def preprocessing(training_dat):
  """Convert the leading columns of each row to float, in place.

  The number of converted columns is the length of the FIRST row minus one,
  so the last column of a rectangular table is left as it is.

  Args:
    training_dat: List of rows (lists) whose leading entries are strings.

  Returns:
    The same list object that was passed in, with its rows modified.
  """
  if not training_dat:
    return training_dat
  numeric_columns = len(training_dat[0]) - 1
  for record in training_dat:
    for column in range(numeric_columns):
      # strip() removes padding around the field before the conversion
      record[column] = float(record[column].strip())
  return training_dat

#get_unique_classes - collect the distinct label values (last column of every row)
def get_unique_classes(training_data):
  """Return the set of distinct values found in the last column of the rows."""
  return {record[-1] for record in training_data}

#get_classes - number the classes 0,1,2,... and replace every row's label by its number
def get_classes(training_data,unique):
  """Replace each row's class label with an integer code, in place.

  Codes are handed out as 0, 1, 2, ... in the iteration order of `unique`.

  Args:
    training_data: List of rows whose last element is the class label.
      The rows are modified in place.
    unique: Iterable of the distinct label values.

  Returns:
    A tuple (training_data, class_set, class_def): the same list object
    that was passed in, the list of integer codes, and a dict mapping each
    code back to the label it replaced.
  """
  class_set=list()
  class_def=dict()
  code_of=dict()
  for code,label in enumerate(unique):
    class_set.append(code)
    class_def[code]=label
    # setdefault: if a label is listed twice, its first code is the one rows get
    code_of.setdefault(label,code)
  # Relabel in a single pass using the finished map. Rewriting rows while the
  # codes are still being assigned lets a freshly written code be mistaken for
  # a raw label that happens to have the same value (e.g. labels 1 and 0).
  for row in training_data:
    if row[-1] in code_of:
      row[-1]=code_of[row[-1]]
  return training_data,class_set,class_def

#divide classes - group the rows by their label (last column)
def divide_classes(training_dat,class_values,printFlag):
  """Group rows by the value in their last column.

  Args:
    training_dat: List of rows; the last element of each row is its label.
    class_values: Iterable of label values to build groups for.
    printFlag: When equal to True, the resulting grouping is printed.

  Returns:
    A dict mapping each value in class_values to the list of rows (the same
    row objects, in their original order) whose label equals that value.
  """
  grouped = {
    label: [record for record in training_dat if record[-1] == label]
    for label in class_values
  }
  if printFlag == True:
    print(f'Class Division based on Label : {grouped}')
  return grouped

# arithmetic mean of one column of values
def mean(column_data):
  """Return the sum of the values divided by how many there are."""
  total = sum(column_data)
  count = len(column_data)
  return total/count

# sample standard deviation of one column of values
def stdev(column_data):
  """Return the standard deviation using the n-1 denominator.

  Needs at least two values: with a single value the denominator is zero.
  """
  centre = mean(column_data)
  squared_deviations = [(value-centre)**2 for value in column_data]
  return sqrt(sum(squared_deviations) / float(len(column_data)-1))
# compute mean and standard deviation of each feature column, separately per class
def mean_std_of_classes_columns(training_data,printFlag):
  """Summarise every feature column of every class by (mean, stdev).

  Args:
    training_data: Dict mapping a class value to the list of rows belonging
      to that class. The last element of each row is the class label.
    printFlag: When equal to True, the resulting statistics are printed.

  Returns:
    A dict mapping each class value to a list of (mean, stdev) tuples, one
    per feature column, in column order. The label column is not included.
  """
  classes=dict()
  for class_value, rows in training_data.items():
    # zip(*rows) turns the rows into columns. The last column is the label,
    # so it is dropped before any statistics are computed: it is never
    # summed (labels need not be numeric) and a class with no rows simply
    # yields an empty list.
    feature_columns = list(zip(*rows))[:-1]
    classes[class_value] = [(mean(column), stdev(column)) for column in feature_columns]
  if(printFlag==True):
    print('Mean,std deviation of columns w.rt classes',classes)
  return classes

#Gaussian density of one feature value, given the mean and standard deviation of that feature within a class
def probability_of_eachfeature_byclass(x,mean,stddev):
  """Evaluate the normal probability density function at x.

  Args:
    x: The feature value.
    mean: Mean of the distribution.
    stddev: Standard deviation of the distribution (used as a divisor).

  Returns:
    exp(-(x-mean)^2 / (2*stddev^2)) / (sqrt(2*pi) * stddev)
  """
  normaliser = 1 / (sqrt(2 * pi) * stddev)
  exponent = -((x-mean)**2 / (2 * stddev**2))
  return normaliser * exp(exponent)

#for every class, multiply together the Gaussian densities of all feature values of the test point
def calculate_probabilities_feature(mean_std_data,test_data,class_def,printFlag):
  """Compute, per class, the product of the per-feature densities.

  Args:
    mean_std_data: Dict mapping a class key to a list whose i-th entry holds
      the mean (index 0) and standard deviation (index 1) of feature i.
    test_data: Sequence of feature values; element i is matched with entry i.
    class_def: Dict mapping a class key to its display label (only read
      when printing).
    printFlag: When equal to True, every per-feature density is printed.

  Returns:
    A dict mapping each class key to the product of its feature densities.
  """
  if printFlag == True:
    print('Probabilites classes given feature(P(Xi/C)) is given below - 0,1,2 corresponds to Height,Weight,Age')
  likelihoods = {}
  for label, feature_stats in mean_std_data.items():
    joint = 1
    for position, stats in enumerate(feature_stats):
      mu, sigma = stats[0], stats[1]
      density = probability_of_eachfeature_byclass(test_data[position], mu, sigma)
      if printFlag == True:
        print(f'probability[{position}/{class_def[label]}] = {density}')
      joint *= density
    likelihoods[label] = joint
  return likelihoods

#weight each class's feature likelihood by the share of training rows that belong to the class
def calculate_class_probability(class_data,mean_std_data,test_data,class_def,n,printFlag):
  """Score every class as (class share of rows) * (product of feature densities).

  The scores are not divided by any normalising term, so they are only
  meaningful relative to each other.

  Args:
    class_data: Dict mapping a class key to the list of rows in that class.
    mean_std_data: Per-class feature statistics, passed through to
      calculate_probabilities_feature.
    test_data: Feature values of the point being classified.
    class_def: Dict mapping a class key to its display label.
    n: Total number of rows; the divisor used for each class's share.
    printFlag: When equal to True, each class score is printed.

  Returns:
    A dict mapping each class key to its score.
  """
  likelihoods = calculate_probabilities_feature(mean_std_data,test_data,class_def,printFlag)
  scores = {}
  for label, members in class_data.items():
    prior = len(members)/n
    score = prior * likelihoods[label]
    scores[label] = score
    if printFlag == True: # callers pass False to keep leave-one-out runs quiet
      print(f'P(Class={class_def[label]}/X)= {score}')
  return scores

#classify one data point: the class with the highest score wins
def predict_gender(training_data,test_data,class_def,printFlag):
  """Return the class key with the largest score for test_data.

  Args:
    training_data: List of rows, label in the last column. It is handed to
      get_classes, which rewrites the labels of these rows in place.
    test_data: Feature values of the point to classify.
    class_def: Dict mapping a class key to its display label (used for
      printing by the helpers).
    printFlag: Forwarded to the helpers; True makes them print their
      intermediate results.

  Returns:
    The key of the highest-scoring class.
  """
  labels_present = get_unique_classes(training_data)
  encoded_rows, class_codes, _ = get_classes(training_data, labels_present)
  rows_by_class = divide_classes(encoded_rows, class_codes, printFlag)
  stats_by_class = mean_std_of_classes_columns(rows_by_class, printFlag)
  scores = calculate_class_probability(
    rows_by_class, stats_by_class, test_data, class_def, len(encoded_rows), printFlag
  )
  return max(scores, key=scores.get)

#leave-one-out evaluation: classify every row using all the other rows, then report accuracy and error
def leave_out(training_dat,class_def):
  """Run leave-one-out validation and print the accuracy and error percentages.

  Each row in turn is removed from a copy of the data, predicted from the
  remaining rows with predict_gender, and the prediction is compared with
  the row's own last element.

  Args:
    training_dat: List of rows, label in the last column.
    class_def: Dict mapping a class key to its display label, forwarded to
      predict_gender.

  Returns:
    None. The two percentages are printed, each wrapped in a one-element list.
  """
  snapshot = training_dat.copy()
  hits = 0
  misses = 0
  trials = 0
  for held_out in training_dat:
    # shallow copy: only list membership changes, the row objects are shared
    remaining = snapshot.copy()
    remaining.remove(held_out)
    trials += 1
    # printing is switched off so the per-row details do not flood the output
    if predict_gender(remaining, held_out, class_def, False) == held_out[-1]:
      hits += 1
    else:
      misses += 1
  accuracy = [(hits/trials)*100]
  error = [(misses/trials)*100]
  print('Accuracy is',accuracy)
  print('Percentage of error  is:',error)


**a) Learn/derive the parameters for the Gaussian Naïve Bayes Classifier for the data from Question 2
a) and apply them to the same target as in problem 2a). Show your intermediate steps.**


Feature index mapping used in the printed probabilities:

- height -> 0
- weight -> 1
- age -> 2


Class-conditional densities to estimate (a mean and a standard deviation for each): p(height|W), p(height|M),
p(weight|W), p(weight|M), p(age|W), p(age|M).

In [47]:
printFlag=True
training_data=read_csv('sample_data/2a.csv')
training_data=preprocessing(training_data)
test_data=read_csv('sample_data/2a-test_data.csv') #testing data csv file w.rt path
# preprocessing() leaves the last column of every row untouched. In the test
# rows that last column is a feature as well, so it is converted here first.
for row in test_data:
  row[-1]=float(row[-1])
test_data=preprocessing(test_data)
# get_classes rewrites the labels of training_data in place; class_def maps
# each numeric code back to the original label
_,_,class_def=get_classes(training_data,get_unique_classes(training_data))
print('classes : ',class_def)
for i in test_data:
  print(f'---------for Test Data point {i}-------------------')
  predicted_key=predict_gender(training_data,i,class_def,printFlag)
  print(f'Predicted Gender of {i} is {class_def[predicted_key]}')
leave_out(training_data,class_def)

classes :  {0: 'W', 1: 'M'}
---------for Test Data point [1.816359375, 76.2084063, 41.0]-------------------
Class Division based on Label : {0: [[1.701405128, 65.56667967, 30.0, 0], [1.563862003, 65.45241222, 23.0, 0], [1.859291877, 65.00842059, 32.0, 0], [1.589276535, 59.92227629, 32.0, 0], [1.836275839, 75.01700292, 31.0, 0], [1.618344662, 65.82794895, 30.0, 0], [1.503678782, 55.42614147, 27.0, 0]], 1: [[1.720534988, 76.55007232, 24.0, 1], [1.82200208, 75.86436618, 30.0, 1], [1.798131168, 77.15315473, 32.0, 1], [1.682933208, 72.68980647, 34.0, 1], [1.696417484, 77.60294632, 33.0, 1], [1.608517593, 71.5512203, 32.0, 1], [1.781264348, 75.88996367, 29.0, 1]]}
Mean,std deviation of columns w.rt classes {0: [(1.6674478322857145, 0.13695031348839415), (64.60298315857143, 6.026198485993357), (29.285714285714285, 3.251373336211726)], 1: [(1.7299715527142858, 0.07519407826998374), (75.32878999857144, 2.303173491202322), (30.571428571428573, 3.359421718944242)]}
Probabilites classes given feat

**3b & 3c. Implement Naive Bayes on the training data (program data), classify a test data point entered by the user, and calculate the accuracy and error percentages with the leave-one-out algorithm.**


In [49]:
printFlag=True
training_data=read_csv('sample_data/2b.csv')
training_data=preprocessing(training_data)
print('Training Data is ',training_data)
print('Plese enter test data point')
# Values are read in the same order as the feature columns: height, weight, age
test_data=[float(input(prompt).strip()) for prompt in ('Enter Height: ','Enter weight: ','Age: ')]
# get_classes rewrites the labels of training_data in place; class_def maps
# each numeric code back to the original label
_,_,class_def=get_classes(training_data,get_unique_classes(training_data))
print(class_def)
print(f'---------for Test Data point {test_data}-------------------')
predicted_key=predict_gender(training_data,test_data,class_def,printFlag)
# Report the point that was actually classified. This line used to print `i`,
# a name this cell never assigns, so it only worked with leftover kernel state.
print(f'Predicted Gender of {test_data} is {class_def[predicted_key]}')
leave_out(training_data,class_def)

Training Data is  [[1.7983977708734, 72.545353881384, 29.0, 'W'], [1.7917466351447, 78.826072501737, 30.0, 'W'], [1.7682857142857, 69.288868987849, 28.0, 'W'], [1.7949802099235, 88.006698609763, 22.0, 'M'], [1.7300179353714, 71.812932935206, 29.0, 'W'], [1.6790722684193, 78.018326460062, 23.0, 'W'], [1.8059902565001, 74.648125986057, 22.0, 'W'], [1.8453280504394, 75.387886361367, 26.0, 'W'], [1.7574613382704, 82.834433978664, 25.0, 'M'], [1.7691728901745, 74.981845908563, 31.0, 'W'], [1.7694089716, 72.251754861586, 29.0, 'M'], [1.9119015896775, 90.243374848249, 26.0, 'M'], [1.8338252737861, 82.501982818479, 38.0, 'M'], [1.8036472381727, 64.543877592792, 20.0, 'W'], [1.814394092261, 79.683114468035, 32.0, 'W'], [1.803350567802, 84.351853926674, 18.0, 'M'], [1.7840150867228, 70.873064650935, 27.0, 'W'], [1.885800464122, 78.598356440796, 29.0, 'W'], [1.9448657846608, 89.034513862798, 24.0, 'M'], [1.8103438565626, 83.303780497732, 19.0, 'M'], [1.869850948875, 79.110670324195, 26.0, 'M'], [

3d. Repeat the experiment in parts 2 c) and 2 d) with the Gaussian Naive Bayes Classifier. Discuss the
results, in particular with respect to the performance difference between using all features and using
only height and weight.

In [50]:
# Copy of the training data without the age column: keep height (0), weight (1) and the label (3)
temp=[]
for row in training_data:
  temp.append([row[column] for column in (0,1,3)])
print('Training data by removed age column/parameter',temp)
print('Leave out algorithm implementation after removing parameter')
leave_out(temp,class_def)


Training data by removed age column/parameter [[1.7983977708734, 72.545353881384, 0], [1.7917466351447, 78.826072501737, 0], [1.7682857142857, 69.288868987849, 0], [1.7949802099235, 88.006698609763, 1], [1.7300179353714, 71.812932935206, 0], [1.6790722684193, 78.018326460062, 0], [1.8059902565001, 74.648125986057, 0], [1.8453280504394, 75.387886361367, 0], [1.7574613382704, 82.834433978664, 1], [1.7691728901745, 74.981845908563, 0], [1.7694089716, 72.251754861586, 1], [1.9119015896775, 90.243374848249, 1], [1.8338252737861, 82.501982818479, 1], [1.8036472381727, 64.543877592792, 0], [1.814394092261, 79.683114468035, 0], [1.803350567802, 84.351853926674, 1], [1.7840150867228, 70.873064650935, 0], [1.885800464122, 78.598356440796, 0], [1.9448657846608, 89.034513862798, 1], [1.8103438565626, 83.303780497732, 1], [1.869850948875, 79.110670324195, 1], [1.7651929874261, 70.846185421099, 0], [1.7557881533871, 62.785650413505, 0], [1.9152250788251, 73.576126108931, 1], [1.8804003551891, 78.215