In [9]:
def separate_by_class(dataset):
  """Group rows by their class label.

  Args:
    dataset: Sequence of rows; the last element of each row is used as
      the class label.

  Returns:
    A dict mapping each class label to the list of rows (the same row
    objects, in their original order) carrying that label.
  """
  grouped = {}
  for vector in dataset:
    # The label is the last element of the row.
    grouped.setdefault(vector[-1], []).append(vector)
  return grouped


In [17]:
# Show the toy rows grouped by their last element (the class label).
# NOTE(review): in this view `dataset` is first assigned in a later cell, so a
# fresh top-to-bottom run would hit a NameError here — confirm the cell order.
separate_by_class(dataset)

{0: [[3.39, 2.33, 0],
  [3.11, 1.78, 0],
  [1.34, 3.37, 0],
  [3.58, 4.68, 0],
  [2.28, 2.87, 0]],
 1: [[7.42, 4.7, 1],
  [5.74, 3.53, 1],
  [9.17, 2.51, 1],
  [7.79, 3.42, 1],
  [7.94, 0.79, 1]]}

In [10]:
# Toy dataset: two numeric features per row, followed by the class label.
dataset = [
  [3.39, 2.33, 0],
  [3.11, 1.78, 0],
  [1.34, 3.37, 0],
  [3.58, 4.68, 0],
  [2.28, 2.87, 0],
  [7.42, 4.70, 1],
  [5.74, 3.53, 1],
  [9.17, 2.51, 1],
  [7.79, 3.42, 1],
  [7.94, 0.79, 1],
]


In [11]:
def mean(numbers):
  """Return the arithmetic mean of `numbers` as a float.

  Raises:
    ZeroDivisionError: If `numbers` is empty.
  """
  total = sum(numbers)
  count = float(len(numbers))
  return total/count
from math import sqrt
def stdev(numbers):
  """Return the sample standard deviation of `numbers`.

  The sum of squared deviations from the mean is divided by
  ``len(numbers) - 1``.

  Raises:
    ZeroDivisionError: If `numbers` has fewer than two elements.
  """
  center = mean(numbers)
  squared_deviations = [(value-center)**2 for value in numbers]
  # n - 1 in the denominator (sample variance).
  return sqrt(sum(squared_deviations)/float(len(numbers)-1))
def summarize_dataset(dataset):
  """Compute per-column statistics for every column except the last.

  Args:
    dataset: Sequence of equal-length rows.

  Returns:
    A list of ``(mean, stdev, count)`` tuples, one per column, with the
    entry for the final column removed.
  """
  stats = []
  # zip(*dataset) transposes the rows into columns.
  for column in zip(*dataset):
    stats.append((mean(column), stdev(column), len(column)))
  # Discard the entry computed for the last column.
  stats.pop()
  return stats




In [12]:
# Column statistics over the whole toy dataset: one (mean, stdev, count)
# tuple per column, with the last column's entry dropped by summarize_dataset.
summary = summarize_dataset(dataset)
print(summary)

[(5.176, 2.7665750988863067, 10), (2.998, 1.219515386445689, 10)]


In [14]:
def summarize_by_class(dataset):
  """Compute column statistics separately for each class.

  Args:
    dataset: Sequence of rows whose last element is the class label.

  Returns:
    A dict mapping each class label to the `summarize_dataset` result
    for the rows of that class.
  """
  return {
    class_value: summarize_dataset(rows)
    for class_value, rows in separate_by_class(dataset).items()
  }

In [16]:
# Per-class statistics: for each label, print the label followed by one
# (mean, stdev, count) tuple per summarised column.
summary = summarize_by_class(dataset)
for label in summary:
  print(label)
  for row in summary[label]:
    print(row)

0
(2.7399999999999998, 0.9269034469673743, 5)
(3.0060000000000002, 1.1083005007668272, 5)
1
(7.611999999999999, 1.2357062757791595, 5)
(2.9899999999999998, 1.4552491195668185, 5)


In [19]:
from math import exp
from math import pi
def calculate_probability(x,mean,stdev):
  """Evaluate the Gaussian probability density at `x`.

  Args:
    x: Point at which to evaluate the density.
    mean: Mean of the distribution.
    stdev: Standard deviation of the distribution.

  Returns:
    ``1 / (sqrt(2*pi) * stdev) * exp(-(x - mean)**2 / (2 * stdev**2))``.

  Raises:
    ZeroDivisionError: If `stdev` is zero.
  """
  squared_distance = (x-mean)**2
  spread = 2*stdev**2
  normaliser = sqrt(2*pi)*stdev
  return (1/normaliser)*exp(-(squared_distance/spread))

In [20]:
# Density for mean=1, stdev=1 evaluated at the mean and at one unit on either side.
print(calculate_probability(1.,1.,1.))
print(calculate_probability(2.,1.,1.))
print(calculate_probability(0.,1.,1.))

0.3989422804014327
0.24197072451914337
0.24197072451914337


In [21]:
def calculate_class_probabilities(summaries,row):
  """Score `row` against every class.

  Args:
    summaries: Dict mapping class label to a list of
      ``(mean, stdev, count)`` tuples, one per feature.
    row: Sequence of feature values, indexed in the same order as the
      per-feature tuples.

  Returns:
    A dict mapping each class label to the class's share of the total
    row count multiplied by the Gaussian density of every feature. The
    values are not normalised to sum to 1.
  """
  # The count stored on each class's first feature is taken as that class's row count.
  total_rows = float(sum(stats[0][2] for stats in summaries.values()))
  probabilities = {}
  for class_value, class_summaries in summaries.items():
    score = class_summaries[0][2]/total_rows
    for index, (mu, sigma, _count) in enumerate(class_summaries):
      score *= calculate_probability(row[index],mu,sigma)
    probabilities[class_value] = score
  return probabilities

In [23]:
# Score the first toy row against each class. The printed values are
# unnormalised (they are not divided by their sum).
summaries = summarize_by_class(dataset)
probabilities = calculate_class_probabilities(summaries,dataset[0])
print(probabilities)

{0: 0.05029528969768076, 1: 0.00011651338112198482}


In [27]:
from random import seed
from random import randrange
from csv import reader
from math import sqrt

def load_csv(filename):
  """Load a semicolon-delimited CSV file, skipping its header row.

  Args:
    filename: Path of the file to read.

  Returns:
    A list of rows, each a list of the string fields of one non-empty
    line. A file with no lines yields an empty list.
  """
  dataset = list()
  # newline='' leaves line-ending handling to the csv module.
  with open(filename,'r',newline='') as file:
    # Parse on ';' directly. Reading with the default ',' delimiter and then
    # splitting only the first field on ';' drops everything after a comma.
    csv_reader = reader(file,delimiter=';')
    # Skip the header row; the default avoids StopIteration on an empty file.
    next(csv_reader,None)
    for row in csv_reader:
      if not row:
        continue
      dataset.append(row)
  return dataset
def accuracy_metric(actual,predicted):
  """Return the percentage of positions where the two sequences agree.

  Args:
    actual: Sequence of true labels.
    predicted: Sequence of predicted labels, indexed in step with `actual`.

  Returns:
    A float between 0.0 and 100.0.

  Raises:
    ZeroDivisionError: If `actual` is empty.
  """
  total = len(actual)
  hits = sum(1 for position in range(total) if actual[position] == predicted[position])
  return hits/float(total) * 100.0
def str_column_to_int(dataset,column):
  """Replace the values in one column with integer codes, in place.

  Codes are assigned in order of first appearance (0, 1, 2, ...), so the
  mapping is reproducible for a given dataset. Assigning codes by iterating
  a set would let the mapping vary between interpreter runs, because string
  hashing is randomised per process.

  Args:
    dataset: List of mutable rows; modified in place.
    column: Index of the column to encode.

  Returns:
    The dict mapping each original value to its integer code.
  """
  lookup = dict()
  for row in dataset:
    value = row[column]
    if value not in lookup:
      # The next unused code equals the number of values seen so far.
      lookup[value] = len(lookup)
  for row in dataset:
    row[column] = lookup[row[column]]
  return lookup
def str_column_to_float(dataset,column):
  """Convert one column of string values to floats, in place.

  Surrounding whitespace is stripped from each value before conversion.

  Args:
    dataset: List of mutable rows; modified in place.
    column: Index of the column to convert.
  """
  for row in dataset:
    text = row[column]
    row[column] = float(text.strip())
def cross_validation_split(dataset,n_folds):
  """Randomly partition `dataset` into `n_folds` equally sized folds.

  Each fold holds ``int(len(dataset) / n_folds)`` rows drawn without
  replacement using `randrange`; leftover rows are not placed in any
  fold. The input list itself is not modified.

  Args:
    dataset: Sequence of rows.
    n_folds: Number of folds to build.

  Returns:
    A list of `n_folds` lists of rows.
  """
  remaining = list(dataset)
  fold_size = int(len(dataset)/n_folds)
  folds = []
  for _ in range(n_folds):
    # Draw fold_size rows, removing each one from the pool as it is picked.
    fold = [remaining.pop(randrange(len(remaining))) for _ in range(fold_size)]
    folds.append(fold)
  return folds
def evaluate_algorithm(dataset,algorithm,n_folds,*args):
  """Score `algorithm` with k-fold cross-validation.

  Args:
    dataset: Sequence of rows whose last element is the label.
    algorithm: Callable invoked as ``algorithm(train, test, *args)``; its
      result is compared position by position with the held-out labels.
    n_folds: Number of folds.
    *args: Extra positional arguments forwarded to `algorithm`.

  Returns:
    A list with one accuracy percentage per fold.
  """
  folds = cross_validation_split(dataset,n_folds)
  scores = []
  for held_out in folds:
    # Train on every fold except the held-out one.
    others = list(folds)
    others.remove(held_out)
    training_rows = [row for part in others for row in part]
    # Test rows are copies with the label blanked out.
    masked_rows = []
    for row in held_out:
      hidden = list(row)
      hidden[-1] = None
      masked_rows.append(hidden)
    predicted = algorithm(training_rows,masked_rows,*args)
    actual = [row[-1] for row in held_out]
    scores.append(accuracy_metric(actual,predicted))
  return scores

In [28]:
def predict(summaries,row):
  """Return the class label with the highest score for `row`.

  Args:
    summaries: Per-class statistics as accepted by
      `calculate_class_probabilities`.
    row: Feature values to classify.

  Returns:
    The label whose score is largest; on a tie, the label encountered
    first wins. Returns None when there are no classes.
  """
  scores = calculate_class_probabilities(summaries,row)
  winner, top_score = None, -1
  for label in scores:
    candidate = scores[label]
    if winner is None or candidate > top_score:
      winner, top_score = label, candidate
  return winner


def naive_bayes(train,test):
  """Fit per-class statistics on `train` and predict a label for each `test` row.

  Args:
    train: Rows whose last element is the class label.
    test: Rows to classify.

  Returns:
    A list of predicted labels, one per row of `test`, in order.
  """
  model = summarize_by_class(train)
  return [predict(model,row) for row in test]

In [55]:
from sklearn import datasets
import numpy as np
# Build a list of rows from iris: the feature columns followed by the target
# as a final column, which is where the functions above read the label.
iris = datasets.load_iris()
target_column = np.expand_dims(iris.target, axis=1)
data = np.concatenate((iris.data, target_column), axis=1).tolist()

In [56]:

# Seed the `random` module so the randrange-based fold assignment is repeatable.
seed(1)
# NOTE(review): absolute Google Drive path — only resolves where that drive is mounted.
filename = "/content/drive/MyDrive/ML_mastery/winequality-white.csv"
dataset = load_csv(filename)
# Convert every column except the last to float, then encode the last column as integer codes.
for i in range(len(dataset[0])-1):
  str_column_to_float(dataset,i)
str_column_to_int(dataset,len(dataset[0])-1)

n_folds = 5

# NOTE(review): this evaluates `data` (the iris rows built in the previous cell),
# not the wine-quality `dataset` loaded above — confirm which one was intended.
scores = evaluate_algorithm(data,naive_bayes,n_folds)

print("Scores = %s"%scores)
print("Mean Accuracy : %.3f"%(sum(scores)/float(len(scores))))

Scores = [93.33333333333333, 96.66666666666667, 100.0, 93.33333333333333, 93.33333333333333]
Mean Accuracy : 95.333
