[View in Colaboratory](https://colab.research.google.com/github/prateek-01/ML-Algos-from-Scratch/blob/master/NaiveBayes.ipynb)

# We will run our Naive Bayes implementation on the Iris dataset (after converting its continuous features to discrete labels)

**Note:** This implementation supports labelled (discrete) features only.

In [0]:
import numpy as np

In [0]:
def fit(X_train, Y_train):
  """Count, per class, how often each feature value occurs in the training data.

  Parameters
  ----------
  X_train : array-like, shape (n_samples, n_features)
      Discrete (labelled) feature values.
  Y_train : array-like, shape (n_samples,)
      Class label of every training row.

  Returns
  -------
  dict
      ``result['total_data']`` is the number of training rows. For every class
      label ``c`` present in ``Y_train``:

      * ``result[c]['total_count']`` is the number of rows belonging to ``c``;
      * ``result[c][j][v]`` (features are numbered from 1) is the number of
        rows of class ``c`` whose feature ``j`` equals ``v``. Every value that
        occurs anywhere in feature ``j`` gets an entry, with count 0 when the
        class never takes it.
  """
  # Accept plain lists as well as ndarrays: the boolean-mask indexing below
  # only works on arrays.
  X_train = np.asarray(X_train)
  Y_train = np.asarray(Y_train)

  # Stored once, outside the class loop, so the key exists even when there
  # are no classes at all.
  result = {'total_data': len(Y_train)}
  if len(Y_train) == 0:
    return result

  num_features = X_train.shape[1]
  # The set of values a feature can take does not depend on the class, so it
  # is computed once per feature instead of once per (class, feature) pair.
  values_per_feature = [set(X_train[:, j]) for j in range(num_features)]

  for current_class in set(Y_train):
    class_rows = X_train[Y_train == current_class]
    class_counts = {'total_count': len(class_rows)}
    for j in range(num_features):
      column = class_rows[:, j]
      # Feature keys are 1-based.
      class_counts[j + 1] = {
          value: (column == value).sum() for value in values_per_feature[j]
      }
    result[current_class] = class_counts

  return result
      
      
  

In [0]:
def probability(dictionary, x, current_class):
  """Return the log of (class prior * smoothed feature likelihoods) for ``x``.

  Parameters
  ----------
  dictionary : dict
      Count table in the layout produced by ``fit``: ``dictionary['total_data']``
      is the number of training rows, ``dictionary[c]['total_count']`` the
      number of rows of class ``c`` and ``dictionary[c][j][v]`` the number of
      rows of class ``c`` whose feature ``j`` (1-based) equals ``v``.
  x : sequence
      Feature values of the point to score; ``x[j-1]`` is feature ``j``.
  current_class : hashable
      Class label to score; must be a key of ``dictionary``.

  Returns
  -------
  float
      ``log P(class) + sum_j log P(x_j | class)``, where each conditional is
      estimated with add-one smoothing. The value is not normalised across
      classes, so it is only meaningful for comparing classes.
  """
  class_counts = dictionary[current_class]
  class_total = class_counts['total_count']

  # Log prior: fraction of training rows that belong to this class.
  output = np.log(class_total) - np.log(dictionary['total_data'])

  # Every key except 'total_count' is a (1-based) feature index.
  num_features = len(class_counts) - 1
  for j in range(1, num_features + 1):
    value_counts = class_counts[j]
    # A value that never occurred in training has a raw count of 0; .get()
    # lets the add-one smoothing handle it instead of raising KeyError.
    smoothed_count = value_counts.get(x[j - 1], 0) + 1
    smoothed_total = class_total + len(value_counts)
    output = output + (np.log(smoothed_count) - np.log(smoothed_total))

  return output
    

In [0]:
def predictSinglePoint(dictionary, x):
  """Return the class label with the highest score for the single point ``x``.

  Every key of ``dictionary`` other than the bookkeeping entry 'total_data'
  is treated as a class label and scored with ``probability``. When several
  classes tie, the one encountered first wins. If ``dictionary`` holds no
  class at all, -1 is returned.
  """
  candidate_classes = (
      label for label in dictionary.keys() if not (label == 'total_data')
  )
  return max(
      candidate_classes,
      key=lambda label: probability(dictionary, x, label),
      default=-1,
  )
  

In [0]:
def predict(dictionary,X_test):
  """Classify every row of ``X_test`` and return the labels as a list.

  Rows are passed one at a time to ``predictSinglePoint``; the i-th element
  of the returned list is the prediction for the i-th row.
  """
  return [predictSinglePoint(dictionary, row) for row in X_test]

  

In [0]:
def makeLabelled(column):
  """Bin a numeric column into the labels 0-3, in place, and return it.

  The cut points are 0.5x, 1x and 1.5x the mean of the column, computed
  before anything is overwritten. Each element is mapped to the first rule
  that matches:

    value < 0.5 * mean  -> 0
    value < mean        -> 1
    value < 1.5 * mean  -> 2
    otherwise           -> 3

  The labels are written back into ``column`` itself, and that same object
  is returned.
  """
  mean_value = column.mean()
  low_cut = 0.5 * mean_value
  high_cut = 1.5 * mean_value

  # Nested where() mirrors the if/elif chain: the first matching test wins.
  labels = np.where(
      column < low_cut, 0,
      np.where(column < mean_value, 1,
               np.where(column < high_cut, 2, 3)))

  column[:] = labels
  return column
      
 

In [0]:
# Load the Iris dataset that ships with scikit-learn and pull out the
# feature matrix (X) and the class targets (Y).
from sklearn import datasets
iris = datasets.load_iris()
X, Y = iris.data, iris.target

In [18]:
# Sanity check: dimensions of the feature matrix and of the target vector.
X.shape, Y.shape

((150, 4), (150,))

In [0]:
# Start from a fresh copy of the raw measurements every time this cell runs.
# makeLabelled overwrites its argument in place, so looping over an X that
# has already been labelled would re-bin the labels 0-3 themselves; working
# on a copy also leaves iris.data untouched.
X = iris.data.copy()
for i in range(X.shape[-1]):
  X[:, i] = makeLabelled(X[:, i])

In [20]:
# Show the last feature column after the conversion above.
X[:,3]  # All features converted from continuous to labelled

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 2.,
       2., 2., 2., 2., 2., 2., 1., 2., 2., 1., 2., 1., 2., 2., 2., 2., 1.,
       2., 1., 3., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 2., 1., 2., 3., 3.,
       3., 3., 3., 3., 2., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       2., 3., 3., 3., 3., 3., 3., 3., 3., 3., 2., 3., 3., 3., 2., 2., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.])

In [0]:
# Hold out 25% of the rows as a test set; the fixed random_state keeps the
# split the same from run to run.
from sklearn import model_selection
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.25, random_state=0
)

In [0]:
# Build the per-class feature-value count table from the training split.
dictionary = fit(X_train, Y_train)

In [24]:
# Distinct class labels, followed by the distinct values of each feature column, in the training split.
set(Y_train), set(X_train[:,0]),set(X_train[:,1]),set(X_train[:,2]),set(X_train[:,3])  # 3 classes of target variable and 4 features

({0, 1, 2}, {1.0, 2.0}, {1.0, 2.0}, {0.0, 1.0, 2.0, 3.0}, {0.0, 1.0, 2.0, 3.0})

In [25]:
# Keys 1-4 are the features (numbered from 1); 'total_count' is the number of training rows in this class.
dictionary[0]  # For class label '0' this is the distribution of labels for each of the four features

{1: {1.0: 37, 2.0: 0},
 2: {1.0: 6, 2.0: 31},
 3: {0.0: 37, 1.0: 0, 2.0: 0, 3.0: 0},
 4: {0.0: 36, 1.0: 1, 2.0: 0, 3.0: 0},
 'total_count': 37}

In [28]:
# Classify every row of the test split, then preview the first five predictions.
y_pred = predict(dictionary, X_test)
y_pred[:5]

[2, 1, 0, 2, 0]

In [27]:
# Compare the predictions with the true test labels: print the per-class
# precision / recall / F1 report, then the confusion matrix.
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(Y_test,y_pred))
print(confusion_matrix(Y_test,y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       0.94      1.00      0.97        16
          2       1.00      0.89      0.94         9

avg / total       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]
