In [111]:
import torch
import math
from statistics import NormalDist
import numpy as np

In [20]:
def train_test_split(X,y,split_ratio):
  # select random indices
  indices = torch.randperm(X.size(0))

  train_N = int(split_ratio * X.size(0))

  train_indices = indices[:train_N]
  test_indices = indices[train_N:]

  X_train = X[train_indices]
  y_train = y[train_indices]

  X_test = X[test_indices]
  y_test = y[test_indices]

  return X_train,y_train,X_test,y_test

In [5]:
# class NaiveBayes:
#     def __init__(self):
          # self.__N
          # self.__K
          # self.__d
          # self.__prior_probs
          # self.__measures
          # self.__con_indices
          # self.__cat_indices


In [6]:
def one_hot_encoder(y_train):
  N = y_train.size(0)                       # number of examples
  K = torch.unique(y_train).numel()         # number of classes

  y_train_OHE = torch.zeros(N,K)            # one_hot_ecoded y_train if size N,K

  for i,target in enumerate(y_train):
    y_train_OHE[i,target] = 1

  return y_train_OHE

In [7]:
def cal_prior_prob(one_hot_y):
  N = one_hot_y.size(0)
  K = one_hot_y.size(1)
  prior_probs = torch.zeros(K)

  for i in range(K):
     prior_probs[i] = (one_hot_y[:,i].sum()).item()/N

  return prior_probs

In [92]:
def get_con_cat_indices(X_train):
  d = X_train.size(1)
  con_indices = torch.empty(0,dtype = torch.int64)                                                                  # tensor to store indices of continous features
  cat_indices = torch.empty(0,dtype = torch.int64)                                                                  # tensor to store indices of categorical features

  for i in range(d):
    if(X_train[:,i].dtype in (torch.float32, torch.float64, torch.int8, torch.int16, torch.int32, torch.int64)):    # if ith column is continous
      con_indices = torch.cat((con_indices,torch.tensor([i],dtype = torch.int64)),0)                                # add i to con_indices
    else:
      cat_indices = torch.cat((cat_indices,torch.tensor([i],dtype = torch.int64)),0)

  return con_indices,cat_indices

In [91]:
def cal_central_measures(X_train,y_train_encoded,con_indices):

  central_measures = torch.zeros(2,con_indices.size(0))

  class_indices = torch.where(y_train_encoded == 1)[0]        # only select indices for the class i.e 1
  class_data = X_train[class_indices,:]                       # select only rows having those indices

  # calculate mean and std deviation of particular class for each feature
  for _,i in enumerate(con_indices):
    central_measures[0,i] = class_data[:,i].mean()
    central_measures[1,i] = class_data[:,i].std()

  return central_measures


In [105]:
def train(X_train,y_train):
  N = y_train.size(0)                       # number of examples
  classes = torch.unique(y_train)           # unique classes in y_train
  K = torch.unique(y_train).numel()         # number of classes

  y_encode  = one_hot_encoder(y_train)      # convert y_train to one hot encoding

  con_indices,cat_indices = get_con_cat_indices(X_train)
  measures = {}
  for i,c in enumerate(classes):
    measures[c.item()] = []
    measures[c.item()].append(cal_central_measures(X_train,y_encode[:,c],con_indices))


  return measures                         # store this as data member (latter)


In [131]:
def predict(X_test,measures):
  con_indices,cat_indices = get_con_cat_indices(X_test)
  likely_hood = torch.zeros(len(measures),dtype=torch.float64)
  for key in measures:
    for i in range(con_indices.size(0)):
        mean = measures[key][0][0,i]
        std = measures[key][0][1,i]
        print(NormalDist(mu=mean,sigma = std).pdf(X_test[1,1]))
        likely_hood[key] = likely_hood[key]*NormalDist(mu=mean,sigma = std).pdf(X_test[1,1])
        print(likely_hood)


In [9]:
from sklearn.datasets import load_iris
iris = load_iris()
X = torch.from_numpy(iris.data)
y = torch.from_numpy(iris.target)

In [67]:
X_train,y_train,X_test,y_test = train_test_split(X,y,split_ratio=0.8)

In [132]:
measures = train(X_train,y_train)
predict(X_test,measures)
measures[0][0][:,0]
# print(get_con_cat_indices(X_train))

1.66408437123977e-09
tensor([0., 0., 0.], dtype=torch.float64)
0.32633771071715834
tensor([0., 0., 0.], dtype=torch.float64)
6.48748780161734e-16
tensor([0., 0., 0.], dtype=torch.float64)
1.3205050666794725e-120
tensor([0., 0., 0.], dtype=torch.float64)
1.6045734965206583e-10
tensor([0., 0., 0.], dtype=torch.float64)
1.4594950568942828
tensor([0., 0., 0.], dtype=torch.float64)
0.003984489699844291
tensor([0., 0., 0.], dtype=torch.float64)
2.964540344621994e-14
tensor([0., 0., 0.], dtype=torch.float64)
1.214310279509541e-07
tensor([0., 0., 0.], dtype=torch.float64)
1.0077048457941016
tensor([0., 0., 0.], dtype=torch.float64)
8.313724086150534e-06
tensor([0., 0., 0.], dtype=torch.float64)
0.03886204998595281
tensor([0., 0., 0.], dtype=torch.float64)


tensor([4.9850, 0.3424])

In [None]:
# convert predicted classes to one hot ecoding
# define distribution function
# calculate mean and std for quantitative predictors and probability for qualitative predictors
# multiply the probabilities for each predictor with individual class probability
# assign the class with max prob

In [None]:
NormalDist(mu=45,sigma = 5).pdf(30)

0.0008863696823876014