<a href="https://colab.research.google.com/github/saketh1999/Minimum-Risk-Bayes-Theoretic-classifier/blob/main/Minimum_Risk_Bayes_Theoretic_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DISCRIMINANT ANALYSIS

In this coding assignment you are to implement a minimum-risk Bayes decision-theoretic classifier. Use the training set to estimate each class's parameters (mean vector and covariance matrix) and the validation set to evaluate the classifier's accuracy.

Assume the following:
1. All conditional density functions are multivariate Gaussian
2. Each class has its own covariance matrix
3. Equal prior probabilities
4. 0-1 loss function


## Load datasets

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load training data - 135 observations, 4 features, 3 classes.
# Columns 0-3 hold the features; column 4 holds the class label (1, 2 or 3).
train_df = pd.read_csv("iris_corrupted_training_dataset.csv")
print(train_df.head())
# Later cells index the data positionally, so keep a plain ndarray copy.
train_data = train_df.values

# Load validation data - 15 samples, same column layout as the training set
val_df = pd.read_csv("iris_validation_dataset.csv")
print(val_df.head())
val_data = val_df.values

   sepal_length   sepal_width   petal_length   petal_width   class
0        5.7147        2.6743         3.2696       1.65440       2
1        5.1734        3.7374         5.9442       3.00050       3
2        7.3776        3.1505         3.3543       0.64839       2
3        6.4908        2.3983         3.3917       1.54950       2
4        6.8182        3.4016         4.7495       0.57970       3
   sepal_length   sepal_width   petal_length   petal_width   class
0           4.4           2.9            1.4           0.2       1
1           6.7           3.0            5.2           2.3       3
2           4.9           3.1            1.5           0.2       1
3           5.1           2.5            3.0           1.1       2
4           6.1           3.0            4.6           1.4       2


In [None]:
## Your code goes here ...
# Split the training data by class so each class's mean and covariance can be estimated


def _class_features(data, label):
    """Return the feature columns (0-3) of the rows of `data` whose column 4 equals `label`.

    A boolean row mask yields a 2-D (n_rows_in_class, 4) array directly, so no
    reshape is needed and each class may contain a different number of rows.
    """
    return data[data[:, 4] == label, 0:4]


train_data1 = _class_features(train_data, 1)
train_data2 = _class_features(train_data, 2)
train_data3 = _class_features(train_data, 3)

#  u1, u2, u3 are the 1x4 mean vectors for train_data1, train_data2, train_data3 matrices
#  (keepdims=True keeps them 2-D so they can be used as row vectors later on)
u1 = np.mean(train_data1, axis=0, keepdims=True)
u2 = np.mean(train_data2, axis=0, keepdims=True)
u3 = np.mean(train_data3, axis=0, keepdims=True)
print("--------------------------------------------------------------------------------------------------------------")
print("Mean of each feature with respect to each class")
print(u1.shape)
print(u1)
print(u2.shape)
print(u2)
print(u3.shape)
print(u3)
print("--------------------------------------------------------------------------------------------------------------")

#  dimension cov1, cov2, cov3 must be 4x4
#  np.cov treats each ROW as one variable, hence the transpose to (4, n_rows)
cov1 = np.cov(np.transpose(train_data1))
cov2 = np.cov(np.transpose(train_data2))
cov3 = np.cov(np.transpose(train_data3))
print("Covariance matrix of each class")
print(cov1.shape)
print(cov1)
print(cov2.shape)
print(cov2)
print(cov3.shape)
print(cov3)
print("--------------------------------------------------------------------------------------------------------------")

# Compute the determinant of cov* and its log. These are scalar quantities
det_cov1 = np.linalg.det(cov1)
det_cov2 = np.linalg.det(cov2)
det_cov3 = np.linalg.det(cov3)

log_det_cov1 = np.log(det_cov1)
log_det_cov2 = np.log(det_cov2)
log_det_cov3 = np.log(det_cov3)
print("Log Determinants of covariances")
print(log_det_cov1)
print(log_det_cov2)
print(log_det_cov3)
print("--------------------------------------------------------------------------------------------------------------")

# Inverse covariance matrices, used in the quadratic term of the discriminant
inv_cov1 = np.linalg.inv(cov1)
inv_cov2 = np.linalg.inv(cov2)
inv_cov3 = np.linalg.inv(cov3)
print("Inverse of each covariance matrix")
print(inv_cov1)
print(inv_cov2)
print(inv_cov3)
print("--------------------------------------------------------------------------------------------------------------")

# Equally likely prior prob. (three classes), stored as a log
log_prior = np.log(1/3)

--------------------------------------------------------------------------------------------------------------
Mean of each feature with respect to each class
(1, 4)
[[4.80081778 3.48799556 1.26920989 0.34787733]]
(1, 4)
[[6.06588222 2.82287978 4.26241333 1.10785197]]
(1, 4)
[[6.42966    2.95656956 5.55874667 1.92476547]]
--------------------------------------------------------------------------------------------------------------
Covariance matrix of each class
(4, 4)
[[ 0.73847372 -0.09788292  0.162097    0.09430334]
 [-0.09788292  1.04517177  0.08250472  0.06122466]
 [ 0.162097    0.08250472  0.75386746  0.07747734]
 [ 0.09430334  0.06122466  0.07747734  0.51347455]]
(4, 4)
[[ 1.02666705  0.16051089  0.28736137 -0.10850815]
 [ 0.16051089  0.80414317  0.20221368 -0.07318826]
 [ 0.28736137  0.20221368  0.74048204 -0.04380217]
 [-0.10850815 -0.07318826 -0.04380217  0.69674064]]
(4, 4)
[[1.36272732 0.26608677 0.44568822 0.30336696]
 [0.26608677 1.03934606 0.12853287 0.18437967]
 [0.4456

In [None]:
# Function to find the discriminant function values

def discriminant(val_data_x, mean_vector, inv_cov, log_det, prior):
  """Evaluate the Gaussian (quadratic) discriminant of one class at one sample.

  Computes  prior - 0.5 * (x - mu) inv_cov (x - mu)^T - 0.5 * log_det.
  No term depending only on the feature dimension is included.

  Parameters:
    val_data_x:  feature vector of the sample, shape (d,) or (1, d).
    mean_vector: class mean, shape (d,) or (1, d).
    inv_cov:     inverse of the class covariance matrix, shape (d, d).
    log_det:     log of the determinant of the class covariance matrix.
    prior:       value added as the prior term; it is used as-is, so pass
                 the LOG of the class prior probability.

  Returns:
    The scalar discriminant value.
  """
  # Promote both operands to row vectors so 1-D and (1, d) inputs behave alike
  sub_term = np.atleast_2d(val_data_x) - np.atleast_2d(mean_vector)
  # Quadratic form (1, d) x (d, d) x (d, 1) -> (1, 1)
  term1 = (1/2)*np.matmul(sub_term, np.matmul(inv_cov, np.transpose(sub_term)))
  term2 = log_det/2
  result = prior - term1 - term2
  # Unwrap the (1, 1) matrix into a scalar
  return result[0][0]

In [None]:
# Evaluate the model accuracy with the validation dataset
# The dimension of the validation dataset, val_data, is 15x5. The first four
# columns are the feature columns and the last column is the class label column

correct_class = 0  # number of correctly predicted labels
y_hat = []         # predicted class label of each validation sample, in row order
for i in range(0, len(val_data)):

    x = val_data[i,0:4]
    y = val_data[i,4]

    # Calculating g1, g2, g3: the discriminant value of each class at x
    g1=discriminant(x,u1,inv_cov1,log_det_cov1,log_prior)
    g2=discriminant(x,u2,inv_cov2,log_det_cov2,log_prior)
    g3=discriminant(x,u3,inv_cov3,log_det_cov3,log_prior)

    #  Predict the class whose discriminant is largest. Each score is paired
    #  with its label so that max() compares scores first and, on an exact
    #  tie, falls back to the larger label.
    y_hat.append(max((g1, 1), (g2, 2), (g3, 3))[1])

    #  Compare the prediction with the true label y and count the correctly
    #  predicted labels (correct_class)
    if y_hat[i] == y:
        correct_class = correct_class + 1

# Normalise by the actual number of validation rows so the accuracy stays
# correct if the validation set changes size
print('Classification accuracy = ', '{0:.4f}'. format(correct_class/len(val_data)))

Classification accuracy =  0.9333
