<a href="https://colab.research.google.com/github/sagar9926/MTech_Atificial_Intelligence/blob/main/ML1/ML_Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Question 1 

## Dataset : Iris Dataset

Use sklearn library for loading iris dataset.

__Aim__: Classification using Naive Bayes classifier

```
● Apply Naive bayes classifier assuming all features are independent.
Do not use any predefined library for classification
Report overall accuracy, class wise accuracy, confusion matrix and ROC curve.
```

In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from functools import reduce

### Loading Dataset

In [173]:
iris = datasets.load_iris()
df_iris = pd.DataFrame(iris.data,columns = iris.feature_names)
df_iris['target'] = iris.target

In [174]:
X, y = df_iris.iloc[:, :-1], df_iris.iloc[:, -1]

# # split on train and test 0.7/0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=1, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(75, 4) (75,)
(75, 4) (75,)


In [175]:
df_iris.target.unique()

array([0, 1, 2])

### Creating a Naive Bayes class

In [189]:
class NaiveBayesClassifier:

  """
  Bayes Theorem : 

  P(y|X) = (P(X|y)*P(y))/P(X)

  where :
  X : Input Features
  y : Traget Variable
  P(y|X) = Posterior Probability
  P(X|y) = Liklihood
  P(y) = Prior
  P(X) = Evidence

  """

  def __init__(self,prior = None):

    self.liklihood = {}
    self.posteriors = []
    self.prediction = []
    self.mean = None
    self.variance = None 
    if prior :
      self.prior = prior
    else :
      self.prior = {} 
 
  def calculate_prior(self ,target ,classes):
    """
    This function calculates the prior probabilities
    """
    prior = {}
    for index in classes:
      prior[index] = (target == index).sum()/len(target) 
    return prior


  def feature_statistics(self , features , target):
    """
    This function calculates mean and variance for continuous features
    """
    var = lambda x : np.var(x)  
    mean = features.groupby(y_train).agg({'sepal length (cm)' : 'mean',	'sepal width (cm)': 'mean',	'petal length (cm)': 'mean',	'petal width (cm)': 'mean'}).reset_index()
    variance = features.groupby(y_train).agg({'sepal length (cm)' : var,	'sepal width (cm)': var,	'petal length (cm)': var,	'petal width (cm)': var}).reset_index()
    
    return mean , variance


  def gaussian_probability(self,test_feature,classes, mean, variance):
    liklihood_prob = {}
    for index in classes:
      mean_vector = np.array(mean[mean['target'] == index])[0][1:]
      variance_vector = np.array(variance[variance['target'] == index])[0][1:]
      mean_diff_square = (np.array(test_feature) - mean_vector)**2
      liklihood_prob[index] = (1/np.sqrt(2*np.pi*variance_vector)*np.exp(-1*0.5*mean_diff_square/variance_vector))
    return liklihood_prob


  def calculate_posterior(self,test_feature,classes):

    posteriors = {}
    self.liklihood = self.gaussian_probability(test_feature ,classes ,self.mean,self.variance)
    for index in classes :
      posteriors[index] = reduce(lambda x , y : x*y , self.liklihood[index]) * self.prior[index]
    return posteriors


  def fit(self, features, target):
    self.classes = target.unique()
    self.mean , self.variance = self.feature_statistics(features , target)
    
    if self.prior :
      self.prior = self.calculate_prior(target , self.classes)

  def predict(self,test_features):
    for i in range(len(test_features)):
      self.posteriors.append(self.calculate_posterior(test_features.iloc[i].values,self.classes))
      self.prediction.append(sorted(self.posteriors[i].items(),key = lambda x : x[1],reverse = True)[0][0])
    #return(self.prediction)

  def accuracy(self , target,names):

    overall_accuracy = sum(target == self.prediction) / len(target)
    print(f"Overall accuracy of data : {round(overall_accuracy*100,2)}%")

    data = pd.DataFrame({'Actual Class' : target.values,'Predicted Class':self.prediction})
    for class_ in target.unique():
      temp = data[data['Actual Class'] == class_]
      print(f"{names[class_]} accuracy of data : {round(sum(temp['Actual Class'] == temp['Predicted Class']) / len(temp)*100,2)}%")



### Model Training

In [190]:
model = NaiveBayesClassifier()
model.fit(X_train, y_train)

### Making predictions using trained model

In [191]:
model.predict(X_test) 

In [192]:
model.accuracy(y_test,iris.target_names)

Overall accuracy of data : 97.33%
virginica accuracy of data : 96.0%
setosa accuracy of data : 100.0%
versicolor accuracy of data : 96.0%


# Question 2 :

__Dataset__: Wine dataset (use sklearn library for loading the dataset)

__Aim__: Naive Bayes Classification
```
Shuffle the data with seed value 42 and perform a 70- 30 stratified split of the data into a train and test set.
Also, plot the class-wise distribution of data in the train and test set (one for train set and one for test set).

Compare the distributions. Now, perform classification as follows:

* Train a Gaussian Naive Bayes classifier and report (a) the class priors, (b) mean and variance of
each feature per class.

* Train another Gaussian Naive Bayes classifier by setting prior probability for the classes. Repeat this
experiment by setting priors in the ratios: (a) 40-40-20 and (b) 80-10-10.
```

In [202]:
wine = datasets.load_wine()
df_wine = pd.DataFrame(wine.data,columns = wine.feature_names)
df_wine['target'] = wine.target

In [203]:
X, y = df_wine.iloc[:, :-1], df_wine.iloc[:, -1]

# # split on train and test 0.7/0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=1, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(89, 13) (89,)
(89, 13) (89,)


In [204]:
df_wine.target.unique()

array([0, 1, 2])

### Model Training

In [205]:
model = NaiveBayesClassifier()
model.fit(X_train, y_train)

SpecificationError: ignored

### Making predictions using trained model

In [None]:
model.predict(X_test) 

In [None]:
model.accuracy(y_test,iris.target_names)

Overall accuracy of data : 97.33%
virginica accuracy of data : 96.0%
setosa accuracy of data : 100.0%
versicolor accuracy of data : 96.0%
