Student Name : Huaiying Shao

UOW ID : 7910356

In [None]:
# To import relevant libraries
import numpy as np
import pandas as pd
import random
import sys
from random import randrange
from math import sqrt
from math import pi
from math import exp
from math import isnan

In [None]:
# Method to stratify sample the dataset into training and testing
def stratifySampleDataset(df,feature,trainSampleSize,randSeed):
  np.random.seed(randSeed) if randSeed != 0 else 0

  # Create feature groups0
  groups = df.groupby(feature)

  # Create two datasets - trainDataset and testDataset
  trainDataset = pd.DataFrame(columns = df.columns)
  testDataset =  pd.DataFrame(columns = df.columns)

  # Loop through each group and split into train and test datasets
  for groupName, groupData in groups:
      groupSize = len(groupData)
      trainGroupSize = int(trainSampleSize * groupSize)

      # Randomly sample indices for the train set
      trainIndices = np.random.choice(groupData.index, \
                                        size = trainGroupSize, \
                                        replace=False)

      # Create training and testing sets for the current group
      testGroup = groupData.drop(trainIndices)
      trainGroup = groupData.loc[trainIndices]

      # Concatenate the group sets to the overall sets
      trainDataset = pd.concat([trainDataset, trainGroup])
      testDataset = pd.concat([testDataset, testGroup])

  return trainDataset, testDataset


In [None]:
# Calculate the Gaussian probability distribution for continuous features
def calculateGaussianProbability(x, mean, stdev, totalRows):
    if stdev == 0 or isnan(stdev):
        return 1/totalRows # if stdev is 0, return the probability of "Add one count"

    exponent = exp(-((x - mean)**2 / (2 * stdev**2 )))

    if exponent == 0:
        # Set exponent to smallest possible float supported by the system
        exponent = sys.float_info.min

    return (1 / (sqrt(2 * pi) * stdev)) * exponent

In [None]:
# Method to calculate probability for categorical features
def calculateProbability(x, X1, count1, X2, count2, classCount, totalRows):
    if x == X1:
        # If zero frequency occurs, add 1 to count and return the probability
        if count1/classCount == 0:
            return 1/totalRows

        return count1/classCount

    else:
        # If zero frequency occurs, add 1 to count and return the probability
        if count2/classCount == 0:
            return 1/totalRows

        return count2/classCount

In [None]:
# Method to calculate class probability
def calculateClassProbabilities(summaries, row):

    # Get the length of the dataset
    # Sum up all the counts of each label class
    totalRows = sum([summaries[label][0][2] for label in summaries])

    # Instantiate a dictionary to store probability of each label class for a given row
    probabilities = dict()

    # Get the class value: classValue
    # Get the summaries for each class: classSummaries
    for classValue, classSummaries in summaries.items():

        # Get the probability of each label class e.g., If class label 1 has a length of 12345
        # and length of dataset is 234567 then this probability is 12345/234567
        probabilities[classValue] = summaries[classValue][0][2]/float(totalRows)

        #looping through each summaries
        for i in range(len(classSummaries)):
            # if categorical feature
            if len(classSummaries[i]) > 3:
                X1, count1, X2, count2, classCount = classSummaries[i]
                probabilities[classValue] = \
                probabilities[classValue] * calculateProbability(row[i], X1, count1, X2, count2, classCount, totalRows)
            # if continuous feature
            else:
                mean, stdev, _ = classSummaries[i]
                probabilities[classValue] = \
                probabilities[classValue] * calculateGaussianProbability(row[i], mean, stdev, totalRows)

    return probabilities

In [None]:
# Method to make prediction
def predictClass(summaries, row):
    # Storing the probabilities by calling the method
    probabilities = calculateClassProbabilities(summaries, row)

    # Initializing the variables
    bestLabel, bestProb = None, -1

    # Looping through to find the best label with the best probability
    for classVal, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classVal
    return bestLabel

In [None]:
# Method to summarise the dataset
def summarizeDataset(data):
    # Creating a empty dict to store the target class
    summaries = {}
    # For loop to loop the target column that are unique
    for i in data.iloc[:,-1].unique():
        # listing the feature variable of each unique class
        feature = []
        # looping through all the feature variable except the target column
        for j in range(len(data.columns)-1):
            # Storing the size of unique values in a variable
            uniqueValueSize = len(data.iloc[:,j].unique())

            # If categorical feature
            if(uniqueValueSize < 5):
                aList = list()
                # Creating a subset for each class
                df = data[data.iloc[:,-1] == i]

                # looping through the categorical feature for unique vals
                for k in data.iloc[:,j].unique():
                    # storing the unique value and counting the val
                    aList.append(k)
                    aList.append(len(df[df.iloc[:,j] == k]))
                feature.append(tuple([aList[0], aList[1], aList[2], aList[3], len(data[data.iloc[:,-1] == i])]))
            # else continuous feature
            else:
                feature.append((data[data.iloc[:,-1] == i].mean(axis = 0)[j], \
                                data[data.iloc[:,-1] == i].std(axis = 0)[j], len(data[data.iloc[:,-1] == i])))
        # storing the summaries
        summaries[i] = feature
    return summaries

In [None]:
# Method to build the Naive Bayesian Model
def naiveBayesian(trainSet, testSet):
    # Method to get the summary
    summary = summarizeDataset(trainSet)

    # Creating an empty list
    predictions = list()

    #compare len(train) vs len(test)

    #for i in biggerset.index.tolist():
    #    predictclass(summary, row[i])

    # looping through the test set values
    for row in testSet.values:
        output = predictClass(summary, row)
        predictions.append(output)

    return(predictions)

In [None]:
# Calculated as:
# check for equality of predicted value and labels in test_set
# calculates the sum of correct prediction
# divides the sum by length of test_set

def calculate_accuracy(predictions, dataSet):
    yTest = list(dataSet.iloc[:,-1])
    correctCount = 0
    sumError = 0.0
    rsmeError = 0.0

    minLength = min(len(predictions),len(dataSet))


    for i in range(minLength):
        if predictions[i] == yTest[i]:
            correctCount += 1
        sumError += abs(predictions[i] - yTest[i])
        predictionError = abs(predictions[i] - yTest[i])
        rsmeError = (predictionError**2)

    print(f'Number of exact matches in predictions: {correctCount}/{len(yTest)}')
    print(f'Mean Squared Error(MSE): {np.square(np.subtract(yTest[:len(predictions)],predictions)).mean()}')
    print(f'Root Mean Squared Error (RMSE): {sqrt(rsmeError/float(len(yTest)))}')
    print(f'Mean Absolute Error(MSE): {sumError/float(len(yTest))}')

    return (round(correctCount/len(dataSet)*100,3))

# Preprocessing

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Read in the dataset
path = "/content/drive/MyDrive/dataset/iris.data"
df = pd.read_csv(path)

In [None]:
# Show the first 10 rows for understanding of dataset
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
# get the info about the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [None]:
# Check for missing values
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
class           0
dtype: int64

## Naive Bayes algorithm works on numerical values, hence need to encode all categorical columns.

In [None]:
# Encode all categorical features
for key, value in dict(df.dtypes).items():
  if value == 'object':
    dummy = pd.Series(df[key], dtype = 'category')
    df[key] = dummy.cat.codes

df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667,1.0
std,0.828066,0.433594,1.76442,0.763161,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class         150 non-null    int8   
dtypes: float64(4), int8(1)
memory usage: 5.0 KB


(1) use stratified sampling to select ~70% for training and ~30% for test.

In [None]:
# Stratify the dataset into training and testSet
trainSize = 0.7
randSeed = 42
label = 'class'
trainSet, testSet = stratifySampleDataset(df, label, trainSize, randSeed)
trainSet.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,105.0,105.0,105.0,105.0
mean,5.873333,3.050476,3.785714,1.204762
std,0.862941,0.454068,1.782793,0.778853
min,4.3,2.0,1.1,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.2,1.3
75%,6.4,3.3,5.1,1.9
max,7.9,4.4,6.9,2.5


In [None]:
# Show the total number of records for each Class
print(trainSet.groupby(label).size())
print(testSet.groupby(label).size())

class
0    35
1    35
2    35
dtype: int64
class
0    15
1    15
2    15
dtype: int64


In [None]:
# An insight into the test data set to see if any
testSet.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,45.0,45.0,45.0,45.0
mean,5.773333,3.062222,3.695556,1.184444
std,0.7448,0.386293,1.738986,0.733595
min,4.4,2.3,1.0,0.2
25%,5.0,2.9,1.6,0.3
50%,5.8,3.0,4.5,1.4
75%,6.4,3.3,5.1,1.8
max,7.3,4.0,6.3,2.4


# Evaluate the accuracy

In [None]:
# accuracy for training set
# Test the model on training set
trainPred = naiveBayesian(trainSet, testSet)
print('Accuracy of prediction for training set:',calculate_accuracy(trainPred, trainSet))

Number of exact matches in predictions: 17/105
Mean Squared Error(MSE): 0.9555555555555556
Root Mean Squared Error (RMSE): 0.09759000729485333
Mean Absolute Error(MSE): 0.3142857142857143
Accuracy of prediction for training set: 16.19


In [None]:
# accuracy for testing set
# Test the model on testing set
testPred = naiveBayesian(trainSet, testSet)
print('Accuracy of prediction for testing set:', \
      calculate_accuracy(testPred, testSet))

Number of exact matches in predictions: 41/45
Mean Squared Error(MSE): 0.08888888888888889
Root Mean Squared Error (RMSE): 0.0
Mean Absolute Error(MSE): 0.08888888888888889
Accuracy of prediction for testing set: 91.111
