<h1>CSE-6363 Machine Learning, Fall 2022<br>Programming Assignment 1<h1>


In [1]:
#importing libraries
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
import numpy as np
import pandas as pd

from csv import reader

In [2]:
# Accuracy between predicted and actual values

def accuracy(act, predicted):
    correct = 0
    temp = []
    for j in range(len(act)):
        if act[j] == predicted[j]:
            correct += 1
        temp.append((act[j]-predicted[j]) ^ 2)
    acc = correct / float(len(act)) * 100.0
    return [acc, sum(temp)]

In [3]:
# Calculate the mean of numbers


def mean(numbers):
    return sum(numbers)/float(len(numbers))

In [4]:
# Calculate the standard deviation of numbers


def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
    return sqrt(variance)

In [5]:
# Calculating the mean, standard dev and count for every column in a dataset


def dataset_summarization(dataset):
    summary = [(mean(col), stdev(col), len(col)) for col in zip(*dataset)]
    del(summary[-1])
    return summary

In [6]:
# Separating the class values 


def sep_class(dataset):
    sep = dict()
    for i in range(len(dataset)):
        class_attr = dataset[i]
        class_val = class_attr[-1]
        if (class_val not in sep):
            sep[class_val] = list()
        sep[class_val].append(class_attr)

    return sep

In [7]:
# Separating the dataset by class and finding statistical values for each row


def summarize_by_class(dataset):
    sep = sep_class(dataset)
    summaries = dict()
    for class_val, row in sep.items():
        summaries[class_val] = dataset_summarization(row)
    return summaries

In [8]:
# Gaussian probability distribution function for Calculating x


def gaussian_pdf(x, mean, stdev):
    if stdev == 0:
        exponent = exp(-(x-mean)**2)
        return (1 / (sqrt(2 * pi))) * exponent
    else:
        exponent = exp(-(x-mean)**2 / (2.0 * stdev**2))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

In [9]:
# Calculating the probabilities for predicting each class for a given row


def class_prob(summaries, row):
    tot_rows = sum([summaries[l][0][2] for l in summaries])
    prob = dict()
    for class_val, class_summ in summaries.items():
        prob[class_val] = summaries[class_val][0][2] / float(tot_rows)
        for i in range(len(class_summ)):
            mean, stdev, _ = class_summ[i]
            prob[class_val] *= gaussian_pdf(row[i], mean, stdev)

    return prob

In [10]:
# predicting  the class value for a given dataset


def predict_class(summaries, row):
    prob = class_prob(summaries, row)
    best_label, best_prob = None, -1
    for class_val, prob in prob.items():
        if best_prob < prob or best_label is None:
            best_prob = prob
            best_label = class_val
    return best_label

In [11]:
# K-fold validation, seperating dataset into k folds


def sep_into_kfold(dataset, no_fold):
    dataset_copy = list(dataset)
    dataset_sep = list()
    size = int(len(dataset) / no_fold)
    for k in range(n_foldataset):
        fold = list()
        while len(fold) < size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_sep.append(fold)
    return dataset_sep

In [12]:
# Naive Bayes Algorithm


def naive_bayes(train, test):
    summarize = summarize_by_class(train)
    predictions = list()
    for row in test:
        output = predict_class(summarize, row)
        predictions.append(output)
    return(predictions)

In [13]:
# Applying cross validation


def cross_val(dataset, algorithm, no_fold, *args):
    scores = list()
    diff = []
    Foldataset = sep_into_kfold(dataset, no_fold)
    for f in Foldataset:
        train_set = list(Foldataset)
        train_set.remove(f)
        train_set = sum(train_set, [])
        test_set = list()
        for r in f:
            r_copy = list(r)
            test_set.append(r_copy)
            r_copy[-1] = None
        pred = algorithm(train_set, test_set, *args)
        act = [r[-1] for r in f]
        acc, k = accuracy(act, pred)
        diff.append(k)
        scores.append(acc)

    mse = 1/len(dataset) * sum(diff)
    return [scores, mse-1]

In [14]:
# Data conversion from categorical to numeric


def str_to_int(dataset, col):
    column = [row[col] for row in dataset]
    unique_val = set(column)
    update_col = dict()
    for i, value in enumerate(unique_val):
        update_col[value] = i + 1
    for row in dataset:
        row[col] = update_col[row[col]]
    print("\t", update_col)

In [15]:
# Convert string column to int


def str_column_to_int(dataset, column):
    for row in dataset:
        row[column] = int(row[column].strip())

In [16]:
# Displaying dataset information


def dataset_info(dataset):
    print("\nColumn\t\tColumn Type")
    for col in range(len(dataset[0])):
        print("\ncolumn", col+1, end="\t")
        if dataset[0][col].isdigit():
            print("Numeric", end="\t")
        else:
            if isinstance(dataset[0][col], str):
                column = [row[col] for row in dataset]
                unique_val = set(column)
                if len(unique_val) <= 5:
                    print("Categorical", end="\t")
                else:
                    print("Object", end="\t")
    print("\n No. of instances: ", len(dataset))

In [17]:
# Loading CSV file


def load_csv(file):
    dataset = list()
    with open(file, 'r') as f:
        csv_reader = reader(f)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset


<h4>1)Applying Naive Bayes algorithm on Hayes Roth Dataset</h4>

In [18]:
seed(1)
fileList = ['C:/Users/siddh/Desktop/Assignment 1/hayes-roth.csv']
for file in fileList:
    dataset = load_csv(file)
    print("\n\nDataset Name: ", file)



Dataset Name:  C:/Users/siddh/Desktop/Assignment 1/hayes-roth.csv


In [19]:
# Displaying shape of the dataset
print("Dataset shape: "+str(len(dataset))+" x "+str(len(dataset[0])))

Dataset shape: 132 x 6


In [20]:
# Displaying dataset information
for file in fileList:
    dataset = load_csv(file)
    print("Dataset Information:-")
    dataset_info(dataset)
for file in fileList:
    dataset = load_csv(file)

Dataset Information:-

Column		Column Type

column 1	Numeric	
column 2	Numeric	
column 3	Numeric	
column 4	Numeric	
column 5	Numeric	
column 6	Numeric	
 No. of instances:  132


In [21]:
# converting String columns to integers
print("\nConverting String columns to integers:- ")
for col in range(0, len(dataset[0])):
    if file == 'hayes-roth.csv':
        str_column_to_int(dataset, col)
    else:
        str_to_int(dataset, col)


Converting String columns to integers:- 
	 {'110': 1, '92': 2, '128': 3, '22': 4, '71': 5, '41': 6, '118': 7, '124': 8, '126': 9, '127': 10, '30': 11, '89': 12, '130': 13, '97': 14, '87': 15, '70': 16, '10': 17, '52': 18, '109': 19, '112': 20, '59': 21, '85': 22, '49': 23, '90': 24, '101': 25, '31': 26, '44': 27, '12': 28, '114': 29, '15': 30, '48': 31, '32': 32, '91': 33, '26': 34, '76': 35, '39': 36, '1': 37, '129': 38, '67': 39, '11': 40, '47': 41, '63': 42, '13': 43, '25': 44, '132': 45, '19': 46, '64': 47, '45': 48, '23': 49, '121': 50, '53': 51, '111': 52, '4': 53, '95': 54, '102': 55, '43': 56, '117': 57, '7': 58, '29': 59, '105': 60, '122': 61, '60': 62, '103': 63, '99': 64, '84': 65, '14': 66, '28': 67, '131': 68, '61': 69, '93': 70, '108': 71, '106': 72, '27': 73, '46': 74, '65': 75, '56': 76, '79': 77, '72': 78, '88': 79, '55': 80, '33': 81, '78': 82, '119': 83, '42': 84, '98': 85, '40': 86, '58': 87, '21': 88, '8': 89, '50': 90, '68': 91, '86': 92, '18': 93, '74': 94, '20'

In [22]:
# evaluating algorithm by seperating the dataset into 10-folds
n_foldataset = 10
scores, mse = cross_val(dataset, naive_bayes, n_foldataset)
print('\nScores:- \n%s' % scores)
print('Mean Accuracy : %.3f%%' % (sum(scores)/float(len(scores))))
print("Mean squared error :", mse)


def get_std_dev(scores):
    n = len(scores)
    mean = sum(scores) / n
    var = sum((x - (sum(scores)/float(len(scores))))**2 for x in scores) / n
    std_dev = var ** 0.5
    return std_dev


print("std_dev :", get_std_dev(scores))


Scores:- 
[76.92307692307693, 69.23076923076923, 76.92307692307693, 46.15384615384615, 69.23076923076923, 69.23076923076923, 69.23076923076923, 61.53846153846154, 61.53846153846154, 61.53846153846154]
Mean Accuracy : 66.154%
Mean squared error : 0.07575757575757569
std_dev : 8.565791327430805


<h4>2)Applying Naive Bayes algorithm on Car Dataset</h4>

In [23]:
seed(1)
fileList = ['C:/Users/siddh/Desktop/Assignment 1/car.csv']
for file in fileList:
    dataset = load_csv(file)
    print("\n\nDataset Name: ", file)



Dataset Name:  C:/Users/siddh/Desktop/Assignment 1/car.csv


In [24]:
# Displaying shape of the dataset
print("Dataset shape: "+str(len(dataset))+" x "+str(len(dataset[0])))

Dataset shape: 1728 x 7


In [25]:
# Displaying dataset information
for file in fileList:
    dataset = load_csv(file)
    print("Dataset Information:-")
    dataset_info(dataset)
for file in fileList:
    dataset = load_csv(file)

Dataset Information:-

Column		Column Type

column 1	Categorical	
column 2	Categorical	
column 3	Numeric	
column 4	Numeric	
column 5	Categorical	
column 6	Categorical	
column 7	Categorical	
 No. of instances:  1728


In [26]:
# converting String columns to integers
print("\nConverting String columns to integers:- ")
for col in range(0, len(dataset[0])):
    if file == 'car.csv':
        str_column_to_int(dataset, col)
    else:
        str_to_int(dataset, col)


Converting String columns to integers:- 
	 {'vhigh': 1, 'low': 2, 'med': 3, 'high': 4}
	 {'vhigh': 1, 'low': 2, 'med': 3, 'high': 4}
	 {'3': 1, '5more': 2, '2': 3, '4': 4}
	 {'more': 1, '2': 2, '4': 3}
	 {'small': 1, 'med': 2, 'big': 3}
	 {'high': 1, 'med': 2, 'low': 3}
	 {'unacc': 1, 'good': 2, 'acc': 3, 'vgood': 4}


In [27]:
# evaluating algorithm by seperating the dataset into 10-folds
n_foldataset = 10
scores, mse = cross_val(dataset, naive_bayes, n_foldataset)
print('\nScores:- \n%s' % scores)
print('Mean Accuracy : %.3f%%' % (sum(scores)/float(len(scores))))
print("Mean squared error :", mse)


def get_std_dev(scores):
    n = len(scores)
    mean = sum(scores) / n
    var = sum((x - (sum(scores)/float(len(scores))))**2 for x in scores) / n
    std_dev = var ** 0.5
    return std_dev


print("std_dev :", get_std_dev(scores))


Scores:- 
[74.4186046511628, 73.83720930232558, 75.5813953488372, 65.11627906976744, 75.0, 79.65116279069767, 75.5813953488372, 69.76744186046511, 72.67441860465115, 75.0]
Mean Accuracy : 73.663%
Mean squared error : 0.427662037037037
std_dev : 3.6958639710029795


<h4>3)Applying Naive Bayes algorithm on Breast-Cancer Dataset</h4>

In [28]:
seed(1)
fileList = ['C:/Users/siddh/Desktop/Assignment 1/breast-cancer.csv']
for file in fileList:
    dataset = load_csv(file)
    print("\n\nDataset Name: ", file)



Dataset Name:  C:/Users/siddh/Desktop/Assignment 1/breast-cancer.csv


In [29]:
# Displaying shape of the dataset
print("Dataset shape: "+str(len(dataset))+" x "+str(len(dataset[0])))

Dataset shape: 286 x 10


In [30]:
# Displaying dataset information
for file in fileList:
    dataset = load_csv(file)
    print("Dataset Information:-")
    dataset_info(dataset)
for file in fileList:
    dataset = load_csv(file)

Dataset Information:-

Column		Column Type

column 1	Categorical	
column 2	Object	
column 3	Categorical	
column 4	Object	
column 5	Object	
column 6	Categorical	
column 7	Numeric	
column 8	Categorical	
column 9	Object	
column 10	Categorical	
 No. of instances:  286


In [31]:
# converting String columns to integers
print("\nConverting String columns to integers:- ")
for col in range(0, len(dataset[0])):
    if file == 'breast-cancer.csv':
        str_column_to_int(dataset, col)
    else:
        str_to_int(dataset, col)


Converting String columns to integers:- 
	 {'recurrence-events': 1, 'no-recurrence-events': 2}
	 {'20-29': 1, '70-79': 2, '40-49': 3, '50-59': 4, '60-69': 5, '30-39': 6}
	 {'premeno': 1, 'lt40': 2, 'ge40': 3}
	 {'15-19': 1, '50-54': 2, '5-9': 3, '20-24': 4, '10-14': 5, '0-4': 6, '30-34': 7, '40-44': 8, '25-29': 9, '35-39': 10, '45-49': 11}
	 {'24-26': 1, '12-14': 2, '9-11': 3, '3-5': 4, '0-2': 5, '15-17': 6, '6-8': 7}
	 {'no': 1, 'yes': 2, '?': 3}
	 {'3': 1, '1': 2, '2': 3}
	 {'right': 1, 'left': 2}
	 {'central': 1, 'right_up': 2, 'left_low': 3, 'left_up': 4, '?': 5, 'right_low': 6}
	 {'no': 1, 'yes': 2}


In [32]:
# evaluating algorithm by seperating the dataset into 10-folds
n_foldataset = 10
scores, mse = cross_val(dataset, naive_bayes, n_foldataset)
print('\nScores:- \n%s' % scores)
print('Mean Accuracy : %.3f%%' % (sum(scores)/float(len(scores))))
print("Mean squared error :", mse)


def get_std_dev(scores):
    n = len(scores)
    mean = sum(scores) / n
    var = sum((x - (sum(scores)/float(len(scores))))**2 for x in scores) / n
    std_dev = var ** 0.5
    return std_dev


print("std_dev :", get_std_dev(scores))


Scores:- 
[71.42857142857143, 78.57142857142857, 67.85714285714286, 64.28571428571429, 75.0, 96.42857142857143, 89.28571428571429, 75.0, 85.71428571428571, 78.57142857142857]
Mean Accuracy : 78.214%
Mean squared error : 0.5629370629370629
std_dev : 9.37457482029049
