In [None]:
import numpy as np
import pandas as pd
import math
import time

In [None]:
# Load the Iris dataset straight from the course website (the file has no header row)
data = pd.read_csv(
    'http://www.cse.scu.edu/~yfang/coen140/iris.data',
    sep=',',
    header=None,
)
# Preview the first rows, then count the samples per class label (column 4)
print(data.head())
print(data[4].value_counts())

In [None]:
# Break the sample into 80% for training and 20% for testing: within each
# 50-row band the first 40 rows go to training and the last 10 to testing.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# like append, it keeps the original row index.
train_data = pd.concat([data[0:40], data[50:90], data[100:140]])
test_data = pd.concat([data[40:50], data[90:100], data[140:150]])

In [None]:
# Split the training rows by class label (stored in column 4)
setosa = train_data.loc[train_data[4].eq('Iris-setosa')]
versicolor = train_data.loc[train_data[4].eq('Iris-versicolor')]
virginica = train_data.loc[train_data[4].eq('Iris-virginica')]

In [None]:
# Show the first rows of each per-class training frame, one frame after another
print(setosa.head(), versicolor.head(), virginica.head(), sep="\n")



In [None]:
# Remove the label column (4) so only the feature columns remain
train_setosa = setosa.drop(columns=4)
train_versicolor = versicolor.drop(columns=4)
train_virginica = virginica.drop(columns=4)

In [None]:
#probability density function
def PDF(x, mean, cov):
    """Evaluate the multivariate normal density with the given mean/covariance at x.

    Args:
        x: 1-D array-like feature vector of length k (any dtype convertible
            to float, including object-dtype arrays).
        mean: 1-D array-like mean vector of length k.
        cov: (k, k) array-like covariance matrix; must be non-singular.

    Returns:
        float: the density value at ``x``.

    Raises:
        ZeroDivisionError: if the determinant of ``cov`` is exactly zero.
        numpy.linalg.LinAlgError: if ``cov`` cannot be solved against.
    """
    # Work in float64 throughout so object-dtype inputs do not leak into the
    # linear algebra below.
    x = np.asarray(x, dtype=float)
    mean = np.asarray(mean, dtype=float)
    cov = np.asarray(cov, dtype=float)

    k = x.shape[0]
    diff = x - mean
    norm_const = 1.0 / math.sqrt(((2.0 * math.pi) ** k) * np.linalg.det(cov))
    # Quadratic form diff^T cov^{-1} diff as a true scalar: solving the linear
    # system avoids forming the explicit inverse, and float(...) on a 0-d
    # result avoids handing a 1-element array to math.exp (an implicit
    # conversion that NumPy has deprecated).
    mahalanobis_sq = float(diff @ np.linalg.solve(cov, diff))
    return norm_const * math.exp(-0.5 * mahalanobis_sq)

In [None]:
#computing sample mean
def my_means(matrix):
    """Return the per-column sample mean of ``matrix`` as a 1-D NumPy array.

    Args:
        matrix: DataFrame of numeric values, one sample per row.

    Returns:
        numpy.ndarray with one mean per column, in column order.
    """
    n_rows = float(matrix.shape[0])
    # matrix.values.T iterates column by column
    column_means = [column.sum() / n_rows for column in matrix.values.T]
    return np.array(column_means)

In [None]:
#computing sample covariance
def my_cov(X):
    """Return the unbiased sample covariance matrix of the columns of ``X``.

    The matrix is accumulated as the sum of outer products of each row's
    deviation from the column means (from ``my_means``), then divided by
    ``n - 1``.

    Args:
        X: DataFrame of numeric values, one sample per row.

    Returns:
        numpy.ndarray of shape (n_columns, n_columns).
    """
    n_samples, n_features = X.shape
    center = my_means(X)
    scatter = np.zeros((n_features, n_features))
    for i in range(n_samples):
        deviation = X.iloc[i].values - center
        scatter += np.outer(deviation, deviation)
    return scatter / float(n_samples - 1)

In [None]:
# Per-class sample means, keyed by short class name
means = {
    'setosa': my_means(train_setosa),
    'versicolor': my_means(train_versicolor),
    'virginica': my_means(train_virginica),
}

# Per-class sample covariance matrices, keyed the same way
covs = {
    'setosa': my_cov(train_setosa),
    'versicolor': my_cov(train_versicolor),
    'virginica': my_cov(train_virginica),
}

# Unweighted average of the three class covariances
cov_avg = (
    covs['setosa']
    + covs['versicolor']
    + covs['virginica']
) / 3.0

print(means)
print(covs)

In [None]:
#helper function to calculate the classification error rate
def accuracy(classifier, subset):
    """Classify every row of ``subset`` and return the error rate as a string.

    Despite its name, this helper returns the percentage of *misclassified*
    rows (100 minus the accuracy), converted with ``str`` so callers can
    concatenate it directly into a message.

    Args:
        classifier: which model to evaluate:
            1 -- LDA using the module-level ``means`` and ``cov_avg``,
            2 -- QDA using the module-level ``means`` and ``covs``,
            3 -- QDA with independent features, using ``independent_covs``.
        subset: DataFrame whose first four columns (by position) hold the
            features and whose fifth column holds the true class label.

    Returns:
        str: the error rate in percent.

    Raises:
        ValueError: if ``classifier`` is not 1, 2 or 3.
        ZeroDivisionError: if ``subset`` has no rows.
    """
    # Choose the classifier once, before touching the data, so an unknown
    # selector is rejected even when the subset is empty. The lambdas defer
    # the lookup of the module-level models until a prediction is made.
    if classifier == 1:
        predict = lambda features: LDA(features, means, cov_avg)
    elif classifier == 2:
        predict = lambda features: QDA(features, means, covs)
    elif classifier == 3:
        predict = lambda features: QDA(features, means, independent_covs)
    else:
        raise ValueError("Classifier unknown. Please try again")

    n_correct = 0
    for _, row in subset.iterrows():
        # A row taken from a frame that mixes numbers and strings is
        # object-dtype; cast the features to float before doing math on them.
        features = np.asarray(row.iloc[0:4], dtype=float)
        if predict(features) == row.iloc[4]:
            n_correct += 1

    accuracy_pct = n_correct / float(len(subset)) * 100
    return str(100 - accuracy_pct)

In [None]:
#LDA classifier
def LDA(x, mean, avg_cov):
    """Predict the class label of ``x`` using one covariance shared by all classes.

    Each class density is evaluated with ``PDF`` using that class's mean and
    the common ``avg_cov``; the label with the largest density is returned.

    Args:
        x: feature vector to classify.
        mean: dict with keys 'setosa', 'versicolor', 'virginica' mapping to
            the class mean vectors.
        avg_cov: covariance matrix used for every class.

    Returns:
        str: one of 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica'.
    """
    # Keys must match the dataset's label strings exactly, because callers
    # compare the returned key against the true label.
    prob = {
        'Iris-setosa': PDF(x, mean['setosa'], avg_cov),
        'Iris-versicolor': PDF(x, mean['versicolor'], avg_cov),
        'Iris-virginica': PDF(x, mean['virginica'], avg_cov),
    }
    return max(prob, key=prob.get)

In [None]:
#Evaluate LDA error rate on Training Set (accuracy() returns the error percentage)
print("Training error rate for LDA: " + accuracy(1, train_data) + "%")

In [None]:
#Evaluate LDA error rate on Testing Set (accuracy() returns the error percentage)
print("Testing error rate for LDA: " + accuracy(1, test_data) + "%")

In [None]:
#QDA classifier
def QDA(x, mean, covs):
    """Predict the class label of ``x`` using a separate covariance per class.

    Each class density is evaluated with ``PDF`` using that class's own mean
    and covariance matrix; the label with the largest density is returned.

    Args:
        x: feature vector to classify.
        mean: dict with keys 'setosa', 'versicolor', 'virginica' mapping to
            the class mean vectors.
        covs: dict with the same keys mapping to the class covariance
            matrices.

    Returns:
        str: one of 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica'.
    """
    # Each class uses its own covariance matrix (not the whole dict), and the
    # keys must match the dataset's label strings exactly because callers
    # compare the returned key against the true label.
    prob = {
        'Iris-setosa': PDF(x, mean['setosa'], covs['setosa']),
        'Iris-versicolor': PDF(x, mean['versicolor'], covs['versicolor']),
        'Iris-virginica': PDF(x, mean['virginica'], covs['virginica']),
    }
    return max(prob, key=prob.get)

In [None]:
#QDA error rate for training data (accuracy() returns the error percentage)
print("Training error rate for QDA: " + accuracy(2, train_data) + "%")

In [None]:
#QDA error rate for testing data (accuracy() returns the error percentage)
print("Testing error rate for QDA: " + accuracy(2, test_data) + "%")

In [None]:
#checking if any class is linearly separable

categories = ['Iris-setosa','Iris-versicolor','Iris-virginica']

# run LDA on each separate class (over all rows of that class in `data`)
for category in categories:
    flower_class = data[data[4] == category]
    n_correct = 0
    for _, row in flower_class.iterrows():
        # rows of the mixed-type frame are object-dtype; cast features to float
        x = np.asarray(row.iloc[0:4], dtype=float)
        actual = row.iloc[4]
        predicted = LDA(x, means, cov_avg)
        if predicted == actual:
            n_correct += 1
    # These names are deliberately not `accuracy`/`error`: assigning to
    # `accuracy` at module level would overwrite the accuracy() helper and
    # break every later cell that calls it.
    class_accuracy = (n_correct/float(len(flower_class)) * 100)
    class_error = 100 - class_accuracy
    print(category, "error rate:", class_error, "%")

In [None]:
#assume features are independent

# Convert each class covariance matrix to a diagonal matrix: keep the
# variances on the diagonal and set every off-diagonal covariance to zero.
independent_covs = {}
for category, cov in covs.items():
    # The inner np.diag extracts the diagonal as a 1-D array; the outer
    # np.diag rebuilds a square matrix with that diagonal and zeros elsewhere.
    independent_covs[category] = np.diag(np.diag(cov))
    print(independent_covs[category])

In [None]:
#calculate time & error rates for QDA
# accuracy() returns the error percentage as a string; time.perf_counter() is
# a monotonic clock intended for measuring short intervals.
qda_start_time = time.perf_counter()
print("Training error rate for QDA: " + accuracy(2, train_data) + "%")
print("Testing error rate for QDA: " + accuracy(2, test_data) + "%")
print("Time taken for QDA:", (time.perf_counter() - qda_start_time) * 1000, "ms\n")

# calculate the time and error rates for QDA with independent features
independent_start_time = time.perf_counter()
print("Training error rate for QDA with independent features : " + accuracy(3, train_data) + "%")
print("Testing error rate for QDA with independent features : " + accuracy(3, test_data) + "%")
print("Time taken for QDA with independent features:", (time.perf_counter() - independent_start_time) * 1000, "ms")