In [1]:
import numpy as np
import random
import math
import csv

In [2]:
# we use this method to separate the positive and negative data in training, validation and test sets
def separateDataSet(positive, negative):
    """Split labelled examples into shuffled training, validation and test sets.

    Each class is shuffled and cut 60% / 20% / 20% on its own, then the two
    classes are merged and every resulting set is shuffled again.

    Parameters
    ----------
    positive, negative : list of 1-D arrays or 2-D array
        Examples of each class, one example per row. All rows must have the
        same length. The inputs are not modified.

    Returns
    -------
    train, validation, test : numpy.ndarray
        2-D arrays whose rows are examples taken from both classes.
    """
    def splitOneClass(samples):
        samples = np.asarray(samples)
        # Index with a permutation instead of calling random.shuffle: on a 2-D
        # numpy array random.shuffle swaps rows through views, which silently
        # duplicates some rows and loses others.
        shuffled = samples[np.random.permutation(len(samples))]
        # cutoffs are computed per class, so classes of different sizes are
        # still split 60/20/20
        trainCutoff = int(len(shuffled) * 0.6)
        validCutoff = int(len(shuffled) * 0.8)
        return (shuffled[:trainCutoff],
                shuffled[trainCutoff:validCutoff],
                shuffled[validCutoff:])

    negTrain, negValid, negTest = splitOneClass(negative)
    posTrain, posValid, posTest = splitOneClass(positive)

    test = np.concatenate([negTest, posTest])
    train = np.concatenate([negTrain, posTrain])
    validation = np.concatenate([negValid, posValid])
    # mix the two classes inside each set (np.random.shuffle permutes rows in place)
    np.random.shuffle(validation)
    np.random.shuffle(test)
    np.random.shuffle(train)

    return train, validation, test


In [3]:
# we use this helper method to load the DS1 or DS2 data from the training, validation and test csv files
def loadDataSet(number):
    """Load the training, validation and test sets of data set `number`.

    Reads ./DS<number>_training.csv, ./DS<number>_validation.csv and
    ./DS<number>_test.csv (in that order) and converts every field to float.

    Parameters
    ----------
    number : int or str
        Identifier inserted in the file names (e.g. 1 for DS1).

    Returns
    -------
    train, validation, test : numpy.ndarray
        One float array per file, one csv row per array row.
    """
    def readMatrix(suffix):
        # newline='' is what the csv module expects for files handed to csv.reader
        with open("./DS" + str(number) + suffix, 'r', newline='') as file:
            # blank rows (e.g. produced by a csv written without newline='' on
            # Windows) are skipped so that the array stays rectangular
            rows = [row for row in csv.reader(file) if row]
        # the builtin float replaces the np.float alias, which was removed from numpy
        return np.array(rows).astype(float)

    train = readMatrix("_training.csv")
    validation = readMatrix("_validation.csv")
    test = readMatrix("_test.csv")

    return train, validation, test


In [4]:
# once we generated the data, we use this method to store all the information in csv files
def saveData(number, train, validation, test):
    """Store the three sets of data set `number` as csv files.

    Writes DS<number>_test.csv, DS<number>_validation.csv and
    DS<number>_training.csv in the current directory, one example per row.
    Existing files are overwritten.

    Parameters
    ----------
    number : int or str
        Identifier inserted in the file names.
    train, validation, test : sequence of rows
        The examples to store in the corresponding file.
    """
    targets = (
        ("_test.csv", test),
        ("_validation.csv", validation),
        ("_training.csv", train),
    )
    for suffix, rows in targets:
        # newline='' lets the csv writer control line endings (otherwise every
        # row is followed by an empty line on Windows); the with block closes
        # the file even if writing fails
        with open("DS" + str(number) + suffix, 'w', newline='') as file:
            writer = csv.writer(file)
            for row in rows:
                writer.writerow(row)

In [5]:
# this method will perform the Gaussian discriminant analysis on the DS1 or DS2 depending on the number provided
# it will use maximum likelihood method to learn parameters
def GDA(train, validation, test, number):
    """Fit a shared-covariance Gaussian discriminant model and score it on `test`.

    The class means and the covariance matrix are learned by maximum likelihood
    from `train`. Both classes are assumed to be equally likely (the data sets
    are built with the same number of examples per class), so the shared
    covariance is 0.5 * S0 + 0.5 * S1 and no class prior enters the decision.

    The learned means are written to DS<number>_LMeans.csv (class 0 first, then
    class 1) and the covariance matrix to DS<number>_Lcov.csv.

    Parameters
    ----------
    train, test : 2-D array
        One example per row; the last column is the class label. A label equal
        to 1 means class 1, any other label is treated as class 0 when learning.
    validation : unused
        Kept so that the signature matches KNN.
    number : int or str
        Identifier inserted in the output file names.

    Returns
    -------
    dict
        Keys "accuracy " (the trailing space is kept for backward
        compatibility), "precision", "recall" and "fmeasure". A metric whose
        denominator is zero is reported as 0.0.

    Raises
    ------
    ValueError
        If the inputs are not non-empty 2-D arrays or a class has no training
        example.
    numpy.linalg.LinAlgError
        If the learned covariance matrix is singular.
    """
    train = np.asarray(train, dtype=float)
    test = np.asarray(test, dtype=float)
    if train.ndim != 2 or test.ndim != 2 or len(test) == 0:
        raise ValueError("train and test must be non-empty 2-D arrays")

    # split the training features by class; the label column is dropped
    features = train[:, :-1]
    isClass1 = train[:, -1] == 1
    class1 = features[isClass1]
    class0 = features[~isClass1]
    if len(class0) == 0 or len(class1) == 0:
        raise ValueError("training data must contain examples of both classes")

    # to learn the means, we average over all vectors that belong to a class
    LMean0 = class0.mean(axis=0)
    LMean1 = class1.mean(axis=0)

    # S0 and S1 are the per-class averages of the outer products of the centred vectors
    diff0 = class0 - LMean0
    diff1 = class1 - LMean1
    S0 = np.dot(diff0.T, diff0) / len(class0)
    S1 = np.dot(diff1.T, diff1) / len(class1)

    # with equal class probabilities the learned covariance is 0.5*S0 + 0.5*S1
    Lcov = 0.5 * S0 + 0.5 * S1

    # the inverse does not depend on the test point, so it is computed once
    inv = np.linalg.inv(Lcov)

    def logLikelihood(mean):
        # Log of the Gaussian density up to the normalisation constant, which is
        # identical for both classes. Comparing logs avoids the underflow of
        # exp() that made both densities 0.0 for points far from the means.
        diff = test[:, :-1] - mean
        return -0.5 * np.einsum("ij,jk,ik->i", diff, inv, diff)

    # we select the class with the highest likelihood; ties go to class 0
    predicted1 = logLikelihood(LMean1) > logLikelihood(LMean0)
    actual = test[:, -1]
    truePos = int(np.sum(predicted1 & (actual == 1)))
    falsePos = int(np.sum(predicted1 & (actual != 1)))
    trueNeg = int(np.sum(~predicted1 & (actual == 0)))
    falseNeg = int(np.sum(~predicted1 & (actual != 0)))

    def ratio(numerator, denominator):
        # an undefined metric (zero denominator) is reported as 0.0
        return numerator / denominator if denominator else 0.0

    # now we have all the values, we can compute accuracy, precision, recall and F-Measure
    accuracy = (truePos + trueNeg) / len(test)
    precision = ratio(truePos, truePos + falsePos)
    recall = ratio(truePos, truePos + falseNeg)
    Fmeasure = ratio(2 * precision * recall, precision + recall)

    with open("DS" + str(number)+"_LMeans.csv", 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(LMean0)
        writer.writerow(LMean1)

    with open("DS" + str(number)+"_Lcov.csv", 'w', newline='') as file:
        writer = csv.writer(file)
        for row in Lcov:
            writer.writerow(row)

    return {"accuracy ": accuracy, "precision" : precision, "recall" :recall, "fmeasure" :Fmeasure}

In [6]:
# this method performs the K-nearest-neighbour algorithm on the data sets provided
# we test a range of k values, and store the validation results for each k value in a csv file
# we then select the k value that leads to the best validation F-measure, evaluate it on the test data
# and return the resulting metrics


def KNN(train, validation, test, number):
    """Select k for a k-nearest-neighbour classifier and score it on `test`.

    Every k from 1 to 99 (capped at the number of training examples) is
    evaluated on `validation`; the metrics of each k are written to
    DS<number>_KNNresult.csv. The first k reaching the best validation
    F-measure is then used to classify `test`.

    A point is predicted as class 1 when more than half of its k nearest
    training points (euclidean distance) carry label 1; a tie predicts class 0.

    Parameters
    ----------
    train, validation, test : 2-D array
        One example per row; the last column is the class label (0 or 1).
    number : int or str
        Identifier inserted in the result file name.

    Returns
    -------
    dict
        Keys "best_k_value", "precision", "recall" and "fmeasure" (measured on
        `test`). A metric whose denominator is zero is reported as 0.0.

    Raises
    ------
    ValueError
        If an input is not a 2-D array or `train` is empty.
    """
    train = np.asarray(train, dtype=float)
    validation = np.asarray(validation, dtype=float)
    test = np.asarray(test, dtype=float)
    if train.ndim != 2 or validation.ndim != 2 or test.ndim != 2 or len(train) == 0:
        raise ValueError("train, validation and test must be 2-D arrays and train must not be empty")

    trainPoints = train[:, :-1]  # we remove the class label
    trainLabels = train[:, -1]

    # k cannot exceed the number of training points (the original range was 1..99)
    maxK = min(99, len(train))

    # labels of the maxK closest training points of every validation point; the
    # running sum along a row gives the number of class-1 neighbours for each k
    validationVotes = np.cumsum(
        _knnNearestLabels(validation[:, :-1], trainPoints, trainLabels, maxK), axis=1)

    # bestk starts at 1 (and the best score below any real score) so that a
    # usable k is returned even when every validation F-measure is 0
    bestfvalue = -1.0
    bestk = 1
    with open("DS" + str(number) + "_KNNresult.csv", 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["k", "F_measure", "precision", "recall"])
        for k in range(1, maxK + 1):
            # predict 1 when the average label of the k closest neighbours is above 0.5
            predicted = validationVotes[:, k - 1] / k > 0.5
            precision, recall, fMeasure = _knnScores(predicted, validation[:, -1])
            writer.writerow([k, fMeasure, precision, recall])

            if fMeasure > bestfvalue:
                bestfvalue = fMeasure
                bestk = k

    # now we evaluate the performance on the test set with the best k value found on the validation set
    testVotes = _knnNearestLabels(test[:, :-1], trainPoints, trainLabels, bestk).sum(axis=1)
    precision, recall, fmeasure = _knnScores(testVotes / bestk > 0.5, test[:, -1])

    return {"best_k_value":bestk, "precision" : precision, "recall" :recall, "fmeasure" :fmeasure}


def _knnNearestLabels(queries, trainPoints, trainLabels, count):
    """Return, for every query row, the labels of its `count` nearest training points, closest first."""
    nearest = np.empty((len(queries), count), dtype=float)
    for row, x in enumerate(queries):
        distances = np.sqrt(((trainPoints - x) ** 2).sum(axis=1))
        # a stable sort keeps equally distant points in training order
        nearest[row] = trainLabels[np.argsort(distances, kind="stable")[:count]]
    return nearest


def _knnScores(predictedPositive, actual):
    """Return (precision, recall, F-measure) of boolean predictions; undefined ratios become 0.0."""
    truePos = int(np.sum(predictedPositive & (actual == 1)))
    falsePos = int(np.sum(predictedPositive & (actual != 1)))
    falseNeg = int(np.sum(~predictedPositive & (actual != 0)))
    precision = truePos / (truePos + falsePos) if truePos + falsePos else 0.0
    recall = truePos / (truePos + falseNeg) if truePos + falseNeg else 0.0
    fMeasure = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, fMeasure

In [7]:
# this method fetches the data from the data sets provided 
# it then generates data points using the mean and covariances loaded
def Q1(samplesPerClass=2000):
    """Generate data set DS1 and store it in csv files.

    Loads the two class means and the shared covariance matrix from
    ./hwk2_datasets/DS1_m_0.txt, DS1_m_1.txt and DS1_Cov.txt, draws
    `samplesPerClass` examples per class from the corresponding Gaussian,
    appends the class label (0 for the m_0 Gaussian, 1 for the m_1 Gaussian)
    to every example, splits the examples with separateDataSet and stores the
    result with saveData(1, ...).

    Parameters
    ----------
    samplesPerClass : int, optional
        Number of examples drawn for each class (default 2000).
    """
    def parseLine(line):
        # The last comma-separated item of every line is dropped, as before
        # (presumably the lines end with a trailing comma - TODO confirm
        # against the data files). The builtin float replaces the np.float
        # alias, which was removed from numpy.
        return np.array(line.split(',')[:-1]).astype(float)

    def readMean(path):
        with open(path) as file:
            return parseLine(file.readlines()[0])

    m0 = readMean("./hwk2_datasets/DS1_m_0.txt")
    m1 = readMean("./hwk2_datasets/DS1_m_1.txt")

    # the with block closes the covariance file, which was previously left open
    with open("./hwk2_datasets/DS1_Cov.txt") as file:
        # blank lines are skipped so that the matrix stays rectangular
        cov = [parseLine(line) for line in file if line.strip()]

    neg = np.random.multivariate_normal(m0, cov, samplesPerClass)
    pos = np.random.multivariate_normal(m1, cov, samplesPerClass)

    # we add the label that indicates the class to which the example belongs;
    # the examples stay in python lists, which random.shuffle reorders safely
    negative = [np.append(example, 0) for example in neg]
    positive = [np.append(example, 1) for example in pos]

    # and separate the examples in training, validation and test sets
    # 60% training, 20% validation, 20% test
    train, validation, test = separateDataSet(positive, negative)

    saveData(1, train, validation, test)


In [8]:
# this method is similar to Q1 however we choose different gaussians with certain probabilities when 
# generating the data
def Q4(samplesPerClass=2000):
    """Generate data set DS2 and store it in csv files.

    Every class is a mixture of three Gaussians. For i = 1..3 the means are
    loaded from ./hwk2_datasets/DS2_c1_m<i>.txt (positive class) and
    DS2_c2_m<i>.txt (negative class), and the covariance matrix of component i
    from DS2_Cov<i>.txt. Components 1, 2 and 3 are selected with probability
    0.10, 0.42 and 0.48. The labelled examples (1 = positive, 0 = negative)
    are split with separateDataSet and stored with saveData(2, ...).

    Parameters
    ----------
    samplesPerClass : int, optional
        Number of examples drawn for each class (default 2000).
    """
    def readRows(path):
        # The last comma-separated item of every line is dropped, as before
        # (presumably the lines end with a trailing comma - TODO confirm
        # against the data files). The builtin float replaces the np.float
        # alias, which was removed from numpy. Blank lines are skipped.
        with open(path) as file:
            return [np.array(line.split(',')[:-1]).astype(float)
                    for line in file if line.strip()]

    c1means = []  # positive
    c2means = []  # negative
    covMatrices = []
    for i in range(3):
        c1means.append(readRows("./hwk2_datasets/DS2_c1_m" + str(i+1) + ".txt")[0])
        c2means.append(readRows("./hwk2_datasets/DS2_c2_m" + str(i+1) + ".txt")[0])
        covMatrices.append(readRows("./hwk2_datasets/DS2_Cov"+str(i+1) + ".txt"))

    # probability of selecting each Gaussian component (index 0, 1, 2)
    mixtureProbabilities = [0.10, 0.42, 0.48]

    # now we generate the sampling data: for each point, we select the gaussian
    # mean and covariance matrix with the probabilities above
    positive = []
    negative = []
    for i in range(samplesPerClass):
        # as before, one component is drawn per iteration and used for both the
        # positive and the negative example
        gaussSelected = np.random.choice(3, p=mixtureProbabilities)
        pos = np.random.multivariate_normal(c1means[gaussSelected], covMatrices[gaussSelected], 1)[0]
        neg = np.random.multivariate_normal(c2means[gaussSelected], covMatrices[gaussSelected], 1)[0]
        positive.append(np.append(pos, 1))
        negative.append(np.append(neg, 0))

    # The examples are handed over as python lists on purpose: shuffling a 2-D
    # numpy array with random.shuffle swaps rows through views, which
    # duplicates some examples and loses others.
    train, validation, test = separateDataSet(positive, negative)
    saveData(2, train, validation, test)


In [9]:
# Question 1
# Seed both random generators used by the data generation (python's random and
# numpy's global generator) so that the DS1 csv files can be reproduced.
random.seed(0)
np.random.seed(0)
Q1()

In [10]:
# Question 2
# Load DS1 back from its csv files, fit the GDA model on the training set and
# print the metrics measured on the test set. GDA also writes the learned
# parameters to DS1_LMeans.csv and DS1_Lcov.csv.
train, validation, test = loadDataSet(1)
print(GDA(train, validation, test,1))

{'accuracy ': 0.96625, 'precision': 0.9745547073791349, 'recall': 0.9575, 'fmeasure': 0.9659520807061791}


In [11]:
# Question 3
# Run k-nearest-neighbour on DS1: k is selected on the validation set (the
# per-k validation metrics are written to DS1_KNNresult.csv) and the printed
# metrics are measured on the test set.
# NOTE(review): in the recorded run KNN scores far below GDA on the same data
# (fmeasure ~0.51 vs ~0.97).
train, validation, test = loadDataSet(1)
print(KNN(train, validation, test,1))

{'best_k_value': 33, 'precision': 0.5532544378698225, 'recall': 0.4675, 'fmeasure': 0.5067750677506775}


In [12]:
# Question 4
# Seed both random generators used by the data generation (python's random and
# numpy's global generator) so that the DS2 csv files can be reproduced.
random.seed(0)
np.random.seed(0)
Q4()

In [13]:
# Question 5
# 1) Fit the GDA model on DS2 (loaded back from its csv files) and print the
#    metrics measured on the test set. GDA also writes DS2_LMeans.csv and
#    DS2_Lcov.csv.

train, validation, test = loadDataSet(2)
print(GDA(train, validation, test,2))



{'accuracy ': 0.5075, 'precision': 0.5081081081081081, 'recall': 0.47, 'fmeasure': 0.4883116883116883}


In [None]:
# 2-3) Run k-nearest-neighbour on DS2: k is selected on the validation set (the
#      per-k validation metrics are written to DS2_KNNresult.csv) and the
#      printed metrics are measured on the test set.
# NOTE(review): no output is recorded for this cell - it appears not to have
# been run to completion; confirm before relying on DS2 KNN results.

train, validation, test = loadDataSet(2)
print(KNN(train, validation, test,2))