In [41]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

# Read data from DataSets
mean0 = pd.read_csv("data/DS1_m_0.txt", header=None)
mean1 = pd.read_csv("data/DS1_m_1.txt", header=None)
cov = pd.read_csv("data/DS1_cov.txt", header=None)
# Drop end column (not useful to us)
mean0.drop([20], axis=1, inplace=True)
mean1.drop([20], axis=1, inplace=True)
cov.drop([20], axis=1, inplace=True)
# Change to matrices
mean0_m = mean0.as_matrix()[0]
mean1_m = mean1.as_matrix()[0]
cov_m = cov.as_matrix()[0]

# Generate 2000 examples for each class
dataEx = 2000
class0 = pd.DataFrame(np.random.multivariate_normal(mean0_m, cov, dataEx))
class1 = pd.DataFrame(np.random.multivariate_normal(mean1_m, cov, dataEx))
# Add classification column
class0[20] = 0
class1[20] = 1
class0_m = class0.as_matrix()
class1_m = class1.as_matrix()

# Split data into train, valid, test sets
allData = np.concatenate((class0_m, class1_m), axis=0)
np.random.shuffle(allData)
trainSet = pd.DataFrame(allData[0:2400])
validSet = pd.DataFrame(allData[2400:3200])
testSet = pd.DataFrame(allData[3200:4000])

# Save completed datasets
pd.DataFrame(allData).to_csv("dataGenerated/DS1_data.csv", index=False, header=False)
trainSet.to_csv("dataGenerated/DS1_train.csv", index=False, header=False)
validSet.to_csv("dataGenerated/DS1_valid.csv", index=False, header=False)
testSet.to_csv("dataGenerated/DS1_test.csv", index=False, header=False)

# Split training data
trainSet0 = trainSet[trainSet[20] == 0]
trainSet1 = trainSet[trainSet[20] == 1]
testOut = testSet[20]
# Drop end column
trainSet0.drop([20], axis=1, inplace=True)
trainSet1.drop([20], axis=1, inplace=True)
testSet.drop([20], axis=1, inplace=True)
# Get length of each training set
numData0 = len(trainSet0)
numData1 = len(trainSet1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [44]:
# Get probability of each training set
allDataPts = numData0 + numData1
prob0 = numData0 / allDataPts
prob1 = numData1 / allDataPts

# Get mean of each column
mean0 = np.array(trainSet0.mean())
mean1 = np.array(trainSet1.mean())

# Compute covariance matrices to get w0 & w1
cov0 = np.array(trainSet0-mean0)
cov1 = np.array(trainSet1-mean1)
s0 = np.matmul(cov0.T, cov0)
s1 = np.matmul(cov1.T, cov1)
cov = (s0 + s1) / allDataPts
cov_inv = np.linalg.inv(cov)
w0 = (math.log(prob0)-math.log(prob1)) - (np.matmul(np.matmul(mean0.T, cov_inv), mean0) - np.matmul(np.matmul(mean1.T, cov_inv), mean1))/2
w1 = np.matmul(cov_inv, (mean0 - mean1))
print("w0 = ", w0)
print("w1 = ", w1, "\n")

# Compute GDA model
outPredict = np.matmul(testSet, w1) + w0
outPredict[outPredict > 0] = 0
outPredict[outPredict < 0] = 1

# Compute confusion matrix
tp, tn, fp, fn = 0, 0, 0, 0
for i in range(len(testOut)):
    predVal = outPredict[i]
    actualVal = testOut[i]

    # Compare actual to predicted values
    if (predVal == 1 and predVal == actualVal):
        tp += 1
    elif (predVal == 1 and predVal != actualVal):
        fp += 1
    elif (predVal == 0 and predVal == actualVal):
        tn += 1
    elif (predVal == 0 and predVal != actualVal):
        fn += 1
        
# Compute results
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f_measure = 2 * (precision * recall) / (precision + recall)

# Display results
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F-measure: ", f_measure)

w0 =  27.8164426431
w1 =  [ 14.42714257  -8.63783728  -6.00465852  -3.06605204  -9.93909933
  -3.98184591  17.3143163  -24.67928237 -29.61238378   8.97642129
 -13.10130264 -12.69421249  16.3167267   13.3892369   -5.7368908
  13.36693986  29.80521057  -6.84008656  -0.74323535  -5.37769998] 

Accuracy:  0.97
Precision:  0.9587378640776699
Recall:  0.9825870646766169
F-measure:  0.9705159705159705
