In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

# Read data from DataSets
mean0 = pd.read_csv("data/DS1_m_0.txt", header=None)
mean1 = pd.read_csv("data/DS1_m_1.txt", header=None)
cov = pd.read_csv("data/DS1_cov.txt", header=None)
# Drop end column (not useful to us)
mean0.drop([20], axis=1, inplace=True)
mean1.drop([20], axis=1, inplace=True)
cov.drop([20], axis=1, inplace=True)
# Change to matrices
mean0_m = mean0.as_matrix()[0]
mean1_m = mean1.as_matrix()[0]
cov_m = cov.as_matrix()[0]

# Generate 2000 examples for each class
dataEx = 2000
class0 = pd.DataFrame(np.random.multivariate_normal(mean0_m, cov, dataEx))
class1 = pd.DataFrame(np.random.multivariate_normal(mean1_m, cov, dataEx))
# Add classification column for identification
class0[20] = 0
class1[20] = 1
class0_m = class0.as_matrix()
class1_m = class1.as_matrix()

# Split data into train, valid, test sets
allData = np.concatenate((class0_m, class1_m), axis=0)
np.random.shuffle(allData)
trainSet = pd.DataFrame(allData[0:2400])
validSet = pd.DataFrame(allData[2400:3200])
testSet = pd.DataFrame(allData[3200:4000])

# Save completed datasets
pd.DataFrame(allData).to_csv("dataGenerated/DS1_data.csv", index=False, header=False)
trainSet.to_csv("dataGenerated/DS1_train.csv", index=False, header=False)
validSet.to_csv("dataGenerated/DS1_valid.csv", index=False, header=False)
testSet.to_csv("dataGenerated/DS1_test.csv", index=False, header=False)

# Split training data
trainSet0 = trainSet[trainSet[20] == 0]
trainSet1 = trainSet[trainSet[20] == 1]
testOut = testSet[20]
# Drop end column
trainSet0.drop([20], axis=1, inplace=True)
trainSet1.drop([20], axis=1, inplace=True)
testSet.drop([20], axis=1, inplace=True)
# Get length of each training set
numData0 = len(trainSet0)
numData1 = len(trainSet1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
# Get probability of each training set
allDataPts = numData0 + numData1
prob0 = numData0 / allDataPts
prob1 = numData1 / allDataPts

# Get mean of each column
mean0 = np.array(trainSet0.mean())
mean1 = np.array(trainSet1.mean())

# Compute covariance matrices to get w0 & w1
cov0 = np.array(trainSet0-mean0)
cov1 = np.array(trainSet1-mean1)
s0 = np.matmul(cov0.T, cov0)
s1 = np.matmul(cov1.T, cov1)
cov = (s0 + s1) / allDataPts
cov_inv = np.linalg.inv(cov)
w0 = (math.log(prob0)-math.log(prob1)) - (np.matmul(np.matmul(mean0.T, cov_inv), mean0) - np.matmul(np.matmul(mean1.T, cov_inv), mean1))/2
w1 = np.matmul(cov_inv, (mean0 - mean1))
print("Covariance matrix: ", cov, "\n")
print("w0 = ", w0)
print("w1 = ", w1, "\n")

# Compute GDA model
outPredict = np.matmul(testSet, w1.T) + w0
sigmoid = np.exp(outPredict) / (np.exp(outPredict) + 1)
for i in range(len(sigmoid)):
    if (sigmoid[i] > 0.5):
        sigmoid[i] = 0
    else:
        sigmoid[i] = 1

# Compute confusion matrix
tp, tn, fp, fn = 0, 0, 0, 0
for j in range(len(testOut)):
    predVal = sigmoid[j]
    actualVal = testOut[j]

    # Compare actual to predicted values
    if (predVal == 1 and predVal == actualVal):
        tp += 1
    elif (predVal == 1 and predVal != actualVal):
        fp += 1
    elif (predVal == 0 and predVal == actualVal):
        tn += 1
    elif (predVal == 0 and predVal != actualVal):
        fn += 1
        
confMatrix = [[0, 0], [0, 0]]
confMatrix[0][0] = tp
confMatrix[0][1] = fp
confMatrix[1][0] = fn
confMatrix[1][1] = tn
        
# Compute results
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f_measure = (2 * (precision * recall)) / (precision + recall)

# Display results
print("Confusion Matrix: ", confMatrix, "\n")
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F-measure: ", f_measure)

Covariance matrix:  [[ 7.93913535  5.4639525   6.18541296  5.12986087  5.84825023  6.16126778
   4.58723466  5.3253898   5.05168643  5.14897023  3.96186492  5.27032408
   7.10013063  6.07770207  6.05562005  5.80922934  5.92604086  5.7148829
   5.59337239  5.91432865]
 [ 5.4639525   6.79156059  5.36593405  4.3079233   5.4917599   5.52670572
   4.2674072   3.79760035  4.13309475  4.92972643  3.3868653   4.54708631
   5.83263917  5.15031081  5.41611354  5.07126328  5.63196441  5.07627892
   5.31013018  5.29735717]
 [ 6.18541296  5.36593405  7.39129534  4.93149844  5.88035605  6.64448278
   4.56451724  4.77728556  4.97435374  5.07806388  3.33889309  4.71621316
   6.40153691  5.36505165  6.18015232  5.94953952  6.39516212  5.05161106
   4.77730357  5.15942659]
 [ 5.12986087  4.3079233   4.93149844  5.72055317  5.31735409  4.54193084
   3.61801218  4.23431571  3.4420127   4.15493832  2.78538491  4.10013154
   5.82888469  4.87614688  4.79464715  4.96038323  4.75651672  4.53694658
   3.9104366