In [17]:
import numpy as np
import pandas as pd
import math

# Read data from DataSets
mean0 = pd.read_csv("data/DS1_m_0.txt", header=None)
mean1 = pd.read_csv("data/DS1_m_1.txt", header=None)
cov = pd.read_csv("data/DS1_Cov.txt", header=None)
# Drop end column (not useful to us)
mean0.drop([20], axis=1, inplace=True)
mean1.drop([20], axis=1, inplace=True)
cov.drop([20], axis=1, inplace=True)
# Change to matrices
mean0_m = mean0.as_matrix()[0]
mean1_m = mean1.as_matrix()[0]
cov_m = cov.as_matrix()[0]

# Generate 2000 examples for each class
dataEx = 2000
class0 = pd.DataFrame(np.random.multivariate_normal(mean0_m, cov, dataEx))
class1 = pd.DataFrame(np.random.multivariate_normal(mean1_m, cov, dataEx))
# Add classification column and convert to matrix
class0[20] = 0
class1[20] = 1
class0_m = class0.as_matrix()
class1_m = class1.as_matrix()

# Split data into train, valid, test sets
allData = np.concatenate((class0_m, class1_m), axis=0)
np.random.shuffle(allData)
trainSet = pd.DataFrame(allData[0:2400])
validSet = pd.DataFrame(allData[2400:3200])
testSet = pd.DataFrame(allData[3200:4000])

### 1. Save completed datasets ###
pd.DataFrame(allData).to_csv("dataGenerated/DS1_data.csv", index=False, header=False)
trainSet.to_csv("dataGenerated/DS1_train.csv", index=False, header=False)
validSet.to_csv("dataGenerated/DS1_valid.csv", index=False, header=False)
testSet.to_csv("dataGenerated/DS1_test.csv", index=False, header=False)

In [20]:
### 2. GDA Approach ###

# Split training data
trainSet0 = trainSet[trainSet[20] == 0]
trainSet1 = trainSet[trainSet[20] == 1]
testOut = testSet[20]
# Drop end column
trainSet0.drop([20], axis=1, inplace=True)
trainSet1.drop([20], axis=1, inplace=True)
trainSet.drop([20], axis=1, inplace=True)
testSet.drop([20], axis=1, inplace=True)

# Get length of each training set
numData0 = len(trainSet0)
numData1 = len(trainSet1)
# Create nd.array versions of data sets
trainSet_arr = trainSet.as_matrix()
testSet_arr = testSet.as_matrix()

# Get probability of each training set
allDataPts = numData0 + numData1
prob0 = numData0 / allDataPts
prob1 = numData1 / allDataPts

# Get mean of each column
mean0 = np.array(trainSet0.mean())
mean1 = np.array(trainSet1.mean())

# Compute covariance matrices to get w0 & w1
cov0 = np.array(trainSet0-mean0)
cov1 = np.array(trainSet1-mean1)
s0 = np.matmul(cov0.T, cov0)
s1 = np.matmul(cov1.T, cov1)
cov = (s0 + s1) / allDataPts
cov_inv = np.linalg.inv(cov)
w0 = (math.log(prob0)-math.log(prob1)) - (np.matmul(np.matmul(mean0.T, cov_inv), mean0) - np.matmul(np.matmul(mean1.T, cov_inv), mean1))/2
w1 = np.matmul(cov_inv, (mean0 - mean1))
print("Covariance matrix: ", cov, "\n")
print("w0: ", w0)
print("w1: ", w1, "\n")

# Compute GDA model
outPredict = np.matmul(testSet, w1.T) + w0
sigmoid = np.exp(outPredict) / (np.exp(outPredict) + 1)
for i in range(len(sigmoid)):
    if (sigmoid[i] > 0.5):
        sigmoid[i] = 0
    else:
        sigmoid[i] = 1
        
# Compute confusion matrix values
tp, tn, fp, fn = 0, 0, 0, 0
for j in range(len(testOut)):
    predVal = sigmoid[j]
    actualVal = testOut[j]

    # Compare actual to predicted values
    if (predVal == 1 and predVal == actualVal):
        tp += 1
    elif (predVal == 1 and predVal != actualVal):
        fp += 1
    elif (predVal == 0 and predVal == actualVal):
        tn += 1
    elif (predVal == 0 and predVal != actualVal):
        fn += 1
        
# Create confusion matrix
confMatrix = [[0, 0], [0, 0]]
confMatrix[0][0] = tp
confMatrix[0][1] = fp
confMatrix[1][0] = fn
confMatrix[1][1] = tn
        
# Compute results
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f_measure = (2 * (precision * recall)) / (precision + recall)

# Display results
print("Confusion Matrix: ", confMatrix, "\n")
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F-measure: ", f_measure)

Covariance matrix:  [[ 7.56965871  5.35116696  5.95821863  4.81981756  5.47768864  5.95704429
   4.37888331  5.15338055  4.80976459  4.95879917  3.71062406  5.01774296
   6.81049376  5.65901027  5.9135537   5.71352219  5.66374337  5.43252301
   5.34987145  5.66057083]
 [ 5.35116696  6.89997072  5.36619241  4.23133171  5.28046858  5.60710369
   4.22345817  3.90711406  4.08272555  4.94947743  3.18390365  4.57907662
   5.78763808  4.93405907  5.47315963  5.25853881  5.65335448  5.0766118
   5.23412502  5.17383434]
 [ 5.95821863  5.36619241  7.35395397  4.70263686  5.60756847  6.63657833
   4.50152313  4.66275825  4.89814795  5.04851547  3.13199747  4.59100736
   6.26124324  5.02290006  6.1477254   5.98628305  6.25092761  4.88630087
   4.55858002  4.985016  ]
 [ 4.81981756  4.23133171  4.70263686  5.52654937  4.96926381  4.36116252
   3.55152567  4.07785514  3.25941865  4.07684604  2.56177122  3.95928561
   5.56610168  4.41932067  4.57356955  4.81054172  4.52339249  4.29346492
   3.6288836

In [19]:
### 3. k-NN Approach ###

# Store optimal values
k_opt = 0
confMatrix_opt = [[0, 0], [0, 0]]
accuracy_opt = 0.0

# Compute k-NN up to k=30
for k in range(1, 31):
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(testOut)):
        x = trainSet_arr
        y_i = testSet_arr[i]
        
        # Compute and sort distances
        distances = np.power(np.linalg.norm(x - y_i, axis=1), 2)
        neighbours = distances.argsort()[:k]
        if (trainSet_arr[neighbours, 19].sum() > 0):
            predVal = 1
        else:
            predVal = 0
        actualVal = testOut[i]
        
        # Compare actual to predicted values
        if (predVal == 1 and predVal == actualVal):
            tp += 1
        elif (predVal == 1 and predVal != actualVal):
            fp += 1
        elif (predVal == 0 and predVal == actualVal):
            tn += 1
        elif (predVal == 0 and predVal != actualVal):
            fn += 1
        
    # Compute and display accuracy
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    print("k: {}, Accuracy: {}".format(k, accuracy))
    
    # Update highest accuracy and store corresponding confusion matrix
    if (accuracy > accuracy_opt):
        accuracy_opt = accuracy
        k_opt = k
        confMatrix_opt[0][0] = tp
        confMatrix_opt[0][1] = fp
        confMatrix_opt[1][0] = fn
        confMatrix_opt[1][1] = tn
        
# Compute results from optimal values
tp = confMatrix_opt[0][0]
fp = confMatrix_opt[0][1]
fn = confMatrix_opt[1][0]
tn = confMatrix_opt[1][1]
precision_opt = tp / (tp + fp)
recall_opt = tp / (tp + fn)
f_measure_opt = (2 * (precision_opt * recall_opt)) / (precision_opt + recall_opt)

# Display results
print("\nOptimal k value: ", k_opt)
print("Confusion Matrix: ", confMatrix_opt, "\n")
print("Accuracy: ", accuracy_opt)
print("Precision: ", precision_opt)
print("Recall: ", recall_opt)
print("F-measure: ", f_measure_opt)

k: 1, Accuracy: 0.535
k: 2, Accuracy: 0.52625
k: 3, Accuracy: 0.52625
k: 4, Accuracy: 0.53375
k: 5, Accuracy: 0.535
k: 6, Accuracy: 0.54125
k: 7, Accuracy: 0.54
k: 8, Accuracy: 0.54375
k: 9, Accuracy: 0.54375
k: 10, Accuracy: 0.54375
k: 11, Accuracy: 0.54125
k: 12, Accuracy: 0.54
k: 13, Accuracy: 0.53875
k: 14, Accuracy: 0.54
k: 15, Accuracy: 0.54
k: 16, Accuracy: 0.54
k: 17, Accuracy: 0.5425
k: 18, Accuracy: 0.5425
k: 19, Accuracy: 0.54
k: 20, Accuracy: 0.53875
k: 21, Accuracy: 0.5375
k: 22, Accuracy: 0.535
k: 23, Accuracy: 0.53625
k: 24, Accuracy: 0.535
k: 25, Accuracy: 0.535
k: 26, Accuracy: 0.53625
k: 27, Accuracy: 0.54125
k: 28, Accuracy: 0.535
k: 29, Accuracy: 0.53625
k: 30, Accuracy: 0.5375

Optimal k value:  8
Confusion Matrix:  [[314, 276], [89, 121]] 

Accuracy:  0.54375
Precision:  0.5322033898305085
Recall:  0.7791563275434243
F-measure:  0.6324269889224572
