In [6]:
import numpy as np
import pandas as pd
import math

# Read data from DataSets
mean1_pos = pd.read_csv("data/DS2_c1_m1.txt", header=None)
mean2_pos = pd.read_csv("data/DS2_c1_m2.txt", header=None)
mean3_pos = pd.read_csv("data/DS2_c1_m3.txt", header=None)
mean1_neg = pd.read_csv("data/DS2_c2_m1.txt", header=None)
mean2_neg = pd.read_csv("data/DS2_c2_m2.txt", header=None)
mean3_neg = pd.read_csv("data/DS2_c2_m3.txt", header=None)
cov1 = pd.read_csv("data/DS2_Cov1.txt", header=None)
cov2 = pd.read_csv("data/DS2_Cov2.txt", header=None)
cov3 = pd.read_csv("data/DS2_Cov3.txt", header=None)
# Drop end column (not useful to us)
mean1_pos.drop([20], axis=1, inplace=True)
mean2_pos.drop([20], axis=1, inplace=True)
mean3_pos.drop([20], axis=1, inplace=True)
mean1_neg.drop([20], axis=1, inplace=True)
mean2_neg.drop([20], axis=1, inplace=True)
mean3_neg.drop([20], axis=1, inplace=True)
cov1.drop([20], axis=1, inplace=True)
cov2.drop([20], axis=1, inplace=True)
cov3.drop([20], axis=1, inplace=True)
# Change to matrices
mean1_pos_m = mean1_pos.as_matrix()[0]
mean2_pos_m = mean2_pos.as_matrix()[0]
mean3_pos_m = mean3_pos.as_matrix()[0]
mean1_neg_m = mean1_neg.as_matrix()[0]
mean2_neg_m = mean2_neg.as_matrix()[0]
mean3_neg_m = mean3_neg.as_matrix()[0]
cov1_m = cov1.as_matrix()[0]
cov2_m = cov2.as_matrix()[0]
cov3_m = cov3.as_matrix()[0]

# Generate 2000 examples for each class
dataEx = 2000
class1_neg = pd.DataFrame(np.random.multivariate_normal(mean1_neg_m, cov1, dataEx))
class2_neg = pd.DataFrame(np.random.multivariate_normal(mean2_neg_m, cov2, dataEx))
class3_neg = pd.DataFrame(np.random.multivariate_normal(mean3_neg_m, cov3, dataEx))
class1_pos = pd.DataFrame(np.random.multivariate_normal(mean1_pos_m, cov1, dataEx))
class2_pos = pd.DataFrame(np.random.multivariate_normal(mean2_pos_m, cov2, dataEx))
class3_pos = pd.DataFrame(np.random.multivariate_normal(mean3_pos_m, cov3, dataEx))
# Add classification column and convert to matrix
class1_neg[20] = 0
class2_neg[20] = 0
class3_neg[20] = 0
class1_pos[20] = 1
class2_pos[20] = 1
class3_pos[20] = 1
class1_neg_m = class1_neg.as_matrix()
class2_neg_m = class2_neg.as_matrix()
class3_neg_m = class3_neg.as_matrix()
class1_pos_m = class1_pos.as_matrix()
class2_pos_m = class2_pos.as_matrix()
class3_pos_m = class3_pos.as_matrix()

# Split data into train, valid, test sets
allData1 = np.concatenate((class1_neg_m, class1_pos_m), axis=0)
allData2 = np.concatenate((class2_neg_m, class2_pos_m), axis=0)
allData3 = np.concatenate((class3_neg_m, class3_pos_m), axis=0)
np.random.shuffle(allData1)
np.random.shuffle(allData2)
np.random.shuffle(allData3)
trainSet1 = pd.DataFrame(allData1[0:2400])
trainSet2 = pd.DataFrame(allData2[0:2400])
trainSet3 = pd.DataFrame(allData3[0:2400])
validSet1 = pd.DataFrame(allData1[2400:3200])
validSet2 = pd.DataFrame(allData2[2400:3200])
validSet3 = pd.DataFrame(allData3[2400:3200])
testSet1 = pd.DataFrame(allData1[3200:4000])
testSet2 = pd.DataFrame(allData2[3200:4000])
testSet3 = pd.DataFrame(allData3[3200:4000])
# Concatenate data from the 3 Gaussian datasets
allData = np.concatenate((pd.DataFrame(allData1), pd.DataFrame(allData2), pd.DataFrame(allData3)), axis=0)
trainSet = pd.DataFrame(np.concatenate((trainSet1, trainSet2, trainSet3), axis=0))
validSet = pd.DataFrame(np.concatenate((validSet1, validSet2, validSet3), axis=0))
testSet = pd.DataFrame(np.concatenate((testSet1, testSet2, testSet3), axis=0))

# Save completed datasets
pd.DataFrame(allData).to_csv("dataGenerated/DS2_data.csv", index=False, header=False)
trainSet.to_csv("dataGenerated/DS2_train.csv", index=False, header=False)
validSet.to_csv("dataGenerated/DS2_valid.csv", index=False, header=False)
testSet.to_csv("dataGenerated/DS2_test.csv", index=False, header=False)

In [7]:
# Split training data
trainSet0 = trainSet[trainSet[20] == 0]
trainSet1 = trainSet[trainSet[20] == 1]
testOut = testSet[20]
# Drop end column
trainSet0.drop([20], axis=1, inplace=True)
trainSet1.drop([20], axis=1, inplace=True)
trainSet.drop([20], axis=1, inplace=True)
testSet.drop([20], axis=1, inplace=True)

# Get length of each training set
numData0 = len(trainSet0)
numData1 = len(trainSet1)
# Create nd.array versions of data sets
trainSet_arr = trainSet.as_matrix()
testSet_arr = testSet.as_matrix()

# Get probability of each training set
allDataPts = numData0 + numData1
prob0 = numData0 / allDataPts
prob1 = numData1 / allDataPts

# Get mean of each column
mean0 = np.array(trainSet0.mean())
mean1 = np.array(trainSet1.mean())

# Compute covariance matrices to get w0 & w1
cov0 = np.array(trainSet0-mean0)
cov1 = np.array(trainSet1-mean1)
s0 = np.matmul(cov0.T, cov0)
s1 = np.matmul(cov1.T, cov1)
cov = (s0 + s1) / allDataPts
cov_inv = np.linalg.inv(cov)
w0 = (math.log(prob0)-math.log(prob1)) - (np.matmul(np.matmul(mean0.T, cov_inv), mean0) - np.matmul(np.matmul(mean1.T, cov_inv), mean1))/2
w1 = np.matmul(cov_inv, (mean0 - mean1))
print("Covariance matrix: ", cov, "\n")
print("w0: ", w0)
print("w1: ", w1, "\n")

# Compute GDA model
outPredict = np.matmul(testSet, w1.T) + w0
sigmoid = np.exp(outPredict) / (np.exp(outPredict) + 1)
for i in range(len(sigmoid)):
    if (sigmoid[i] > 0.5):
        sigmoid[i] = 0
    else:
        sigmoid[i] = 1
        
# Compute confusion matrix values
tp, tn, fp, fn = 0, 0, 0, 0
for j in range(len(testOut)):
    predVal = sigmoid[j]
    actualVal = testOut[j]

    # Compare actual to predicted values
    if (predVal == 1 and predVal == actualVal):
        tp += 1
    elif (predVal == 1 and predVal != actualVal):
        fp += 1
    elif (predVal == 0 and predVal == actualVal):
        tn += 1
    elif (predVal == 0 and predVal != actualVal):
        fn += 1
        
# Create confusion matrix
confMatrix = [[0, 0], [0, 0]]
confMatrix[0][0] = tp
confMatrix[0][1] = fp
confMatrix[1][0] = fn
confMatrix[1][1] = tn
        
# Compute results
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f_measure = (2 * (precision * recall)) / (precision + recall)

# Display results
print("Confusion Matrix: ", confMatrix, "\n")
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F-measure: ", f_measure)

Covariance matrix:  [[ 7.50122388  5.57884754  4.95056876  5.0348809   4.39033786  6.05880853
   5.5345607   5.29956532  4.72478605  5.42966928  5.30664251  5.26026878
   5.3794607   6.12062415  5.35040496  5.78854079  5.47401514  5.20974579
   5.16666827  5.69803602]
 [ 5.57884754  7.52287489  5.31441971  5.3159981   5.17228527  6.53063054
   6.24403652  5.54008758  4.92958021  5.44818885  5.11266988  5.5948526
   5.30541587  6.18758946  5.78626593  6.11887465  5.68396922  5.17450143
   5.77658978  5.9880496 ]
 [ 4.95056876  5.31441971  6.79319325  5.05209305  4.85460698  5.47044114
   5.64954053  4.56577658  4.8072797   5.04832627  4.82390034  5.13027899
   5.3220334   6.03045462  5.07527386  5.06914785  5.46037641  4.44966546
   5.33842861  5.83273468]
 [ 5.0348809   5.3159981   5.05209305  6.40422881  4.15124126  5.65968514
   5.52861617  5.25959995  4.70467988  5.41220914  4.7432687   5.19971191
   4.93759861  5.33761305  5.24025098  5.22980256  5.17078493  4.974746
   5.29678964 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
# Store optimal values
k_opt = 0
confMatrix_opt = [[0, 0], [0, 0]]
accuracy_opt = 0.0

# Compute k-NN up to k=30
for k in range(1, 31):
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(testOut)):
        x = trainSet_arr
        y_i = testSet_arr[i]
        
        # Compute and sort distances
        distances = np.power(np.linalg.norm(x - y_i, axis=1), 2)
        neighbours = distances.argsort()[:k]
        if (trainSet_arr[neighbours, 19].sum() > 0):
            predVal = 1
        else:
            predVal = 0
        actualVal = testOut[i]
        
        # Compare actual to predicted values
        if (predVal == 1 and predVal == actualVal):
            tp += 1
        elif (predVal == 1 and predVal != actualVal):
            fp += 1
        elif (predVal == 0 and predVal == actualVal):
            tn += 1
        elif (predVal == 0 and predVal != actualVal):
            fn += 1
        
    # Compute and display accuracy
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    print("k: {}, Accuracy: {}".format(k, accuracy))
    
    # Update highest accuracy and store corresponding confusion matrix
    if (accuracy > accuracy_opt):
        accuracy_opt = accuracy
        k_opt = k
        confMatrix_opt[0][0] = tp
        confMatrix_opt[0][1] = fp
        confMatrix_opt[1][0] = fn
        confMatrix_opt[1][1] = tn
        
# Compute results from optimal values
tp = confMatrix_opt[0][0]
fp = confMatrix_opt[0][1]
fn = confMatrix_opt[1][0]
tn = confMatrix_opt[1][1]
precision_opt = tp / (tp + fp)
recall_opt = tp / (tp + fn)
f_measure_opt = (2 * (precision_opt * recall_opt)) / (precision_opt + recall_opt)

# Display results
print("\nOptimal k value: ", k_opt)
print("Confusion Matrix: ", confMatrix_opt, "\n")
print("Accuracy: ", accuracy_opt)
print("Precision: ", precision_opt)
print("Recall: ", recall_opt)
print("F-measure: ", f_measure_opt)

k: 1, Accuracy: 0.46625
k: 2, Accuracy: 0.47
k: 3, Accuracy: 0.46458333333333335
k: 4, Accuracy: 0.4683333333333333
k: 5, Accuracy: 0.4608333333333333
k: 6, Accuracy: 0.4633333333333333
k: 7, Accuracy: 0.46416666666666667
k: 8, Accuracy: 0.4625
k: 9, Accuracy: 0.4658333333333333
k: 10, Accuracy: 0.46291666666666664
k: 11, Accuracy: 0.465
k: 12, Accuracy: 0.4633333333333333
k: 13, Accuracy: 0.46375
k: 14, Accuracy: 0.46416666666666667
k: 15, Accuracy: 0.46291666666666664
k: 16, Accuracy: 0.45958333333333334
k: 17, Accuracy: 0.46041666666666664
k: 18, Accuracy: 0.46041666666666664
k: 19, Accuracy: 0.4608333333333333
k: 20, Accuracy: 0.46125
k: 21, Accuracy: 0.4625
k: 22, Accuracy: 0.4633333333333333
k: 23, Accuracy: 0.46166666666666667
k: 24, Accuracy: 0.4633333333333333
k: 25, Accuracy: 0.46291666666666664
k: 26, Accuracy: 0.46375
k: 27, Accuracy: 0.46375
k: 28, Accuracy: 0.4608333333333333
k: 29, Accuracy: 0.46208333333333335
k: 30, Accuracy: 0.46458333333333335

Optimal k value:  2
Co