In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from random import sample


In [2]:
def readCsv(fileName, dataPrefix=r'C:\Users\Owner\McGill\4thYear\COMP551\Assignments\Assignment2\Datasets\DS'):
    """Load a header-less, comma-separated dataset file into a DataFrame.

    Parameters
    ----------
    fileName : str
        Suffix appended to ``dataPrefix`` to build the path, e.g.
        ``'1_train.csv'`` resolves to ``<dataPrefix>1_train.csv``.
    dataPrefix : str, optional
        Path prefix (directory plus the leading ``DS`` of the file name).
        Defaults to the original hard-coded location, so existing callers are
        unaffected; pass another prefix to run on a different machine.

    Returns
    -------
    pandas.DataFrame
        The parsed file with integer column labels (``header=None``).
        Text following a ``#`` is treated as a comment and not parsed.
    """
    fullFileName = dataPrefix + fileName
    df = pd.read_csv(fullFileName, encoding='utf-8', header=None,
                     comment='#', sep=',')

    return df

In [3]:
def createData(samplesPerClass=2000, testSize=1200):
    """Sample a labelled two-class Gaussian dataset and split it into test/train.

    The shared covariance matrix and the two class means are read with
    ``readCsv`` from ``1_Cov.csv``, ``1_m_1.csv`` and ``1_m_0.csv``. Columns
    that are entirely NaN are dropped before the values are used.

    Parameters
    ----------
    samplesPerClass : int, optional
        Number of points drawn for each class (default 2000).
    testSize : int, optional
        Number of shuffled rows placed in the test split (default 1200);
        the remaining rows form the training split.

    Returns
    -------
    tuple of numpy.ndarray
        ``(distribution, distributionTest, distributionTrain)``. The last
        column of every array is the class label (1 for the positive class,
        0 for the negative class). The two splits are slices of the shuffled
        ``distribution`` array.
    """
    # `.values` replaces DataFrame.as_matrix(), which no longer exists in
    # current pandas releases.
    cov = readCsv('1_Cov.csv').dropna(axis=1, how='all').values
    meanPositive = readCsv('1_m_1.csv').dropna(axis=1, how='all').values[0, :]
    meanNegative = readCsv('1_m_0.csv').dropna(axis=1, how='all').values[0, :]

    distributionPositive = np.random.multivariate_normal(meanPositive, cov, samplesPerClass)
    distributionNegative = np.random.multivariate_normal(meanNegative, cov, samplesPerClass)

    # Append the class label as the final column.
    distributionPositive = np.c_[distributionPositive, np.ones(distributionPositive.shape[0])]
    distributionNegative = np.c_[distributionNegative, np.zeros(distributionNegative.shape[0])]

    distribution = np.concatenate((distributionPositive, distributionNegative), axis=0)
    # Shuffle rows in place so both splits contain a mix of the two classes.
    np.random.shuffle(distribution)
    distributionTest = distribution[0:testSize]
    distributionTrain = distribution[testSize:]

    return (distribution, distributionTest, distributionTrain)



In [4]:
def toCsv(fileName, npArray, dataPrefix=r'C:\Users\Owner\McGill\4thYear\COMP551\Assignments\Assignment2\Datasets\DS'):
    """Write an array to a comma-delimited text file.

    Parameters
    ----------
    fileName : str
        Suffix appended to ``dataPrefix`` to build the output path.
    npArray : array-like
        Data passed to ``np.savetxt``.
    dataPrefix : str, optional
        Path prefix (directory plus the leading ``DS`` of the file name).
        Defaults to the original hard-coded location, so existing callers are
        unaffected.
    """
    fullFileName = dataPrefix + fileName
    np.savetxt(fullFileName, npArray, delimiter=',')


In [5]:
def initializeDS1():
    """Generate the first dataset and persist its train, test and full arrays.

    Calls ``createData`` once and writes the three resulting arrays through
    ``toCsv`` as ``1_train.csv``, ``1_test.csv`` and ``1.csv`` (in that order).
    """
    everything, heldOut, training = createData()
    outputs = (
        ('1_train.csv', training),
        ('1_test.csv', heldOut),
        ('1.csv', everything),
    )
    for csvName, rows in outputs:
        toCsv(csvName, rows)
# initializeDS1()


In [6]:
def visualizeStuff():
    """Show a 3-D scatter plot of 10000 samples from an isotropic Gaussian.

    The samples are drawn from a normal distribution with mean (10, 10, 10)
    and identity covariance, then plotted in red on a 3-D axes.
    """
    mean = [10, 10, 10]
    cov = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
    x, y, z = np.random.multivariate_normal(mean, cov, 10000).T

    fig = plt.figure(figsize=(20,8))
    ax = fig.add_subplot(111, projection='3d')

    # Plot through the 3-D axes: the third positional argument of
    # plt.scatter is the marker size, so plt.scatter(x, y, z) would use z as
    # sizes and leave every point at the default height.
    ax.scatter(x, y, z, c='red')
    # NOTE(review): some matplotlib releases may reject 'equal' on 3-D axes —
    # confirm against the installed version.
    plt.axis('equal')
    plt.show()
# visualizeStuff()

In [7]:
def runLDA(dfTrain):
    """Estimate the parameters of a two-class LDA model and print them.

    Parameters
    ----------
    dfTrain : pandas.DataFrame
        Training data. The first 20 columns (by position) are the features
        and the column labelled ``20`` holds the class label (1 or 0).

    Returns
    -------
    tuple
        ``(U1, U2, pi, S)`` where ``U1`` / ``U2`` are the mean feature vectors
        of the label-1 / label-0 rows, ``pi`` is the fraction of rows with a
        non-zero label, and ``S`` is the pooled 20x20 covariance estimate
        (sum of both within-class scatter matrices divided by the row count).
    """
    # `.values` replaces DataFrame.as_matrix(), which no longer exists in
    # current pandas releases.
    Y = dfTrain[20].values
    X = dfTrain.iloc[:, 0:20].values

    N = len(X)
    N1 = np.count_nonzero(Y)
    N2 = N - N1
    pi = N1/N

    # Label-weighted sums of the rows, computed as matrix products instead of
    # a Python loop over every row.
    U1 = np.dot(Y, X)/N1
    U2 = np.dot(1 - Y, X)/N2

    # Within-class scatter: the sum of outer products of the centred rows of
    # a class equals D^T D for the matrix D of centred rows.
    centredPositive = X[Y == 1] - U1
    centredNegative = X[Y == 0] - U2
    S1 = np.dot(centredPositive.T, centredPositive)
    S2 = np.dot(centredNegative.T, centredNegative)

    S = np.add(S1, S2)
    S = np.divide(S, N)
    print('U1', U1)
    print('U2', U2)
    print('pi', pi)
    print('S', S)

    return (U1, U2, pi, S)

In [8]:
def linearDecisionBoundary(dfTrain):
    """Derive the linear discriminant ``w . x + w0`` from the LDA estimates.

    Uses the values returned by ``runLDA(dfTrain)``:

    * ``w  = S^-1 (U1 - U2)``
    * ``w0 = -1/2 U1' S^-1 U1 + 1/2 U2' S^-1 U2 + ln(pi / (1 - pi))``

    Both quantities are printed and returned as the tuple ``(w, w0)``.
    """
    meanPos, meanNeg, prior, pooledCov = runLDA(dfTrain)
    precisionMatrix = np.linalg.inv(pooledCov)

    w = precisionMatrix.dot(meanPos - meanNeg)

    # Quadratic forms U' S^-1 U for each class mean.
    quadPos = meanPos.T.dot(precisionMatrix.dot(meanPos))
    quadNeg = meanNeg.T.dot(precisionMatrix.dot(meanNeg))
    w0 = quadPos / -2 + quadNeg / 2 + np.log(prior/(1-prior))

    print('w', w)
    print('w0', w0)
    return (w, w0)

In [9]:
def testData(dfTrain, dfTest):
    """Fit the linear discriminant on ``dfTrain`` and print metrics on ``dfTest``.

    Parameters
    ----------
    dfTrain : pandas.DataFrame
        Training data forwarded to ``linearDecisionBoundary``.
    dfTest : pandas.DataFrame
        Evaluation data. The first 20 columns (by position) are the features
        and the column labelled ``20`` holds the true label (1 or 0).

    A row is predicted positive when ``w . x + w0 >= 0`` and negative when the
    score is below zero. Accuracy, precision, recall and F-measure are
    printed; nothing is returned.
    """
    w, w0 = linearDecisionBoundary(dfTrain)
    # `.values` replaces DataFrame.as_matrix(), which no longer exists in
    # current pandas releases.
    Y = dfTest[20].values
    X = dfTest.iloc[:, 0:20].values

    # Score every test row in one matrix-vector product.
    scores = np.dot(X, w) + w0
    predictedPositive = scores >= 0
    predictedNegative = scores < 0
    actualPositive = Y == 1
    actualNegative = Y == 0

    # Counts are kept as numpy floats so a zero denominator yields nan (with a
    # warning) rather than raising ZeroDivisionError.
    truePositive = np.float64(np.count_nonzero(predictedPositive & actualPositive))
    falsePositive = np.float64(np.count_nonzero(predictedPositive & actualNegative))
    falseNegative = np.float64(np.count_nonzero(predictedNegative & actualPositive))
    trueNegative = np.float64(np.count_nonzero(predictedNegative & actualNegative))

    precision = truePositive/(truePositive+falsePositive)
    recall = truePositive/(truePositive+falseNegative)
    fMeasure = 2*precision*recall/(precision+recall)
    accuracy = (truePositive+trueNegative)/(truePositive+trueNegative+falsePositive+falseNegative)

    print('Accuracy', accuracy)
    print('Precision', precision)
    print('Recall', recall)
    print('F-Measure', fMeasure)


In [10]:
# Fit LDA on the DS1 training split and print its metrics on the DS1 test split.
testData(readCsv('1_train.csv'), readCsv('1_test.csv'))

U1 [ 1.92486305  1.88574537  1.91903303  1.9023876   1.87220534  1.92675551
  1.90384851  1.93631247  1.92383374  1.88307188  1.93366121  1.8939727
  1.8565985   1.92555162  1.90964703  1.86961943  1.91901221  1.90444018
  1.92062611  1.86677443]
U2 [ 1.2578635   1.33369734  1.22781812  1.24713771  1.2745293   1.2742541
  1.29876692  1.24321133  1.23592936  1.29741145  1.26109714  1.26499492
  1.27306806  1.24368497  1.26658048  1.24846667  1.20458703  1.26859286
  1.28606356  1.29473774]
pi 0.5
S [[ 8.0615456   5.54023354  6.33323801  5.24674662  6.02534845  6.29813792
   4.80638065  5.56427935  5.08785888  5.306586    4.06744829  5.34554859
   7.19348875  6.16442614  6.23079925  6.14010753  5.9777847   5.79110427
   5.75451037  6.05636299]
 [ 5.54023354  6.82704627  5.45586708  4.35102419  5.54560142  5.62241695
   4.38019284  3.99700092  4.17439172  5.0667105   3.42920118  4.63308771
   5.91255215  5.22008792  5.51742136  5.27390571  5.68707093  5.15873333
   5.42499206  5.36940271]

In [11]:
# Fit LDA on the DS2 training split and print its metrics on the DS2 test split.
testData(readCsv('2_train.csv'), readCsv('2_test.csv'))

U1 [ 0.93764887  0.95352548  0.98280782  0.96581479  0.9405326   0.98589368
  0.99037942  0.94295805  1.01208443  0.95809837  0.99408319  0.94203745
  0.99270284  0.93522284  0.97377322  0.96623913  0.99524936  0.88053059
  0.96744888  1.00763805]
U2 [ 1.27801399  1.26026459  1.25649807  1.1953158   1.18132912  1.25296546
  1.27737524  1.23315997  1.29595725  1.24760954  1.28463662  1.23443485
  1.25919227  1.30971448  1.20792882  1.25958999  1.23267725  1.18948249
  1.2307232   1.20313014]
pi 0.4932142857142857
S [[ 7.74324131  5.53826411  4.80422318  5.19106882  4.38149587  5.77851032
   5.96042789  5.77341115  4.80976661  5.61645192  5.56964799  5.12133164
   5.11325932  6.22405301  5.54769895  5.74641815  5.5907825   5.47768377
   5.82994051  6.05508882]
 [ 5.53826411  7.40642097  5.15966791  5.42794621  5.15655824  6.23381724
   6.6674432   5.59808488  4.8620397   5.39814389  4.99027058  5.26934222
   5.14625018  6.38510965  5.9482866   6.12075168  5.52195867  5.21411255
   6.4572