In [7]:
%matplotlib inline
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from random import sample


In [8]:
def readCsv(fileName, dataPrefix=r'C:\Users\Owner\McGill\4thYear\COMP551\Assignments\Assignment2\Datasets\DS'):
    """Load a headerless, comma-separated file into a pandas DataFrame.

    Parameters
    ----------
    fileName : str
        Suffix appended verbatim to ``dataPrefix`` (for example
        ``'1_train.csv'``). No path separator is inserted between the two.
    dataPrefix : str, optional
        Leading part of the full path, including the ``DS`` file-name prefix.
        Defaults to the location the notebook was originally written for;
        pass another prefix to read the datasets from elsewhere.

    Returns
    -------
    pandas.DataFrame
        The parsed file with integer column labels (``header=None``).
        Anything following a ``#`` on a line is treated as a comment.
    """
    fullFileName = dataPrefix + fileName
    return pd.read_csv(fullFileName, encoding='utf-8', header=None,
                       comment='#', sep=',')

In [9]:
def createData():
    """Sample a two-class Gaussian dataset and split it into test/train parts.

    The covariance matrix and the two class means are read through
    ``readCsv`` from ``1_Cov.csv``, ``1_m_1.csv`` and ``1_m_0.csv``. 2000
    points are drawn per class from ``np.random.multivariate_normal`` using
    the shared covariance; a label column (1 for the positive class, 0 for
    the negative class) is appended as the last column. The 4000 labelled
    rows are shuffled in place, the first 1200 become the test set and the
    rest the training set.

    The arrays and their shapes are printed as a side effect.

    Returns
    -------
    tuple of numpy.ndarray
        ``(distribution, distributionTest, distributionTrain)``.
    """
    def loadMatrix(fileName):
        # Columns that are entirely NaN are dropped before conversion
        # (presumably produced by trailing delimiters in the files - TODO confirm).
        # `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
        return readCsv(fileName).dropna(axis=1, how='all').values

    cov = loadMatrix('1_Cov.csv')
    # Only the first row of each mean file is used as the mean vector.
    meanPositive = loadMatrix('1_m_1.csv')[0, :]
    meanNegative = loadMatrix('1_m_0.csv')[0, :]

    distributionPositive = np.random.multivariate_normal(meanPositive, cov, 2000)
    distributionNegative = np.random.multivariate_normal(meanNegative, cov, 2000)

    # Append the class label as the last column.
    distributionPositive = np.c_[distributionPositive, np.ones(distributionPositive.shape[0])]
    distributionNegative = np.c_[distributionNegative, np.zeros(distributionNegative.shape[0])]

    distribution = np.concatenate((distributionPositive, distributionNegative), axis=0)
    # Shuffle rows so the slices below contain a mix of both classes.
    np.random.shuffle(distribution)
    distributionTest = distribution[0:1200]
    distributionTrain = distribution[1200:]
    print(distribution)
    print(distributionTest)
    print(distributionTrain)
    print('original', distribution.shape)
    print('train', distributionTrain.shape)
    print('test', distributionTest.shape)
    return (distribution, distributionTest, distributionTrain)


In [10]:
def toCsv(fileName, npArray, dataPrefix=r'C:\Users\Owner\McGill\4thYear\COMP551\Assignments\Assignment2\Datasets\DS'):
    """Write an array to a comma-separated text file.

    Parameters
    ----------
    fileName : str
        Suffix appended verbatim to ``dataPrefix`` (for example
        ``'1_train.csv'``). No path separator is inserted between the two.
    npArray : array_like
        Data handed to ``np.savetxt``.
    dataPrefix : str, optional
        Leading part of the full path, including the ``DS`` file-name prefix.
        Defaults to the location the notebook was originally written for;
        pass another prefix to write the datasets elsewhere.
    """
    fullFileName = dataPrefix + fileName
    np.savetxt(fullFileName, npArray, delimiter=',')


In [77]:
def initializeDS1():
    """Generate the DS1 dataset and persist its three parts via ``toCsv``.

    Calls ``createData`` once and writes, in this order, the training split
    to ``1_train.csv``, the test split to ``1_test.csv`` and the complete
    set to ``1.csv``.
    """
    everything, testPart, trainPart = createData()
    outputs = (
        ('1_train.csv', trainPart),
        ('1_test.csv', testPart),
        ('1.csv', everything),
    )
    for targetName, rows in outputs:
        toCsv(targetName, rows)
# initializeDS1()


In [12]:
def visualizeStuff():
    """Show a 3D scatter plot of 10000 samples from an isotropic Gaussian.

    The samples are drawn from a normal distribution with mean
    ``[10, 10, 10]`` and identity covariance, then plotted in red on a
    3D axes.
    """
    mean = [10, 10, 10]
    cov = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
    x, y, z = np.random.multivariate_normal(mean, cov, 10000).T

    fig = plt.figure(figsize=(20, 8))
    ax = fig.add_subplot(111, projection='3d')

    # Plot through the 3D axes: pyplot.scatter's third positional argument is
    # the marker size `s`, so `plt.scatter(x, y, z)` would not use z as a
    # coordinate.
    ax.scatter(x, y, z, c='red')
    plt.axis('equal')
    plt.show()
# visualizeStuff()

In [66]:
def runLDA():
    """Estimate the LDA parameters from the training file ``1_train.csv``.

    The file is read through ``readCsv``; its last column is taken as the
    class label (1 = positive, 0 = negative) and all preceding columns as
    features.

    Returns
    -------
    tuple
        ``(U1, U2, pi, S)`` where

        * ``U1`` - label-weighted feature sum divided by the number of
          non-zero labels (the positive-class mean for 0/1 labels),
        * ``U2`` - the analogous quantity for the negative class,
        * ``pi`` - fraction of rows with a non-zero label,
        * ``S``  - sum of the scatter matrices of the rows labelled 1 and
          the rows labelled 0, divided by the total row count.
    """
    # `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
    trainSet = readCsv('1_train.csv').values
    X = trainSet[:, :-1]
    Y = trainSet[:, -1]

    N = len(trainSet)
    N1 = np.count_nonzero(Y)
    N2 = N - N1
    pi = N1 / N

    # Label-weighted sums, same formula as accumulating Y[i]*x_i and
    # (1-Y[i])*x_i row by row.
    U1 = Y.dot(X) / N1
    U2 = (1 - Y).dot(X) / N2

    # Only rows labelled exactly 1 or exactly 0 contribute to the scatter.
    centredPositive = X[Y == 1] - U1
    centredNegative = X[Y == 0] - U2
    S1 = centredPositive.T.dot(centredPositive)
    S2 = centredNegative.T.dot(centredNegative)

    S = (S1 + S2) / N
    return (U1, U2, pi, S)

runLDA()


(array([ 1.92486305,  1.88574537,  1.91903303,  1.9023876 ,  1.87220534,
         1.92675551,  1.90384851,  1.93631247,  1.92383374,  1.88307188,
         1.93366121,  1.8939727 ,  1.8565985 ,  1.92555162,  1.90964703,
         1.86961943,  1.91901221,  1.90444018,  1.92062611,  1.86677443]),
 array([ 1.2578635 ,  1.33369734,  1.22781812,  1.24713771,  1.2745293 ,
         1.2742541 ,  1.29876692,  1.24321133,  1.23592936,  1.29741145,
         1.26109714,  1.26499492,  1.27306806,  1.24368497,  1.26658048,
         1.24846667,  1.20458703,  1.26859286,  1.28606356,  1.29473774]),
 0.5,
 array([[ 8.0615456 ,  5.54023354,  6.33323801,  5.24674662,  6.02534845,
          6.29813792,  4.80638065,  5.56427935,  5.08785888,  5.306586  ,
          4.06744829,  5.34554859,  7.19348875,  6.16442614,  6.23079925,
          6.14010753,  5.9777847 ,  5.79110427,  5.75451037,  6.05636299],
        [ 5.54023354,  6.82704627,  5.45586708,  4.35102419,  5.54560142,
          5.62241695,  4.38019284, 

In [67]:
def linearDecisionBoundary():
    """Derive the linear discriminant ``w . x + w0`` from the LDA estimates.

    Uses the means, prior and shared covariance returned by ``runLDA``:

    * ``w  = inv(S) (U1 - U2)``
    * ``w0 = -1/2 U1' inv(S) U1 + 1/2 U2' inv(S) U2 + log(pi / (1 - pi))``

    Returns
    -------
    tuple
        ``(w, w0)`` - the weight vector and the scalar bias.
    """
    U1, U2, pi, S = runLDA()
    precisionMatrix = np.linalg.inv(S)

    w = np.dot(precisionMatrix, U1 - U2)

    # Quadratic forms of each class mean under the inverse covariance.
    quadPositive = np.dot(U1.T, np.dot(precisionMatrix, U1))
    quadNegative = np.dot(U2.T, np.dot(precisionMatrix, U2))
    logPriorOdds = np.log(pi / (1 - pi))

    w0 = quadPositive / -2 + quadNegative / 2 + logPriorOdds
    return (w, w0)

linearDecisionBoundary()

(array([-14.88105119,   8.69068246,   5.56391428,   3.26425236,
          9.81126017,   4.62701308, -17.20608142,  24.84481984,
         29.87743031,  -9.49635768,  13.62003589,  12.47560219,
        -15.94523547, -13.28126465,   5.92634042, -13.48856649,
        -29.98969146,   7.04258343,   0.43809373,   5.12597768]),
 -28.079005846796129)

In [76]:
def testData():
    """Evaluate the linear discriminant on ``1_test.csv`` and print metrics.

    The boundary ``(w, w0)`` comes from ``linearDecisionBoundary``. The test
    file is read through ``readCsv``; its last column is taken as the class
    label and all preceding columns as features. A row is predicted positive
    when ``w . x + w0 >= 0``.

    Prints ``w0``, ``w``, the true-positive / false-negative / false-positive
    counts, and the resulting precision, recall and F-measure. Returns
    nothing.
    """
    w, w0 = linearDecisionBoundary()
    # `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
    testSet = readCsv('1_test.csv').values
    X = testSet[:, :-1]
    Y = testSet[:, -1]
    print(w0)
    print(w)

    # Discriminant score for every row at once.
    scores = X.dot(w) + w0
    predictedPositive = scores >= 0
    predictedNegative = scores < 0

    # Counts are kept as numpy floats so they print as e.g. `560.0` and a zero
    # denominator below yields nan rather than raising ZeroDivisionError.
    truePositive = np.float64(np.count_nonzero(predictedPositive & (Y == 1)))
    falsePositive = np.float64(np.count_nonzero(predictedPositive & (Y == 0)))
    falseNegative = np.float64(np.count_nonzero(predictedNegative & (Y == 1)))

    print('True Positive', truePositive)
    print('False Negative', falseNegative)
    print('False Positive', falsePositive)
    precision = truePositive/(truePositive+falsePositive)
    recall = truePositive/(truePositive+falseNegative)
    fMeasure = 2*precision*recall/(precision+recall)
    print('Precision', precision)
    print('Recall', recall)
    print('F-Measure', fMeasure)
testData()

-28.0790058468
[-14.88105119   8.69068246   5.56391428   3.26425236   9.81126017
   4.62701308 -17.20608142  24.84481984  29.87743031  -9.49635768
  13.62003589  12.47560219 -15.94523547 -13.28126465   5.92634042
 -13.48856649 -29.98969146   7.04258343   0.43809373   5.12597768]
True Positive 560.0
False Negative 40.0
False Positive 26.0
Precision 0.955631399317
Recall 0.933333333333
F-Measure 0.944350758853
