In this notebook, the noleave and normal data are used for training and testing.

# Load data

In [None]:
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import linalg
import matplotlib.pyplot as plt


# Each split is stored as a pickled Python object (the labels are later
# densified with .todense(), so they are presumably SciPy sparse matrices).
# NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True is passed.
# NOTE: unpickling can execute arbitrary code -- only load archives you trust.
# The context manager closes the underlying .npz file handle once every
# split has been read.
with np.load('AllSetSparseInOut_noleave_N.npz', allow_pickle=True) as dataset:
    # np.asmatrix(...)[0, 0] unwraps the single object held by each entry.
    TrainX = np.asmatrix(dataset['TrainX'])[0, 0]
    ValidX = np.asmatrix(dataset['ValidX'])[0, 0]
    TestX = np.asmatrix(dataset['TestX'])[0, 0]
    TrainY = np.asmatrix(dataset['TrainY'])[0, 0]
    ValidY = np.asmatrix(dataset['ValidY'])[0, 0]
    TestY = np.asmatrix(dataset['TestY'])[0, 0]

# SVM with poly

In [None]:
from sklearn import svm
from sklearn.ensemble import BaggingClassifier
# The labels are stored sparse; flatten them into the 1-D vectors that
# fit() and score() are given below.
TrainY = np.array(TrainY.todense()).ravel()
ValidY = np.array(ValidY.todense()).ravel()
n_estimators = 20
# Bag degree-2 polynomial SVCs: each estimator is fitted on a
# 1/n_estimators fraction of the training rows, one worker per estimator.
clf = BaggingClassifier(
    svm.SVC(kernel='poly', degree=2),
    max_samples=1.0 / n_estimators,
    n_estimators=n_estimators,
    n_jobs=n_estimators,
)
clf.fit(TrainX, TrainY)
valid_score = clf.score(ValidX, ValidY)
# Training accuracy must be measured against the training labels; pairing
# TrainX with ValidY would score mismatched rows.
train_score = clf.score(TrainX, TrainY)

validation score: 0.5395456542996463  
training score: 

# GDA

## When trained with original dataset (with leavers)

- Using a random subset of 10,000 training examples, the training accuracy over 2,000 samples is 0.657

## When trained with noleave dataset

- Using a random subset of 10,000 training examples, the training accuracy over 2,000 samples is 0.653

# The GDA code is provided below

- Load data

In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt
import time

from scipy import sparse
from sklearn import svm

from sklearn.ensemble import BaggingClassifier
from sklearn.decomposition import PCA
from scipy import sparse
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler


# Each split is stored as a pickled Python object (the labels are densified
# with .todense() further down, so they are presumably SciPy sparse
# matrices). NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True.
# NOTE: unpickling can execute arbitrary code -- only load archives you trust.
# The context manager closes the underlying .npz file handle once every
# split has been read.
with np.load('AllSetSparseInOut_noleave_N.npz', allow_pickle=True) as dataset:
    # np.asmatrix(...)[0, 0] unwraps the single object held by each entry.
    trainX = np.asmatrix(dataset['TrainX'])[0, 0]
    validX = np.asmatrix(dataset['ValidX'])[0, 0]
    testX = np.asmatrix(dataset['TestX'])[0, 0]
    TrainY = np.asmatrix(dataset['TrainY'])[0, 0]
    ValidY = np.asmatrix(dataset['ValidY'])[0, 0]
    TestY = np.asmatrix(dataset['TestY'])[0, 0]


- Transfer data into proper format

In [None]:
# Turn each sparse label container into a flat 1-D numpy vector
# (train, validation and test, in that order).
trainY, validY, testY = (
    np.array(sparse_labels.todense()).ravel()
    for sparse_labels in (TrainY, ValidY, TestY)
)

- Compute the means and covariances for GDA and calculate the likelihood

In [None]:
def compute_mean_mles(train_data, train_labels, n_classes=2):
    '''
    Compute the maximum-likelihood mean feature vector of each class.

    train_data   : (n_samples, n_features) feature matrix, dense or SciPy sparse
    train_labels : length n_samples array of integer labels 0 .. n_classes-1
    n_classes    : number of classes (defaults to 2, the binary problem
                   this notebook works on)

    Returns a numpy array of shape (n_classes, n_features); the ith row
    is the mean of the samples labelled i.
    '''
    means = np.zeros((n_classes, train_data.shape[1]))
    for label in range(n_classes):
        class_rows = train_data[train_labels == label]
        # np.mean on a sparse matrix returns a (1, n_features) np.matrix;
        # flatten it so dense and sparse inputs fill the row identically.
        means[label] = np.asarray(np.mean(class_rows, axis=0)).ravel()
    return means

def compute_sigma_mles(train_data, train_labels, ridge=0.01):
    '''
    Compute the regularised maximum-likelihood covariance of each class.

    train_data   : (n_samples, n_features) feature matrix, dense or SciPy sparse
    train_labels : length n_samples array of binary labels (0 or 1)
    ridge        : value added to the diagonal of every covariance matrix
                   (default 0.01)

    Returns a numpy array of shape (2, n_features, n_features) holding
    one covariance matrix per class.
    '''
    n_features = train_data.shape[1]
    covariances = np.zeros((2, n_features, n_features))
    # Size the ridge term from the data instead of assuming 100 features,
    # so the function works for any feature dimension.
    ridge_term = ridge * np.identity(n_features)
    for label in range(covariances.shape[0]):
        class_rows = train_data[train_labels == label]
        # Densify sparse input explicitly; the centred data is dense anyway.
        if hasattr(class_rows, 'toarray'):
            class_rows = class_rows.toarray()
        class_rows = np.asarray(class_rows, dtype=float)
        # Per-class mean, computed here so the function is self-contained.
        class_mean = class_rows.mean(axis=0)
        centred = class_rows - class_mean
        # The MLE divides by the class sample count N (not N - 1).
        covariances[label] = (
            np.matmul(centred.T, centred) / class_rows.shape[0] + ridge_term
        )
    return covariances

def generative_likelihood(digits, means, covariances, verbose=True):
    '''
    Compute the Gaussian generative log-likelihood of every sample under
    every class:

        log p(x|y,mu,Sigma)

    digits      : (n, d) samples, dense or SciPy sparse (sparse input is
                  densified once up front)
    means       : (n_classes, d) class means
    covariances : (n_classes, d, d) class covariance matrices
    verbose     : when True (the default) print the log-likelihood row of
                  every sample

    Returns an (n, n_classes) numpy array.
    '''
    if hasattr(digits, 'toarray'):
        digits = digits.toarray()
    samples = np.asarray(digits, dtype=float)
    n, d = samples.shape
    n_classes = np.shape(means)[0]

    classlikelihood = np.zeros((n, n_classes))
    log_two_pi_term = -d / 2 * np.log(2 * np.pi)
    for j in range(n_classes):
        # The inverse and log-determinant depend only on the class, so
        # compute them once per class rather than once per sample.
        # slogdet avoids the under/overflow that log(det(.)) hits for
        # high-dimensional covariances.
        _, log_det = np.linalg.slogdet(covariances[j])
        precision = np.linalg.inv(covariances[j])
        centred = samples - np.asarray(means[j]).ravel()
        # Row-wise quadratic form (x - mu) Sigma^-1 (x - mu)^T.
        mahalanobis = np.sum(np.matmul(centred, precision) * centred, axis=1)
        classlikelihood[:, j] = log_two_pi_term - 0.5 * log_det - 0.5 * mahalanobis

    if verbose:
        for i, row in enumerate(classlikelihood):
            print('class likelihood for the %dth data is %s' % (i, row))
    return classlikelihood

def conditional_likelihood(digits, means, covariances):
    '''
    Compute the conditional (posterior) log-likelihood:

        log p(y|x, mu, Sigma)

    A uniform prior over the classes is assumed, so
        log p(y|x) = log p(x|y) + log(1/K) - log sum_k p(x|k) (1/K)
    where K is the number of classes.

    Returns a numpy array of shape (n, K), where n is the number of
    datapoints; exponentiating a row gives probabilities that sum to 1.
    '''
    classlikelihood = generative_likelihood(digits, means, covariances)
    n_classes = classlikelihood.shape[1]
    # The prior enters additively in log space; scaling the log-likelihood
    # by 1/2 would not give a normalised posterior.
    log_joint = classlikelihood + np.log(1.0 / n_classes)
    # Log-sum-exp over the classes gives log p(x) without overflow.
    log_evidence = np.logaddexp.reduce(log_joint, axis=1, keepdims=True)
    posterior = log_joint - log_evidence
    return posterior


def classify_data(digits, means, covariances):
    '''
    Predict a class for every sample by picking the column with the
    largest conditional log-likelihood.

    Returns a list with one predicted class index per sample.
    '''
    log_posteriors = conditional_likelihood(digits, means, covariances)
    # Arg-max over the class axis, handed back as a plain list.
    return list(np.argmax(log_posteriors, axis=1))

0. Prepare subset of training data

In [None]:
# Draw a random subset (without replacement) of each split.
# The GDA section loads its features as trainX / validX / testX, so use
# those names rather than the TrainX / ValidX / TestX of the SVM section.
# The draw is capped at the split size, because random.sample raises
# ValueError when asked for more rows than exist.
# NOTE(review): the results quoted earlier mention a random 10,000-row
# subset, while this cell draws up to 100,000 rows -- confirm the size.
n_samples = 100000

indices = random.sample(range(trainX.shape[0]), min(n_samples, trainX.shape[0]))
trainX_sub = trainX[indices]
trainY_sub = trainY[indices]

indices = random.sample(range(validX.shape[0]), min(n_samples, validX.shape[0]))
validX_sub = validX[indices]
validY_sub = validY[indices]

indices = random.sample(range(testX.shape[0]), min(n_samples, testX.shape[0]))
testX_sub = testX[indices]
testY_sub = testY[indices]

1. First, compute the means and covariances from the training data

In [None]:
# Fit the per-class Gaussian parameters.
# NOTE(review): this fits on the full trainX / trainY rather than the
# trainX_sub / trainY_sub subset prepared earlier -- confirm this is intended.
train_data, train_labels = trainX, trainY
means = compute_mean_mles(train_data, train_labels)
covariances = compute_sigma_mles(train_data, train_labels)

2. Then classify the data by calling classify_data() and compute the accuracy

In [None]:
# Estimate training accuracy on a random draw of rows from the training subset.
n_samples = 2000
indices = random.sample(range(trainX_sub.shape[0]), n_samples)
res_train = classify_data(trainX_sub[indices], means, covariances)
# Fraction of predictions that match the true labels.
train_acc = np.mean(np.asarray(res_train) == trainY_sub[indices])