In [35]:
import numpy as np

from copy import deepcopy

from matplotlib import pyplot as plt

from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.covariance import EmpiricalCovariance

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

In [4]:
def pca_vs_lr_dtc(xtrain, ytrain, xtest, ytest, d, pca_cls=PCA, dump_out=True):
    pca = pca_cls(n_components=d)
    xptrain = pca.fit_transform(xtrain)
    xptest = pca.transform(xtest)
    # LR accuracy
    lr = LogisticRegression()
    lr.fit(xptrain, ytrain)
    lrscore = lr.score(xptest, ytest)
    # DTC accuracy
    dtc = DecisionTreeClassifier()
    dtc.fit(xptrain, ytrain)
    dtcscore = dtc.score(xptest, ytest)
    if dump_out:
        print('D = {}'.format(d))
        print('LR accuracy: {}'.format(lrscore))
        print('DTC accuracy: {}'.format(dtcscore))
    else:
        return lrscore, dtcscore

In [55]:
class PCAImpl:
    
    def __init__(self, n_components):
        self.n_components = n_components
        
    def scale_data(self, X):
        x = deepcopy(X)
        x = x.astype('float64')
        _stdscaler = StandardScaler()
        x = _stdscaler.fit_transform(x)
        return x
    
    def fit(self, X):
        x = self.scale_data(X)
        self.scaled_x = x
        # compute the covariance and it's eigen values and vectors
        empcov = EmpiricalCovariance(assume_centered=True)
        cov = empcov.fit(x).covariance_
        e_val, e_vec = np.linalg.eig(cov)
        self.eigen_values = e_val
        self.eigen_vectors = e_vec
    
    def transform(self, X):
        if X is None:
            x = self.scaled_x
        else:
            x = self.scale_data(X)
        _indices = np.argsort(self.eigen_values)[::-1]
        feature_vectors = self.eigen_vectors[:,_indices][:,:self.n_components] # column vectors
        return np.dot(x, feature_vectors)
    
    def fit_transform(self, X):
        self.fit(X)
        return self.transform(None)
        

In [7]:
mnist = fetch_mldata('MNIST original', data_home='../Data')
mxtrain, mxtest, mytrain, mytest = train_test_split(mnist.data, mnist.target, test_size=0.33, random_state=42)

In [57]:
print('Self implementation:')
pca_vs_lr_dtc(mxtrain, mytrain, mxtest, mytest, 5, pca_cls=PCAImpl)
print('Scikit implementation:')
pca_vs_lr_dtc(mxtrain, mytrain, mxtest, mytest, 5, pca_cls=PCA)

Self implementation:
D = 5
LR accuracy: 0.6417748917748918
DTC accuracy: 0.6708658008658008
Scikit implementation:
D = 5
LR accuracy: 0.6483116883116883
DTC accuracy: 0.6606060606060606


In [58]:
print('Self implementation:')
pca_vs_lr_dtc(mxtrain, mytrain, mxtest, mytest, 20, pca_cls=PCAImpl)
print('Scikit implementation:')
pca_vs_lr_dtc(mxtrain, mytrain, mxtest, mytest, 20, pca_cls=PCA)

Self implementation:
D = 20
LR accuracy: 0.8511255411255412
DTC accuracy: 0.8344155844155844
Scikit implementation:
D = 20
LR accuracy: 0.8631168831168832
DTC accuracy: 0.842987012987013
