In [None]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from sklearn.decomposition import PCA

In [None]:
from data_helpers.wine_quality_data_helper import load_wine_quality_data
from data_helpers.mnist_data_helper import load_mnist_data

In [None]:
class PCAWrapper:
    """Thin adapter around scikit-learn's ``PCA``.

    Exposes the small interface the notebook relies on (``NAME``, ``fit``,
    ``fit_transform``, ``transform``, ``get_kurtosis``). The underlying
    estimator is available as ``self.model``.
    """

    # Short label used in figure titles and output file names.
    NAME = 'PCA'

    def __init__(self, n_components) -> None:
        """Create an unfitted PCA keeping ``n_components`` components."""
        self.model = PCA(n_components=n_components)

    def fit(self, X):
        """Fit the PCA on ``X`` and return the underlying fitted estimator."""
        return self.model.fit(X)

    def fit_transform(self, X):
        """Fit the PCA on ``X`` and return ``X`` projected onto the components."""
        return self.model.fit_transform(X)

    def transform(self, X):
        """Project ``X`` onto the components of the already-fitted PCA."""
        return self.model.transform(X)

    def get_kurtosis(self, X):
        """Return the mean kurtosis of the projected columns of ``X``.

        ``X`` is projected into component space, ``scipy.stats.kurtosis`` is
        applied with its defaults (per column), and the per-component values
        are averaged into a single number.

        If the model has already been fitted it is reused as-is. Refitting
        here would silently replace the components learned on the training
        data with ones learned on ``X``. Only an unfitted model is fitted on
        ``X``, which keeps the previous behaviour for that case.
        """
        # `components_` only exists once the estimator has been fitted.
        if hasattr(self.model, 'components_'):
            X_star = self.model.transform(X)
        else:
            X_star = self.model.fit_transform(X)
        kurtosis = scipy.stats.kurtosis(X_star).mean()
        return kurtosis

In [None]:
# --- Experiment configuration ------------------------------------------------
# Reduction method under study. The class is expected to provide a NAME
# attribute plus fit / transform / get_kurtosis, which later cells call.
METHOD = PCAWrapper

# Active dataset: lookup key used when loading, and the label shown in titles.
# To run on the wine data instead, use 'wine_quality' / 'Wine Quality'.
DATASET_NAME, DATASET_STR = 'mnist', 'MNIST'

# Number of equally sized folds the rows are divided into.
K_FOLDS = 5

# Presumably the switch for persisting outputs (e.g. figures) to disk.
SAVE = True

In [None]:
# Map each supported dataset key to its loader. Both loaders return four
# values, of which only the first two (features and labels) are kept here.
_LOADERS = {
    'wine_quality': load_wine_quality_data,
    'mnist': load_mnist_data,
}

# Fail fast on a misspelled or unsupported key.
if DATASET_NAME not in _LOADERS:
    raise ValueError(f'Unknown dataset: {DATASET_NAME}')

X, y, _, _ = _LOADERS[DATASET_NAME]()

In [None]:
# Hold out fold `i` of K_FOLDS for evaluation and fit on the remaining rows.
i = 0
n = X.shape[0] // K_FOLDS  # rows per fold (integer division)

# Number of components handed to METHOD.
# Commented-out alternatives kept from the original cell: 50, 11.
components = 196

# Row range [fold_start, fold_stop) of the held-out fold.
fold_start = i * n
fold_stop = fold_start + n

X_test, y_test = X[fold_start:fold_stop], y[fold_start:fold_stop]
X_train = np.concatenate((X[:fold_start], X[fold_stop:]))
y_train = np.concatenate((y[:fold_start], y[fold_stop:]))

model = METHOD(components)

# Wall-clock time spent fitting on the training rows.
t0 = time.perf_counter()
model.fit(X_train)
t1 = time.perf_counter()
training_time = t1 - t0

# Wall-clock time spent projecting the held-out rows. The projection itself
# is discarded; only the duration is kept.
t0 = time.perf_counter()
_ = model.transform(X_test)
t1 = time.perf_counter()
evaluation_time = t1 - t0

# Mean kurtosis of the held-out rows in component space.
kurtosis = model.get_kurtosis(X_test)

In [None]:
# Explained variance per component, recomputed by hand from the sample
# covariance matrix of the full data set and shown as a bar chart.
plt.figure(figsize=(6, 4))
n_samples = X.shape[0]

# Centre into a new array instead of `X -= ...`. The in-place form modified
# the shared data (including X_test, which is a slice view when X is a NumPy
# array) and raises a casting error if X has an integer dtype.
X_centered = X - np.mean(X, axis=0)
cov_matrix = np.dot(X_centered.T, X_centered) / n_samples

# v^T C v for every principal axis v (the rows of components_), computed for
# all axes at once. Kept as a list, matching the original `values`.
principal_axes = model.model.components_
values = list(np.sum(np.dot(principal_axes, cov_matrix) * principal_axes, axis=1))

# Position the bars by the number of values actually computed.
plt.bar(range(len(values)), values)
# Show at most the first 100 components, without padding the axis with empty
# space when fewer are available.
plt.xlim([-1, min(100, len(values))])
plt.ylabel('Explained Variance', fontsize=12)
plt.xlabel('Num Components', fontsize=12)
plt.title(f'PCA - {DATASET_STR} - Explained Variance', fontsize=12)

In [None]:
# Quick check of how many explained-variance ratios the fitted PCA holds.
np.shape(model.model.explained_variance_ratio_)

In [None]:
# Bar chart of the explained variance reported by the fitted estimator,
# one bar per component.
explained_variance = model.model.explained_variance_
plt.bar(range(explained_variance.shape[0]), explained_variance)
plt.xlabel('Num Components')
plt.ylabel('Explained Variance')
plt.title(f'{METHOD.NAME} - {DATASET_STR} - Explained Variance by Component')

# Honour the SAVE switch from the configuration cell, and create the output
# directory so that savefig does not fail when `figures/` is missing.
if SAVE:
    os.makedirs('figures', exist_ok=True)
    plt.savefig(f"figures/{DATASET_NAME}_{METHOD.NAME}_explained_variance.png")

In [None]:
# Display the per-component variances computed from the covariance matrix in
# the manual explained-variance plotting cell.
values