In [19]:
import scipy.stats
from sklearn.decomposition import PCA
import pickle
import os

In [20]:
from data_helpers.wine_quality_data_helper import load_wine_quality_data
from data_helpers.mnist_data_helper import load_mnist_data

In [21]:
class PCAWrapper:
    """Thin adapter that exposes scikit-learn's ``PCA`` behind a small, uniform API.

    ``NAME`` is a short label for the method; the notebook uses it when
    building the output directory for the transformed data.
    """

    NAME = 'PCA'

    def __init__(self, n_components) -> None:
        """Create the wrapped ``PCA`` estimator with ``n_components`` components."""
        # Every other method simply delegates to this estimator.
        self.model = PCA(n_components=n_components)

    def fit(self, X):
        """Fit the wrapped estimator on ``X`` and return the result of its ``fit``."""
        fitted = self.model.fit(X)
        return fitted

    def fit_transform(self, X):
        """Fit the wrapped estimator on ``X`` and return the projected data."""
        projected = self.model.fit_transform(X)
        return projected

    def transform(self, X):
        """Project ``X`` with the wrapped estimator (no fitting happens here)."""
        projected = self.model.transform(X)
        return projected

    def get_kurtosis(self, X):
        """Return the mean kurtosis of the components obtained from ``X``.

        Note that this calls ``fit_transform`` on the wrapped estimator, so the
        estimator is (re)fitted on ``X`` as a side effect.
        """
        projected = self.model.fit_transform(X)
        # One kurtosis value per projected column (SciPy defaults), then averaged.
        per_component = scipy.stats.kurtosis(projected)
        return per_component.mean()

In [22]:
# --- Run configuration -------------------------------------------------------
# DATASET_NAME selects which loader and which component count the cells below
# use; it is also part of the output directory path.
# DATASET_STR is not referenced in the visible cells (presumably a display
# label for plots/titles -- TODO confirm).
DATASET_NAME = 'mnist'
DATASET_STR = 'MNIST'

# Alternative configuration: uncomment these two lines (and comment out the
# pair above) to run the notebook on the wine-quality data instead.
# DATASET_NAME = 'wine_quality'
# DATASET_STR = 'Wine Quality'

# Wrapper class instantiated to build the dimensionality-reduction model;
# its NAME attribute is used in the output directory path.
METHOD = PCAWrapper
# NOTE(review): K_FOLDS and SAVE are not referenced in the visible cells --
# confirm whether they are still needed.
K_FOLDS = 5
SAVE = True

In [23]:
# Map each supported dataset name to the helper that loads its train/test split.
_DATASET_LOADERS = {
    'wine_quality': load_wine_quality_data,
    'mnist': load_mnist_data,
}

_load_dataset = _DATASET_LOADERS.get(DATASET_NAME)
if _load_dataset is None:
    # Fail fast on a name that no loader is registered for.
    raise ValueError(f'Unknown dataset: {DATASET_NAME}')

X_train, y_train, X_test, y_test = _load_dataset()

In [24]:
# Number of principal components to keep for each supported dataset.
N_COMPONENTS_BY_DATASET = {
    'wine_quality': 4,
    'mnist': 35,
}

# Without this guard an unsupported DATASET_NAME would leave `components`
# undefined (or, worse, silently reuse a stale value from an earlier run of
# this cell in the same kernel).
if DATASET_NAME not in N_COMPONENTS_BY_DATASET:
    raise ValueError(f'No component count configured for dataset: {DATASET_NAME}')

components = N_COMPONENTS_BY_DATASET[DATASET_NAME]

In [25]:
# Build the configured reduction method with the chosen number of components.
model = METHOD(components)
# Fit on the training split only and project it in the same step ...
X_train_star = model.fit_transform(X_train)
# ... then project the test split with the already-fitted model (no refit).
X_test_star = model.transform(X_test)

In [26]:
# Output location for the transformed splits: one folder per dataset/method.
# NOTE: the name `dir` shadows the builtin; it is kept unchanged in case other
# cells refer to it.
dir = f"transformed_data/{DATASET_NAME}/{METHOD.NAME}"

# Honour the SAVE switch from the configuration cell instead of always writing.
if SAVE:
    os.makedirs(dir, exist_ok=True)

    # File stem -> array to persist. Labels are stored next to the projected
    # features so the folder is self-contained.
    _artifacts = {
        "X_train": X_train_star,
        "X_test": X_test_star,
        "y_train": y_train,
        "y_test": y_test,
    }
    for _stem, _payload in _artifacts.items():
        # `with` guarantees each handle is flushed and closed, even if
        # pickling fails part-way through.
        with open(f"{dir}/{_stem}.pkl", "wb") as _fh:
            pickle.dump(_payload, _fh)

In [27]:
# Sanity check: shape of the test split before and after the projection.
X_test.shape, X_test_star.shape

((3147, 196), (3147, 35))