In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import matplotlib.pyplot as plt

import scipy.stats
from sklearn.decomposition import FastICA, PCA
from sklearn.manifold import TSNE
import numpy as np
from sklearn.random_projection import GaussianRandomProjection

In [3]:
from data_helpers.wine_quality_data_helper import load_wine_quality_data
from data_helpers.mnist_data_helper import load_mnist_data



In [4]:
class ICAWrapper:
    NAME = 'ICA'
    def __init__(self, n_components) -> None:
        self.model = FastICA(n_components=n_components, max_iter=1000)
    
    def fit(self, X):
        return self.model.fit(X)
    
    def fit_transform(self, X):
        return self.model.fit_transform(X)
    
    def transform(self, X):
        return self.model.transform(X)
    
    def get_avg_kurtosis(self, X):
        X_star = self.model.transform(X)
        # k = scipy.stats.kurtosis(X)
        # k = k[k != np.inf]
        # k = k[~np.isnan(k)]
        kurtosis = scipy.stats.kurtosis(X_star).mean()
        return kurtosis

    def get_kurtosis_list(self, X):
        X_star = self.model.transform(X)
        kurtosis = scipy.stats.kurtosis(X_star)
        return kurtosis

In [5]:
# DATASET_NAME = 'wine_quality'
DATASET_NAME = 'mnist'

# n = 5000
# X = X[:n]
# y = y[:n]

METHOD = ICAWrapper
K_FOLDS = 5
SAVE = True

In [6]:
if DATASET_NAME == 'wine_quality':
    X, y, _, _ = load_wine_quality_data()
elif DATASET_NAME == 'mnist':
    X, y, _, _ = load_mnist_data()
else:
    raise ValueError(f'Invalid dataset name {DATASET_NAME}')

In [7]:
model = FastICA(n_components=196, max_iter=1000)
model.fit(X[:1000])
X_star = model.transform(X[1000:2000])
kurt = model.kurtosis(X_star)



AttributeError: 'FastICA' object has no attribute 'kurtosis'

In [None]:
kurt.shape

In [None]:
plt.hist(kurt, bins=100)

In [6]:
df_path = f'results/{DATASET_NAME}/{METHOD.NAME}_metrics.csv'
if os.path.exists(df_path):
    df = pd.read_csv(df_path)
    df.set_index('num_components', inplace=True)
    df['num_components'] = df.index
else:
    df = pd.DataFrame()

In [7]:
n = X.shape[0] // K_FOLDS

for components in tqdm(range(2, 12)):
# for components in tqdm(range(2, 201)):
# for components in tqdm([250, 300, 350, 400, 450, 500]):
    print(components)
    training_time_list = []
    evaluation_time_list = []
    kurtosis_list = []
    reconstruction_error_list = []
    for i in range(K_FOLDS):
        X_train = np.concatenate([X[:i * n], X[(i + 1) * n:]])
        y_train = np.concatenate([y[:i * n], y[(i + 1) * n:]])
        X_test = X[i * n:(i + 1) * n]
        y_test = y[i * n:(i + 1) * n]

        model = METHOD(components)
        t0 = time.perf_counter()
        model.fit(X_train)
        t1 = time.perf_counter()
        training_time_list.append(t1 - t0)

        t0 = time.perf_counter()
        _ = model.transform(X_test) 
        t1 = time.perf_counter()
        evaluation_time_list.append(t1 - t0)

        if METHOD == ICAWrapper:
            kurtosis = model.get_kurtosis(X_test)
            kurtosis_list.append(kurtosis)

        if METHOD == RPWrapper:
            error = model.get_reconstruction_error(X_test)
            reconstruction_error_list.append(error)

    df.loc[components, 'num_components'] = components
    df.loc[components, 'training_time_mean'] = np.mean(training_time_list)
    df.loc[components, 'training_time_std'] = np.std(training_time_list)
    df.loc[components, 'evaluation_time_mean'] = np.mean(evaluation_time_list)
    df.loc[components, 'evaluation_time_std'] = np.std(evaluation_time_list)
    if METHOD == RPWrapper:
        df.loc[components, 'reconstruction_error_mean'] = np.mean(reconstruction_error_list)
        df.loc[components, 'reconstruction_error_std'] = np.std(reconstruction_error_list)
    if METHOD == ICAWrapper:
        df.loc[components, 'kurtosis_mean'] = np.mean(kurtosis_list)
        df.loc[components, 'kurtosis_std'] = np.std(kurtosis_list)

 10%|█         | 1/10 [00:00<00:01,  5.22it/s]

2
3


 30%|███       | 3/10 [00:00<00:00,  9.01it/s]

4
5


 60%|██████    | 6/10 [00:00<00:00,  9.19it/s]

6
7
8


 70%|███████   | 7/10 [00:02<00:01,  1.54it/s]

9


 80%|████████  | 8/10 [00:05<00:02,  1.05s/it]

10


 90%|█████████ | 9/10 [00:10<00:02,  2.20s/it]

11


100%|██████████| 10/10 [00:22<00:00,  2.25s/it]


In [8]:
if SAVE:
    os.makedirs(os.path.dirname(df_path), exist_ok=True)
    df.to_csv(df_path, index=False)