In [36]:
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
import pickle
import os
import numpy as np

In [37]:
from data_helpers.wine_quality_data_helper import load_wine_quality_data
from data_helpers.mnist_data_helper import load_mnist_data

In [38]:
class GaussianMixtureWrapper:
    """Thin adapter around scikit-learn's ``GaussianMixture``.

    Exposes ``fit``/``predict`` and a ``NAME`` tag so the notebook can treat
    it interchangeably with the other clustering wrapper. The underlying
    estimator stays reachable through ``self.model``.
    """

    # Short identifier used elsewhere in the notebook for dispatch and paths.
    NAME = 'GMM'

    def __init__(self, n_components):
        """Build an unfitted mixture with ``n_components`` components."""
        mixture = GaussianMixture(n_components=n_components)
        self.model = mixture
        self.n_components = n_components

    def fit(self, X):
        """Fit the wrapped mixture on ``X``; returns ``None``."""
        estimator = self.model
        estimator.fit(X)

    def predict(self, X):
        """Return the wrapped mixture's ``predict`` output for ``X``."""
        labels = self.model.predict(X)
        return labels

class KMeansWrapper:
    """Thin adapter around scikit-learn's ``KMeans``.

    Exposes ``fit``/``predict`` and a ``NAME`` tag so the notebook can treat
    it interchangeably with the other clustering wrapper. The underlying
    estimator stays reachable through ``self.model``.
    """

    # Short identifier used elsewhere in the notebook for dispatch and paths.
    NAME = 'KMeans'

    def __init__(self, n_clusters):
        """Build an unfitted k-means estimator with ``n_clusters`` clusters."""
        clusterer = KMeans(n_clusters=n_clusters)
        self.model = clusterer
        self.n_clusters = n_clusters

    def fit(self, X):
        """Fit the wrapped k-means estimator on ``X``; returns ``None``."""
        estimator = self.model
        estimator.fit(X)

    def predict(self, X):
        """Return the wrapped estimator's ``predict`` output for ``X``."""
        labels = self.model.predict(X)
        return labels

In [39]:
# Dataset to process. The loading cell accepts only 'mnist' or 'wine_quality'
# and raises ValueError for anything else. Swap the comment to switch.
DATASET_NAME = 'mnist'
# DATASET_NAME = 'wine_quality'

# Clustering wrapper class (not an instance) used to derive the extra features.
# Its NAME attribute also selects the component count and the output folder.
# METHOD = KMeansWrapper
METHOD = GaussianMixtureWrapper

In [40]:
# Map every supported dataset name to the helper that returns its
# (X_train, y_train, X_test, y_test) splits.
dataset_loaders = {
    'wine_quality': load_wine_quality_data,
    'mnist': load_mnist_data,
}

# Fail fast on an unsupported name before attempting any load.
if DATASET_NAME not in dataset_loaders:
    raise ValueError(f'Unknown dataset: {DATASET_NAME}')

X_train, y_train, X_test, y_test = dataset_loaders[DATASET_NAME]()

In [41]:
# Component / cluster count to use for each (method, dataset) pair.
component_counts = {
    'KMeans': {'mnist': 30, 'wine_quality': 20},
    'GMM': {'mnist': 18, 'wine_quality': 5},
}

# Validate the method first, then the dataset, so the error names the
# setting that is actually unsupported.
if METHOD.NAME not in component_counts:
    raise ValueError(f'Unknown method: {METHOD.NAME}')

counts_for_method = component_counts[METHOD.NAME]
if DATASET_NAME not in counts_for_method:
    raise ValueError(f'Unknown dataset: {DATASET_NAME}')

n_components = counts_for_method[DATASET_NAME]

In [42]:
# Echo the component/cluster count selected for the current method and dataset.
print(n_components)

18


In [43]:
# Instantiate the selected wrapper. The count is passed positionally, so it is
# received as `n_components` by GaussianMixtureWrapper and as `n_clusters` by
# KMeansWrapper.
model = METHOD(n_components)
# Fit on the training split only; later cells apply this fitted model to both
# the training and the test split.
model.fit(X_train)

In [44]:
# Derive the cluster-based features for both splits from the fitted model.
#
# Dispatch on METHOD.NAME (the same key the n_components cell uses) instead of
# on class identity: if the class-definition cell is re-run, METHOD still
# points at the old class object, an identity comparison matches neither
# branch, and stale X_*_star values from a previous run would be reused.
if METHOD.NAME == 'GMM':
    # Per-sample posterior probability of each mixture component.
    X_train_star = model.model.predict_proba(X_train)
    X_test_star = model.model.predict_proba(X_test)
elif METHOD.NAME == 'KMeans':
    def get_distances(X, model):
        """Return the distance from each row of ``X`` to each cluster centre.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        model : KMeansWrapper
            Wrapper whose ``model`` attribute is a fitted ``KMeans``.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_clusters)
        """
        # KMeans.transform maps samples to cluster-distance space, replacing
        # the earlier pure-Python loop over samples x centres.
        return model.model.transform(X)

    X_train_star = get_distances(X_train, model)
    X_test_star = get_distances(X_test, model)
else:
    # Refuse to continue rather than leave X_*_star undefined or stale.
    raise ValueError(f'Unknown method: {METHOD.NAME}')

In [45]:
# Append the cluster-derived columns to the raw features.
#
# This cell overwrites its own input (X_*_star), so without a guard every
# re-run would prepend another copy of the raw features. Detect the
# already-augmented case by checking whether the leading columns are exactly
# the raw training features, and skip the stacking if so.
n_raw_features = X_train.shape[1]
already_augmented = (
    X_train_star.shape[1] > n_raw_features
    and np.array_equal(X_train_star[:, :n_raw_features], X_train)
)
if not already_augmented:
    X_train_star = np.hstack((X_train, X_train_star))
    X_test_star = np.hstack((X_test, X_test_star))

In [46]:
# Persist the augmented features and the untouched labels, one pickle per
# array, under a folder keyed by dataset and method.
# (Named `output_dir` rather than `dir` so the builtin `dir()` is not shadowed
# for the rest of the kernel session.)
output_dir = f"transformed_data/step_5/{DATASET_NAME}/{METHOD.NAME}"
os.makedirs(output_dir, exist_ok=True)

arrays_to_save = {
    "X_train": X_train_star,
    "X_test": X_test_star,
    "y_train": y_train,
    "y_test": y_test,
}
for stem, payload in arrays_to_save.items():
    # `with` guarantees each file is flushed and closed, even if dump() raises.
    with open(f"{output_dir}/{stem}.pkl", "wb") as fh:
        pickle.dump(payload, fh)

In [47]:
# Compare the test-set shape before and after augmentation; the difference in
# column count is the number of cluster-derived features that were appended.
X_test.shape, X_test_star.shape

((3147, 196), (3147, 214))