K-means clustering example

In [None]:
# prerequisites
%matplotlib inline
import sklearn
from sklearn.cluster import KMeans
from sklearn import datasets
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from torchvision.utils import save_image

import numpy as np
from tqdm.notebook import tqdm


# Hyperparameters

In [None]:
# Experiment description: every setting used below is read from this JSON file.
import json

experiment_json = '../experiments/MNIST_torus_AEexp28.json'

# Output switches.
violent_saving = True  # when False, the plotting cells do not write their figures to disk
build_report = True

# Parse the configuration and echo it so the run documents its own settings.
with open(experiment_json) as json_file:
    json_config = json.load(json_file)
print(json.dumps(json_config, indent=2))

# Top-level entries of the configuration.
Path_experiments = json_config["Path_experiments"]
experiment_name = json_config["experiment_name"]
experiment_number = json_config["experiment_number"]
Path_pictures = json_config["Path_pictures"]

# Number of workers in DataLoader (currently disabled):
# num_workers = 10

In [None]:
# Dataset name and the optimisation settings needed to rebuild the data loaders.
optimization_cfg = json_config["optimization_parameters"]
set_name = json_config["dataset"]["name"]
split_ratio = optimization_cfg["split_ratio"]
batch_size = optimization_cfg["batch_size"]

# Dataset loading

In [None]:
# import sys
# sys.path.append('../') # have to go 1 level up
import ricci_regularization

In [None]:
# Build the data loaders for the dataset named in the experiment config.
if set_name == "MNIST":
    # Flattened size of one 28x28 image.
    D = 784
    train_dataset = datasets.MNIST(root='../datasets/', train=True, transform=transforms.ToTensor(), download=True)
    test_dataset  = datasets.MNIST(root='../datasets/', train=False, transform=transforms.ToTensor(), download=False)
else:
    # Fail here with a clear message instead of hitting a NameError below
    # (or silently reusing a dataset left over from an earlier run).
    raise ValueError(f"Unsupported dataset {set_name!r}: this notebook only loads 'MNIST'.")

# Hold out a `split_ratio` fraction of the *training* set. Note that
# `test_data` / `test_loader` are this held-out part, not `test_dataset`.
# The train size is derived from the test size so the two lengths always sum
# to `m`; truncating both `m - m*split_ratio` and `m*split_ratio` separately
# can lose one sample, which makes random_split raise.
# NOTE(review): the split is not seeded, so it differs from run to run.
m = len(train_dataset)
test_size = int(m * split_ratio)
train_size = m - test_size
train_data, test_data = torch.utils.data.random_split(train_dataset, [train_size, test_size])

test_loader  = torch.utils.data.DataLoader(test_data , batch_size=batch_size)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Autoencoder structure

In [None]:
# Instantiate the autoencoder described in the "architecture" section of the config.
latent_dim = json_config["architecture"]["latent_dim"]
input_dim  = json_config["architecture"]["input_dim"]
architecture_type = json_config["architecture"]["name"]

if architecture_type == "TorusAE":
    torus_ae = ricci_regularization.Architectures.TorusAE(x_dim=input_dim, h_dim1=512, h_dim2=256, z_dim=latent_dim)
elif architecture_type == "TorusConvAE":
    torus_ae = ricci_regularization.Architectures.TorusConvAE(x_dim=input_dim, h_dim1=512, h_dim2=256, z_dim=latent_dim, pixels=28)
else:
    # Without this branch an unknown name would leave `torus_ae` undefined,
    # or silently keep a model created by an earlier run of this cell.
    raise ValueError(
        f"Unknown architecture {architecture_type!r}; expected 'TorusAE' or 'TorusConvAE'."
    )

# Use the GPU when one is present.
if torch.cuda.is_available():
    torus_ae.cuda()

### Loading the saved weights

In [None]:
# Restore the trained weights from the checkpoint path recorded in the experiment config.
# Per the original note, that path is expected to point into
# ../experiments/<Your experiment>/nn_weights/.
# NOTE: torch.load unpickles the file, so only open checkpoints from a trusted source.
PATH_ae_wights = json_config["weights_saved_at"]
# map_location="cpu" lets a checkpoint that was saved on a GPU be opened on a
# CPU-only machine; load_state_dict then copies the values into the model's
# parameters on whatever device the model currently lives on.
torus_ae.load_state_dict(torch.load(PATH_ae_wights, map_location="cpu"))
# Switch to evaluation mode.
torus_ae.eval()

In [None]:
# borrowed from https://gist.github.com/jakevdp/91077b0cae40f8f8244a
def discrete_cmap(N, base_cmap=None):
    """Create an N-bin discrete colormap from the specified input map.

    Parameters
    ----------
    N : int
        Number of discrete colour bins.
    base_cmap : str, Colormap or None
        Colormap to sample from; None selects matplotlib's default colormap.

    Returns
    -------
    matplotlib.colors.LinearSegmentedColormap
        A colormap named ``<base name><N>`` built from N colours sampled
        evenly across the base colormap.
    """
    # Local import: the notebook only imports matplotlib.pyplot at the top.
    from matplotlib.colors import LinearSegmentedColormap

    # plt.get_cmap accepts a name, None, or a Colormap instance. The older
    # matplotlib.cm.get_cmap was deprecated in 3.7 and removed in 3.9.
    base = plt.get_cmap(base_cmap)
    color_list = base(np.linspace(0, 1, N))
    cmap_name = base.name + str(N)
    # from_list is defined on LinearSegmentedColormap only; calling it on the
    # class (instead of on `base`) also works when the base map is a
    # ListedColormap such as 'viridis'.
    return LinearSegmentedColormap.from_list(cmap_name, color_list, N)

# Torus latent space

In [None]:
# Number of classes; the cells below also use it as the number of K-means clusters.
if set_name == "MNIST":
    N = 10  # one class per digit
elif set_name == "Synthetic":
    N = json_config["dataset"]["parameters"]["k"]

In [None]:
# Run the autoencoder over the held-out loader on the CPU and collect, per batch:
# the inputs, the first element of the model output, the encoder_torus output,
# the encoder2lifting output, and the labels.
torus_ae.cpu()
colorlist = []
enc_list = []
feature_space_encoding_list = []
input_dataset_list = []
recon_dataset_list = []
# Inference only: without no_grad every stored batch output would keep its
# autograd graph alive for as long as the lists exist.
with torch.no_grad():
    for (data, labels) in tqdm(test_loader, position=0):
    #for (data, labels) in tqdm( train_loader, position=0 ):
        flat_data = data.view(-1, D)  # flatten each sample to a vector of length D
        input_dataset_list.append(data)
        recon_dataset_list.append(torus_ae(data)[0])
        feature_space_encoding_list.append(torus_ae.encoder_torus(flat_data))
        enc_list.append(torus_ae.encoder2lifting(flat_data))
        colorlist.append(labels)

In [None]:
# Stitch the per-batch results into one tensor each (concatenation along the batch axis).
input_dataset = torch.cat(input_dataset_list, dim=0)
recon_dataset = torch.cat(recon_dataset_list, dim=0)
feature_space_encoding = torch.cat(feature_space_encoding_list, dim=0)
encoded_points = torch.cat(enc_list, dim=0)

# Detached from autograd, for plotting and clustering.
encoded_points_no_grad = encoded_points.detach()
color_array = torch.cat(colorlist, dim=0).detach()

In [None]:
# Scatter plot of the first two latent coordinates, coloured by the dataset labels.
fig, ax = plt.subplots(figsize=(8, 6))

if set_name == "MNIST":
    scatter_points = ax.scatter(
        encoded_points_no_grad[:, 0],
        encoded_points_no_grad[:, 1],
        c=color_array,
        marker='o',
        edgecolor='none',
        cmap=discrete_cmap(N, 'jet'),
    )
    fig.colorbar(scatter_points, ax=ax, ticks=range(N))
ax.grid(True)
# Write the figure to disk only when saving is switched on.
if violent_saving == True:
    fig.savefig(f"{Path_pictures}/latent_space.pdf", format="pdf")

# K-means in a local chart

In [None]:
# K-means with N clusters fitted directly on the latent coordinates.
kmeans_local_chart = KMeans(n_clusters=N, n_init="auto", random_state=0)
kmeans_local_chart.fit(encoded_points_no_grad)
print(f"k-means clusterisation to {N} clusters")

In [None]:
# Same latent scatter as before, now coloured by the cluster index found in the local chart.
fig, ax = plt.subplots(figsize=(8, 6))

if set_name == "MNIST":
    scatter_points = ax.scatter(
        encoded_points_no_grad[:, 0],
        encoded_points_no_grad[:, 1],
        c=kmeans_local_chart.labels_,
        marker='o',
        edgecolor='none',
        cmap=discrete_cmap(N, 'jet'),
    )
    fig.colorbar(scatter_points, ax=ax, ticks=range(N))
    ax.set_title(f"K-means clusterization, K = {N}, Euclidean metric in a local chart")
ax.grid(True)
# Write the figure to disk only when saving is switched on.
if violent_saving == True:
    fig.savefig(f"{Path_pictures}/Kmeans_latent_space.pdf", format="pdf")

# K-means in feature space

In [None]:
# K-means with N clusters fitted on the encoder_torus outputs (the "feature space").
feature_space_points = feature_space_encoding.detach()
kmeans_feature_space = KMeans(n_clusters=N, n_init="auto", random_state=0)
kmeans_feature_space.fit(feature_space_points)
print(f"k-means clusterisation to {N} clusters")

In [None]:
# Points are drawn at their latent coordinates but coloured by the clusters found in feature space.
fig, ax = plt.subplots(figsize=(8, 6))

if set_name == "MNIST":
    scatter_points = ax.scatter(
        encoded_points_no_grad[:, 0],
        encoded_points_no_grad[:, 1],
        c=kmeans_feature_space.labels_,
        marker='o',
        edgecolor='none',
        cmap=discrete_cmap(N, 'jet'),
    )
    fig.colorbar(scatter_points, ax=ax, ticks=range(N))
    ax.set_title(f"K-means clusterization, Euclidean metric in feature space, K = {N}")
ax.grid(True)
# Write the figure to disk only when saving is switched on.
if violent_saving == True:
    fig.savefig(f"{Path_pictures}/Kmeans_feature_space.pdf", format="pdf")