# Autoencoder via pytorch compared with ipca and t-sne

## Principal component analysis for linear reduction

### Implementation from the python module sklearn


In [None]:
def towdir(s):
    """Return the path of *s* inside the ``./datasets_book/`` directory."""
    path = './datasets_book/' + s
    return path

import deepglmlib.utils as utils
import numpy as np

In [None]:
import importlib
# Re-import the project helper module so that edits made to it on disk
# are picked up without restarting the kernel.
importlib.reload(utils)

#### Example of a dataset with clusters in 3d

 

In [None]:
# Ten colour names; presumably one per cluster label — not referenced by the
# other cells visible here, TODO confirm where it is consumed.
cols = ["blue","red","green","orange","purple","brown","olive","magenta","cyan", "black"]

In [None]:
import numpy as np

# Load the stored sample from the datasets directory.
xy = np.load(towdir('x_y_10d_ae.npy'))
# NOTE(review): indexing by field name only works if the loaded object is a
# structured array (or a mapping such as an .npz archive); a plain ndarray
# would fail here — confirm the layout of the file.
x = xy["x"]
y = xy["y"]

# Notebook display of both shapes.
x.shape, y.shape

Viewed in three dimensions, the data look as follows when the first three components are shown.

#### PCA with sklearn

In [None]:
import numpy as np
from sklearn.decomposition import PCA
# Fit a two-component PCA on x and keep the projected coordinates.
UPCA = PCA(n_components=2)
z_pca = UPCA.fit_transform(x)

In [None]:
# Scatter plot of the PCA projection, coloured by label.
title = "Projection of 3d dataset with pca"
utils.f_plot_scatter(z_pca, y, title=title, isellipse=True)

#### Incremental PCA with sklearn

In [None]:
from sklearn.decomposition import IncrementalPCA
# Incremental PCA fitted in mini-batches of 10 rows, keeping two components.
IPCA   = IncrementalPCA(batch_size=10, n_components=2)
z_ipca = IPCA.fit_transform(x)
title  = "Projection of 3d dataset with ipca"
# Both axes are negated for the plot only; z_ipca itself is left untouched.
utils.f_plot_scatter(-z_ipca[:, :2], y, title=title, isellipse=True)

#### Kernel PCA with sklearn

In [None]:
from sklearn.decomposition import KernelPCA
# Kernel PCA with a sigmoid kernel, reduced to two components.
KPCA   = KernelPCA(n_components=2, kernel='sigmoid')
# Alternative setting kept for reference (RBF kernel):
# KPCA   = KernelPCA(n_components=2, kernel='rbf', alpha=0.3)
z_kpca = KPCA.fit_transform(x)
title  = "Projection of 3d dataset with kpca"
utils.f_plot_scatter(z_kpca, y , title=title, isellipse=True)

#### t-sne with sklearn

In [None]:
from sklearn.manifold import TSNE
# Two-dimensional t-SNE embedding initialised from PCA; no random_state is
# passed in this call.
z_tsne = TSNE(n_components=2, init='pca').fit_transform(x)
title  = "Projection of 3d dataset with tsne"
utils.f_plot_scatter(z_tsne, y ,title=title, isellipse=True)

#### Quality indicators for comparing the visualisations

In [None]:
def f_score_projection(z,y,name="",show=False):
    """Score a projection against its labels with two clustering indices.

    Parameters
    ----------
    z : array-like
        Projected coordinates, one row per observation.
    y : numpy.ndarray
        Labels; flattened with ``ravel`` before scoring.
    name : str
        Label of the method, only used in the printed report.
    show : bool
        When exactly ``True``, print both scores rounded to 3 decimals.

    Returns
    -------
    tuple
        ``(davies_bouldin_score, silhouette_score)``.
    """
    labels = y.ravel()
    import numpy as np
    from sklearn.metrics import davies_bouldin_score, silhouette_score

    scores = (davies_bouldin_score(z, labels), silhouette_score(z, labels))
    if show is True:
        print("Davies_Bouldin_score of",name, "=",np.round(scores[0],3),
              "\nSilhouette_score     of",name, "=",np.round(scores[1],3))
    return scores

In [None]:
# Davies-Bouldin and silhouette scores of each projection; show=False, so
# nothing is printed here and the values are kept for the comparison table.
db_pca, sl_pca = f_score_projection(z_pca,y,"pca",False)
db_ipca, sl_ipca = f_score_projection(z_ipca,y,"ipca",False)
db_kpca, sl_kpca = f_score_projection(z_kpca,y,"kpca",False)
db_tsne, sl_tsne = f_score_projection(z_tsne,y,"tsne",False)

## Autoencoders for linear and nonlinear reduction with pytorch

### Example with the artificial dataset

In [None]:
import torch.nn as nn

class AutoEncoder(nn.Module):
    """Autoencoder assembled from two caller-supplied lists of layers.

    Parameters
    ----------
    name : str
        Free-form label stored on the instance.
    layers_encoder, layers_decoder : list of nn.Module
        Layers chained in order with ``nn.Sequential`` to form the encoder
        and the decoder.
    init_layers : iterable of int, optional
        Positions ``k`` for which both ``net_encoder[k].weight`` and
        ``net_decoder[k].weight`` are re-initialised with Xavier-uniform
        values. ``None`` (default) keeps the layers' own initialisation.
    """

    def __init__(self, name, layers_encoder, layers_decoder, init_layers = None):
        super().__init__()
        self.name = name
        self.layers_encoder = layers_encoder
        self.layers_decoder = layers_decoder
        self.net_encoder = nn.Sequential(*layers_encoder)
        self.net_decoder = nn.Sequential(*layers_decoder)
        self.init_layers = init_layers
        if self.init_layers is not None:
            for k in self.init_layers:
                # Go through the `nn` alias: it is the only torch name that
                # is imported together with this class definition.
                nn.init.xavier_uniform_(self.net_encoder[k].weight)
                nn.init.xavier_uniform_(self.net_decoder[k].weight)

    def forward(self, x):
        """Return the reconstruction of *x* (encoder followed by decoder)."""
        encoded = self.net_encoder(x)
        decoded = self.net_decoder(encoded)
        return decoded

    def encoder(self,x):
        """Return the encoder output for *x*."""
        encoded = self.net_encoder(x)
        return encoded

    def decoder(self,z):
        """Return the decoder output for the code *z*."""
        decoded = self.net_decoder(z)
        return decoded

In [None]:
import torch

import deepglmlib.utils as utils

# Pair the features (cast to float) with the labels (cast to int) in a
# TensorDataset.
dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(x).float(),
    torch.from_numpy(y).int(),
)

# Split with the helper's default settings; no batch size is given here.
dl_train, dl_test, n, n_train, n_test = utils.f_splitDataset(dataset)

In [None]:
import torch.nn as nn
import copy

# Linear encoder: input dimension -> 7 -> 4 -> 2, with no activation
# between the layers.
layers_encoder = [
    nn.Linear(x.shape[1], 7, bias=True),
    nn.Linear(7, 4),
    nn.Linear(4, 2),
]

# Decoder with the same widths in reverse order: 2 -> 4 -> 7 -> input dimension.
layers_decoder = [
    nn.Linear(2, 4),
    nn.Linear(4, 7),
    nn.Linear(7, x.shape[1]),
]


#### Training and output

In [None]:
def f_train_autoencoder(dl_train,autoencoder,nbmax_epoqs,lr,device=None,epoch_print=5):
    """Train an autoencoder by minimising the summed squared reconstruction error.

    Parameters
    ----------
    dl_train : iterable
        Yields batches whose first element is the input tensor. Any further
        element (e.g. labels) is ignored.
    autoencoder : nn.Module
        Model mapping a batch to its reconstruction; updated in place.
    nbmax_epoqs : int
        Number of passes over ``dl_train``.
    lr : float
        Learning rate of the Adam optimiser.
    device : torch.device or str, optional
        When given, the model and every batch are moved to this device.
    epoch_print : int
        The epoch loss is printed every ``epoch_print`` epochs, and also at
        the last epoch when ``epoch_print <= nbmax_epoqs``.

    Returns
    -------
    loss_s : numpy.ndarray
        Array of length ``nbmax_epoqs`` with the loss summed over the batches
        of each epoch.
    tmax : int
        Number of epochs that were run.
    """
    optimizer = torch.optim.Adam(autoencoder.parameters(), lr=lr)
    loss = nn.MSELoss(reduction='sum')
    loss_s = np.zeros(nbmax_epoqs)
    if device is not None:
        autoencoder = autoencoder.to(device)
    autoencoder.train()
    for epoch in range(nbmax_epoqs):
        loss_t = 0.0
        for tuple_b in dl_train:
            xb = tuple_b[0]
            if device is not None:
                xb = xb.to(device)
            xb_hat = autoencoder(xb)
            lossb = loss(xb_hat, xb)
            optimizer.zero_grad()
            lossb.backward()
            optimizer.step()
            # .item() gives a plain Python float: the running total then
            # holds no autograd graph, is independent of the device, and
            # stays well defined when dl_train yields no batch.
            loss_t += lossb.item()
        loss_s[epoch] = loss_t
        if epoch % epoch_print == 0 or (epoch == nbmax_epoqs-1 and epoch_print<=nbmax_epoqs):
            print("t=",epoch,"\tloss=",np.round(loss_t,5))

    autoencoder.eval()
    tmax = nbmax_epoqs
    return loss_s, tmax

In [None]:
# Linear autoencoder built from deep copies of the layer lists, so the
# original layer objects are not shared with the model.
autoencoder =  AutoEncoder("AE-3-2",copy.deepcopy(layers_encoder),copy.deepcopy(layers_decoder))

In [None]:
# 1000 epochs with learning rate 1e-4; the loss is printed every 100 epochs.
loss_train_s, tmax = f_train_autoencoder(dl_train,autoencoder,1000,0.0001,epoch_print=100)

In [None]:
# Encode the whole sample x and convert the codes to a numpy array.
z_ae = autoencoder.encoder(torch.from_numpy(x).float()).detach().numpy()

In [None]:
# Shape check of the encoded sample.
print(z_ae.shape)

In [None]:
# Scatter plot of the linear-autoencoder codes, coloured by label.
title = "Projection of 3d dataset with ae"
utils.f_plot_scatter(z_ae, y, title=title, isellipse=True)

In [None]:
# Scores of the linear-autoencoder projection (nothing is printed).
db_ae, sl_ae = f_score_projection(z_ae,y,"ae",False)

#### Nonlinear autoencoder and training

 

In [None]:
# Nonlinear encoder: same widths as the linear one (input -> 7 -> 4 -> 2),
# with a Tanh after each of the first two linear layers.
layers_encoder = [
    nn.Linear(x.shape[1], 7, bias=True),
    nn.Tanh(),
    nn.Linear(7, 4),
    nn.Tanh(),
    nn.Linear(4, 2),
]

# Decoder with the widths reversed (2 -> 4 -> 7 -> input) and the same
# Tanh placement.
layers_decoder = [
    nn.Linear(2, 4),
    nn.Tanh(),
    nn.Linear(4, 7),
    nn.Tanh(),
    nn.Linear(7, x.shape[1]),
]

# Positions 0 and 2 (linear layers) of both networks get Xavier-uniform weights.
autoencoder_nl = AutoEncoder(
    "AE-",
    copy.deepcopy(layers_encoder),
    copy.deepcopy(layers_decoder),
    [0,2],
)

In [None]:
# 1500 epochs with learning rate 1e-3; the loss is printed every 300 epochs.
loss_train_nl_s, tmax_nl = f_train_autoencoder(dl_train,autoencoder_nl,1500,0.001,epoch_print=300)

In [None]:
# Encode the whole sample x with the nonlinear autoencoder.
z_ae_nl = autoencoder_nl.encoder(torch.from_numpy(x).float()).detach().numpy()

In [None]:
# Scatter plot of the nonlinear-autoencoder codes, coloured by label.
# The title is bound to the local name that is passed to the plot call.
title = "Projection of 3d dataset with nonlinear ae"
utils.f_plot_scatter(z_ae_nl, y, title=title, isellipse=True)

In [None]:
# Scores of the nonlinear-autoencoder projection (nothing is printed).
db_ae_nl, sl_ae_nl = f_score_projection(z_ae_nl,y,"ae_nl",False)

#### Comparison of different visualisations with two indicators

In [None]:
import pandas as pd

# Row labels of the comparison table; the "(400)" sample size is written
# into the labels by hand.
method_s = ["pca (400)", "ipca (400)", "kpca (400)", "t-sne (400)", 
            "linear ae (400)", "non linear ae (400)"]

# Davies-Bouldin scores, in the same order as method_s.
db_s = [db_pca, db_ipca, db_kpca, db_tsne, db_ae, db_ae_nl]

# Silhouette scores, in the same order as method_s.
sl_s = [sl_pca, sl_ipca, sl_kpca, sl_tsne, sl_ae, sl_ae_nl]

# Build a 3 x 6 frame from the three lists, then transpose it so that each
# method becomes one row.
results = [method_s, db_s, sl_s]
results_pd = pd.DataFrame(results).transpose()
results_pd.columns = ["method (sample size)", "davis-bouldin", "silhouettes"]

# Print with 4 decimals and without wrapping wide frames.
with pd.option_context('float_format', '{:.4f}'.format, 'display.expand_frame_repr', False):
    print(results_pd)

### Autoencoder and t-sne with a dataset of 60000 images

#### Dataset (from the image files to one binary hdf5 file)

In [None]:
import torch; torch.manual_seed(0)
import torch.nn as nn
import torch.nn.functional as F
import torch.utils
import torch.distributions
import torchvision
import numpy as np
import matplotlib.pyplot as plt#; plt.rcParams['figure.dpi'] = 200

In [None]:
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST

# MNIST is stored in (and downloaded to, if needed) the datasets directory.
dir_mnist = towdir("")

# Training split, images converted to tensors, shuffled batches of 200.
dl_train = DataLoader( MNIST(dir_mnist, train=True, download=True,
                             transform=torchvision.transforms.ToTensor()),
                       batch_size=200, shuffle=True)

# Test split with the same settings (it is shuffled as well).
dl_test  = DataLoader( MNIST(dir_mnist, train=False, download=True,
                             transform=torchvision.transforms.ToTensor()),
                       batch_size=200, shuffle=True)

In [None]:
# Presumably writes the batches of dl_train to one HDF5 file with 28*28
# flattened pixel values per image — helper not visible here, TODO confirm.
utils.f_save_dataloader_to_h5py(dl_train,towdir("mnist60000.h5"),28*28)

In [None]:
# Open the HDF5 file as a dataset; 'x' and 'y' are presumably the names of
# the stored feature and label arrays — TODO confirm against utils.DatasetH5.
dataset_mnist = utils.DatasetH5(towdir('mnist60000.h5'),'x','y') 
print(dataset_mnist.x.shape, dataset_mnist.y.shape)

In [None]:
# Replaces the loaders defined above. Presumably 0.8 is the training
# fraction and 100 the batch size — TODO confirm against utils.f_splitDataset.
dl_train, dl_test, n, n_train, n_test = utils.f_splitDataset(dataset_mnist,0.8,100)

In [None]:
# The three sizes returned by the split.
print(n,n_train,n_test)

In [None]:
# First CUDA device when available, CPU otherwise. The training calls
# visible in this notebook do not pass this device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Result from t-sne

#### Example of file processing with a random projection

In [None]:
# Unshuffled loader over the full dataset, in batches of 250.
dl_all = DataLoader(dataset_mnist,shuffle=False,batch_size=250)

In [None]:
# Paths of the two memmap files (features and labels).
filename_x = towdir('x_mnist60000.memmap')
filename_y = towdir('y_mnist60000.memmap')

In [None]:
# Number of rows and columns of the stored feature array.
n = dataset_mnist.x.shape[0]
p = dataset_mnist.x.shape[1]

# Presumably streams dl_all into the two memmap files — helper not visible
# here; the meaning of is_index is to be confirmed against utils.
utils.f_save_dl_xy_to_2memmap(dl_all, filename_x, filename_y,
                              n=n, p=p, is_index=True)

In [None]:
# Random subsample indicator: one Binomial(2, 0.05) draw per row (size is
# the variable n, the keyword n=2 is the number of trials). Rows with
# i_s == 1 are the ones scored later.
# NOTE(review): with 2 trials the draws take values in {0, 1, 2}; if a 0/1
# mask with probability 0.05 was intended, the trial count should be 1 — confirm.
i_s = np.random.binomial(size=n,p=0.05,n=2) 

In [None]:
import numpy as np

In [None]:
def f_read_memmap(filename_x,n,p):
    """Open an existing float32 memmap file read-only as an (n, p) array."""
    return np.memmap(filename_x, shape=(n,p), mode='r', dtype='float32')

def f_write_memmap(filename_x,n,p):
    """Create (or overwrite) a float32 memmap file of shape (n, p), opened read/write."""
    return np.memmap(filename_x, shape=(n,p), mode='w+', dtype='float32')


In [None]:
from sklearn import random_projection

# Repeated pipeline on the MNIST memmap files:
#   random projection (784 -> 120) -> standardisation -> incremental PCA
#   (120 -> 50) -> openTSNE (50 -> 2) -> plot and quality scores.
# Intermediate results are written to memmap/text files in the datasets
# directory at every step.
# NOTE(review): range(1,10) gives 9 runs although the banner prints "/10" — confirm.
for tried in range(1,10):
    print("---------------------------------------------")
    print("tried=",tried,"/10")
    
    # Raw features and labels, read back from the memmap files.
    n, p = (60000, 784)
    x_map = f_read_memmap(filename_x,n,p)
    y_map = f_read_memmap(filename_y,n,1)    
    
    print("compute random projection")
    # The Gaussian projection matrix is fitted on the first 100 rows only;
    # just its components_ matrix is kept and round-tripped through a text file.
    GaussianRP          = random_projection.GaussianRandomProjection
    p_random            = 120
    mapper              = GaussianRP(n_components=p_random)
    x_map_mnist_reduced = mapper.fit_transform(x_map[0:100,:])
    RR                  = mapper.components_
    np.savetxt("./datasets_book/RR_120_784_mnist60000.txt",RR)

    RR = np.loadtxt(towdir("RR_120_784_mnist60000.txt"))

    filename_xin  = towdir('x_mnist60000.memmap')
    filename_xout = towdir("x_rp120_mnist60000.memmap")

    # Apply the projection to the whole file in mini-batches of 250 rows
    # (helper not visible here).
    utils.f_save_to_reduction_to_memmap_files(filename_xin,
                                        filename_xout,
                                        R=RR.transpose(),
                                        n = x_map.shape[0],
                                        size_minibatch = 250)

    filename_z = towdir("x_rp120_mnist60000.memmap")

    n = 60000
    p = 120

    z_rp_mnist = f_read_memmap(filename_z,n,p)
    y_mnist   = y_map

    # Column means and standard deviations used for the standardisation
    # below; the centring term has to be the mean, not the column sum.
    mn = np.mean(z_rp_mnist,axis=0)         # by chunks?
    sd = np.sqrt(np.var(z_rp_mnist,axis=0)) # by chunks?

    filename_zout = towdir("x_rp120_mnist6000_standardized.memmap")

    z_rp_mnist_strd = np.memmap(filename_zout, 
                                dtype='float32', mode='w+', 
                                shape=(n,120))

    size_chunks = 100

    # Standardise chunk by chunk so that only 100 rows are processed at a time.
    for idx_b in range(0, n, size_chunks):
        idx_b2 = np.min( [idx_b+size_chunks,n] )
        zb                    = z_rp_mnist[idx_b:idx_b2,:]
        z_rp_mnist_strd[idx_b:idx_b2,:] = (zb - mn)/sd

    # Drop the writable handle before the file is reopened read-only.
    del z_rp_mnist_strd



    size_chunks = 100
    n_components = 50


    z_rp_mnist_strd = \
      f_read_memmap( towdir("x_rp120_mnist6000_standardized.memmap"), 
                     n,120 )

    z_ipca_150rp    = \
       f_write_memmap( towdir("z_ipca50_150rp_mnist6000.memmap"),
                       n,n_components )

    from sklearn.decomposition import IncrementalPCA
    ipca = IncrementalPCA( n_components= n_components, 
                           batch_size= size_chunks )

    print("compute transformation pca")
    # First pass: fit the incremental PCA chunk by chunk.
    # for epoch in range(5):
    for idx_b in range(0, n, size_chunks):
        idx_b2 = np.min( [idx_b+size_chunks,n] )
        ipca.partial_fit(z_rp_mnist_strd[idx_b:idx_b2,:])

    print("compute reduced coordinates")
    # Second pass: project every chunk and store the 50 coordinates.
    for idx_b in range(0, n, size_chunks):
        idx_b2 = np.min( [idx_b+size_chunks,n] )
        z_ipca_150rp[idx_b:idx_b2,:] = \
          ipca.transform(z_rp_mnist_strd[idx_b:idx_b2,:])

    del z_rp_mnist_strd, z_ipca_150rp
    
    print("compute t-sne mapping")
    z_ipca_150rp    = \
       f_read_memmap( towdir("z_ipca50_150rp_mnist6000.memmap"),
                       60000,50 )


    # The first two PCA coordinates serve as the initial t-SNE layout; the
    # deep copy detaches them from the memmap.
    x_map_init = copy.deepcopy( z_ipca_150rp[:,0:2] )
    x_map = z_ipca_150rp

    z_mnist_tsne_2, z_init_mnist_tsne_2, aff_mnist_2 = \
        utils.f_projection_from_openTSNE(x_map = x_map,
                                   x_map_init=x_map_init,
                                   perplexity=30,n_jobs=3,
                                   random_state=0,verbose=False)

    # One embedding file per run; the label file is rewritten at every run.
    np.savetxt("./datasets_book/z_mnist_tsne_2"+"_tried"+str(tried)+".txt",z_mnist_tsne_2)
    np.savetxt("./datasets_book/y_mnist_2.txt",y_mnist)
    y_mnist_2      = y_mnist

    title="Projection of mnist dataset with tnse after rand-proj"
    utils.f_plot_scatter(z_mnist_tsne_2, y_mnist, title=title, isellipse=True)

    print("compute t-sne quality")
    # Scores are computed on the random subsample i_s == 1 only.
    db_tsne_mnist_2, sl_tsne_mnist_2 = \
       f_score_projection(z_mnist_tsne_2[i_s==1,:],
       y_mnist[i_s==1].ravel(),
       "tsne-mnist-after-rand-proj",False)

    print(np.round(db_tsne_mnist_2,3),
          np.round(sl_tsne_mnist_2,3))
    
    