# Train clustering-based embedding

1. Compute the clustering of the geographic-based embedding, $f^g$
2. Sample new triplets according to the clustering
3. Train the clustering-based embedding

## Load Libraries, set paths and parameters

In [2]:
import sys
import os
import torch
from torch import optim
from time import time
import numpy as np
sys.path.append('../')
from src.datasets import TileTripletsDataset, GetBands, RandomFlipAndRotate, ClipAndScale, ToFloatTensor, triplet_dataloader
from src.tilenet import make_tilenet
from src.training import prep_triplets, train_triplet_epoch
from src.aux_functions import *

from torch.autograd import Variable
import tslearn
from tslearn.clustering import TimeSeriesKMeans

import shutil
from sklearn.decomposition import PCA

In [3]:
# Study area; every data directory below is derived from it so that switching
# area only requires changing this one value.
area = 'NE-TXN'

# Directory with the MTS
mts_dir = os.path.join('../data', area, 'MTS')

# Directory where the sequences of tiles have been saved (default location)
tile_dir = os.path.join('../data', area, 'tiles')

# Directory to save the clustering-based triplets.
# exist_ok=True makes the cell safe to re-run without a separate existence check.
triplets_dir = os.path.join('../data', area, 'triplets_clustering')
os.makedirs(triplets_dir, exist_ok=True)

# Parameters

# Data layout: X is described below as (n_samples, n_elems, z_dim)
n_samples = 110 * 110
n_elems = 3          # iterated as the time index `t` when sampling triplets
num_triplets = 100   # triplets requested from generateTriplets per time step

# Model / optimiser settings (passed to make_tilenet and optim.Adam)
in_channels = 3
z_dim = 512
lr = 1e-3

## Geographic-based embedding $f^g$ and clustering $\mathcal{P}^g$

Load the sequences of tiles embedded with the geographic-based embedding $f^g$ into the NumPy array `X` of shape `(n_samples, n_elems, z_dim)`.

In [4]:
# Embedded sequences of tiles stored in the MTS directory. The path is built
# with os.path.join, like the other data paths in this notebook, instead of
# concatenating a '/'-prefixed string.
X = np.load(os.path.join(mts_dir, 'X_epoch50_512_100_100K_NE-TXN.npy'))

In [5]:
# Array loaded from a file resolved relative to the notebook's working directory.
# NOTE(review): presumably a precomputed distance matrix for the f^g embedding;
# confirm its contents and where it is consumed.
dist_mat_path = 'distances_NE-TXN.npy'
Dist_mat_fg = np.load(dist_mat_path)

Run the K-means algorithm over the MTS encoded with $f^g$

In [6]:
n_clus = 5
# Fixed seed: without it the K-means initialisation is random, so the cluster
# labels (and every triplet sampled from them below) differ on each run.
clustering_seed = 0
ts_clustering_fg = TimeSeriesKMeans(
    n_clusters=n_clus,
    metric="euclidean",
    n_init=10,
    max_iter_barycenter=5,
    random_state=clustering_seed,
).fit(X)
# Cluster label assigned to each sample of X
ts_clusters_fg = ts_clustering_fg.labels_
# Centroid of each of the n_clus clusters
ts_centroids_fg = ts_clustering_fg.cluster_centers_

## Generate new data set of triplets

Generate triplets according to the neighborhood given by the clustering of MTS. The neighbor tile belongs to the same cluster as the anchor while the distant tile belongs to a different cluster.

In [12]:
for t in range(n_elems):
    # Draw a fresh batch of (anchor, neighbor, distant) sample indices for this time step
    new_triplets = generateTriplets(list(ts_clusters_fg), numTriplets=num_triplets)
    # Triplets of time step t are numbered from t*num_triplets onwards
    first_id = t * num_triplets
    for j in range(num_triplets):
        triplet = new_triplets[j]
        triplet_id = first_id + j
        # (source tile file, destination triplet file) for each role of the triplet
        transfers = (
            ('{sample}tile_T{t}.npy'.format(sample=triplet[0], t=t),
             '{}anchor.npy'.format(triplet_id)),
            ('{sample}tile_T{t}.npy'.format(sample=triplet[1], t=t),
             '{}neighbor.npy'.format(triplet_id)),
            ('{sample}tile_T{t}.npy'.format(sample=triplet[2], t=t),
             '{}distant.npy'.format(triplet_id)),
        )
        # Copy each tile from the tile directory into the triplets directory
        for tile_name, triplet_name in transfers:
            shutil.copy(os.path.join(tile_dir, tile_name),
                        os.path.join(triplets_dir, triplet_name))

In [15]:
# Peek at the first five (anchor, neighbor, distant) index triplets. `new_triplets`
# is reassigned on every pass of the loop above, so this shows only the batch
# drawn for the last time step.
new_triplets[:5]

[(3781, 4726, 2386),
 (4939, 6011, 224),
 (5127, 6246, 8892),
 (5719, 4806, 9567),
 (7583, 5114, 308)]

## Train the clustering-based embedding $f^c$

### Load the geographic-based embedding

In [None]:
# Initialize model
TileNet = make_tilenet(in_channels=in_channels, z_dim=z_dim)
# Load previous model parameters. map_location='cpu' lets a checkpoint that was
# saved from a GPU run be restored on a CPU-only machine; load_state_dict then
# copies the weights into the model's own parameters.
checkpoint = torch.load('../models/TileNet_epoch50_512_100_100K_sentinel.ckpt',
                        map_location='cpu')
TileNet.load_state_dict(checkpoint)
# The training cell passes cuda = torch.cuda.is_available() to train_triplet_epoch.
# Assuming that function moves the batches to the GPU in that case (TODO confirm),
# the model has to live on the GPU as well; moving it here is harmless otherwise.
if torch.cuda.is_available():
    TileNet.cuda()
# Build the optimizer after the model is on its final device
optimizer = optim.Adam(TileNet.parameters(), lr=lr, betas=(0.5, 0.999))
TileNet.train()

### Parameters for the training

In [20]:
# Environment: expose only GPU 0 to this process, then record whether a CUDA
# device can be used at all.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})
cuda = torch.cuda.is_available()

In [46]:
# Options handed to triplet_dataloader
img_type = 'rgb'
bands = 3
augment = True
shuffle = True
batch_size = 50
num_workers = 8
# Triplet files are numbered j + t*num_triplets for every time step t and
# triplet j, so the directory holds n_elems * num_triplets triplets in total.
n_triplets = n_elems * num_triplets

# Options handed to train_triplet_epoch
epochs = 5
margin = 50
l2 = 0.01
print_every = 50

# Model-saving switch
save_models = True

In [47]:
# Collect the keyword options first so the call itself stays short.
# NOTE(review): pairs_only=True - in upstream tile2vec this makes the dataset
# pick the third tile at random instead of reading the saved '*distant.npy';
# confirm this is intended given the clustering-based distant tiles.
loader_options = dict(
    bands=bands,
    augment=augment,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers,
    n_triplets=n_triplets,
    pairs_only=True,
)
dataloader = triplet_dataloader(img_type, triplets_dir, **loader_options)
print('Dataloader set up complete.')

Dataloader set up complete.


In [48]:
# File opened for writing by the training cell (a file path, not a directory)
results_fn = os.path.join('../models', 'results_fn')

### Training

In [None]:
# Reference time handed to train_triplet_epoch
t0 = time()
# The results file was previously opened but never written to; log one CSV
# line of averaged losses per epoch so the run leaves a record.
with open(results_fn, 'w') as results_file:
    results_file.write('epoch,avg_loss,avg_l_n,avg_l_d,avg_l_nd\n')

    print('Begin training.................')
    for epoch in range(0, epochs):
        (avg_loss, avg_l_n, avg_l_d, avg_l_nd) = train_triplet_epoch(
            TileNet, cuda, dataloader, optimizer, epoch+1, margin=margin, l2=l2,
            print_every=print_every, t0=t0)
        results_file.write('{},{},{},{},{}\n'.format(
            epoch + 1, avg_loss, avg_l_n, avg_l_d, avg_l_nd))
        # Flush so completed epochs survive an interrupted run
        results_file.flush()

In [50]:
# Save model after last epoch, honouring the save_models switch set with the
# training parameters (it was previously defined but never consulted).
if save_models:
    torch.save(TileNet.state_dict(), '../models/TileNet_clustering-based.ckpt')