### Imports

In [4]:
import math
import os

import lightly
import numpy as np
import torch
import torch.nn as nn
import torchvision
from tqdm import tqdm

### Habana Specific

In [5]:
# Load the Habana PyTorch modules (the cell output prints the library path).
# NOTE(review): presumably this has to run before the "hpu" device is used — confirm.
from habana_frameworks.torch.utils.library_loader import load_habana_module
load_habana_module()
# Device handle that the model and every batch in this notebook are moved to.
device = torch.device("hpu")

def habana():
    """Call ``mark_step()`` from ``habana_frameworks.torch.core``.

    The import is done lazily inside the function, so the Habana package is
    only required when the helper is actually invoked.
    """
    from habana_frameworks.torch.core import mark_step

    mark_step()


def permute_params(model, to_filters_last):
    """Re-order the axes of every 4-D parameter of ``model`` in place.

    Nothing is done when ``htcore.is_enabled_weight_permute_pass()`` returns
    ``True``. Otherwise each parameter with ``ndim == 4`` has its ``.data``
    replaced by a permuted view, and ``habana()`` is called once at the end.

    Args:
        model: module whose ``named_parameters()`` are visited.
        to_filters_last: when truthy the axis order ``(2, 3, 1, 0)`` is
            applied; otherwise ``(3, 2, 0, 1)`` is applied, which permutes
            RSCK back to KCRS.
    """
    import habana_frameworks.torch.core as htcore

    permute_pass_enabled = htcore.is_enabled_weight_permute_pass()
    if permute_pass_enabled is True:
        return

    # the two orders are inverses of each other
    axis_order = (2, 3, 1, 0) if to_filters_last else (3, 2, 0, 1)
    with torch.no_grad():
        for _, weight in model.named_parameters():
            if weight.ndim != 4:
                continue
            weight.data = weight.data.permute(axis_order)
    habana()

def permute_momentum(optimizer, to_filters_last):
    """Re-order the axes of every 4-D momentum buffer held by ``optimizer``.

    Nothing is done when ``htcore.is_enabled_weight_permute_pass()`` returns
    ``True``. Otherwise the ``'momentum_buffer'`` entry of each parameter's
    optimizer state is replaced by a permuted view when it is 4-dimensional,
    and ``habana()`` is called once at the end.

    Args:
        optimizer: optimizer whose ``param_groups`` and ``state`` are visited.
        to_filters_last: when truthy the axis order ``(2, 3, 1, 0)`` is
            applied; otherwise the inverse order ``(3, 2, 0, 1)`` is applied.
    """
    import habana_frameworks.torch.core as htcore
    if htcore.is_enabled_weight_permute_pass() is True:
        return

    axis_order = (2, 3, 1, 0) if to_filters_last else (3, 2, 0, 1)
    # Permute the momentum buffer before using for checkpoint
    for group in optimizer.param_groups:
        for p in group['params']:
            param_state = optimizer.state[p]
            buf = param_state.get('momentum_buffer')
            # The entry may exist but hold None (some torch.optim.SGD releases
            # store None when no buffer has been created); skip it instead of
            # failing on ``None.ndim``.
            if buf is not None and buf.ndim == 4:
                param_state['momentum_buffer'] = buf.permute(axis_order)
    habana()

Loading Habana modules from /usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/lib


### Configurations

In [6]:
# number of worker processes used by the dataloaders
num_workers = 8
# images per batch; also used to scale the learning rate
batch_size = 128
# seed for the torch and numpy random number generators (applied below)
seed = 1
# number of passes over the training data
epochs = 5
# side length (in pixels) of the square images fed to the model
input_size = 160

# dimension of the output of the prediction and projection heads
out_dim = 2048

# seed torch and numpy from the `seed` setting above; a hard-coded 0 was used
# here before, which meant that changing `seed` had no effect
torch.manual_seed(seed)
np.random.seed(seed)

# set the path to the dataset
path_to_data = '/software/data/simsiam/PatternNet_unsup/'

### Train Dataloaders

In [7]:
# augmentation settings for self-supervised learning
augmentation_settings = dict(
    input_size=input_size,
    # flips and rotations: the model should be invariant to them
    hf_prob=0.5,
    vf_prob=0.5,
    rr_prob=0.5,
    # only slight random cropping, because satellite images are all
    # taken from the same height
    min_scale=0.5,
    # weak color jitter, for invariance w.r.t. small color changes
    cj_prob=0.2,
    cj_bright=0.1,
    cj_contrast=0.1,
    cj_hue=0.1,
    cj_sat=0.1,
)
collate_fn = lightly.data.ImageCollateFunction(**augmentation_settings)

# the training dataset applies no transforms of its own: the augmentations
# are handled by the collate function
dataset_train_simsiam = lightly.data.LightlyDataset(input_dir=path_to_data)

# training dataloader: shuffled, incomplete last batch dropped
dataloader_train_simsiam = torch.utils.data.DataLoader(
    dataset_train_simsiam,
    collate_fn=collate_fn,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=True,
    drop_last=True,
)

### SimSiam Model

In [8]:
class SimSiam(nn.Module):
    """
    SimSiam network: an encoder followed by a projection head and a
    prediction head.
    """
    def __init__(self, base_encoder, dim=2048, pred_dim=256):
        """
        base_encoder: module used as the encoder; its flattened output is fed
                      to a Linear layer with `dim` input features
        dim: feature dimension (default: 2048)
        pred_dim: hidden dimension of the predictor (default: 256)
        """
        super().__init__()

        # the encoder is used exactly as passed in
        self.encoder = base_encoder

        # 3-layer projector: Linear+BN blocks with a ReLU after the first two
        projector_layers = [
            *self._linear_bn(dim, dim), nn.ReLU(inplace=True),  # first layer
            *self._linear_bn(dim, dim), nn.ReLU(inplace=True),  # second layer
            *self._linear_bn(dim, dim),  # output layer
        ]
        self.proj = nn.Sequential(*projector_layers)

        # 2-layer predictor: bottleneck of width pred_dim, then back to dim
        predictor_layers = [
            *self._linear_bn(dim, pred_dim), nn.ReLU(inplace=True),  # hidden layer
            nn.Linear(pred_dim, dim),  # output layer
        ]
        self.predictor = nn.Sequential(*predictor_layers)

    @staticmethod
    def _linear_bn(in_features, out_features):
        """Return [bias-free Linear, Unflatten to Cx1x1, BatchNorm2d, Flatten] as a list."""
        return [
            nn.Linear(in_features, out_features, bias=False),
            nn.Unflatten(1, torch.Size([out_features, 1, 1])),
            nn.BatchNorm2d(out_features),
            nn.Flatten(),
        ]

    def forward(self, x1):
        """Return (detached projection, prediction) for the batch x1."""
        features = self.encoder(x1).flatten(start_dim=1)  # NxC
        projection = self.proj(features)
        prediction = self.predictor(projection)  # NxC
        return projection.detach(), prediction


In [9]:


# build a torchvision ResNet-50; note that no pretrained weights are requested
# in this call. All child modules except the last one form the backbone.
resnet = torchvision.models.resnet50()
encoder_layers = list(resnet.children())[:-1]
backbone = nn.Sequential(*encoder_layers)

# wrap the backbone in SimSiam and move the model to the Habana device
model = SimSiam(backbone)
model.to(device);

## Habana
permute_params(model, True)

### Optimizer

In [10]:
# cosine similarity; the training loop negates and symmetrises it to obtain
# the SimSiam loss
criterion = nn.CosineSimilarity()

# learning rate, scaled with the batch size
lr = 0.05 * batch_size / 256

# SGD with momentum and weight decay
sgd_settings = dict(lr=lr, momentum=0.9, weight_decay=5e-4)
optimizer = torch.optim.SGD(model.parameters(), **sgd_settings)

## Habana
permute_momentum(optimizer, True)

### Training Loop

In [11]:


# make sure the model is in training mode: the embedding cell further down
# calls model.eval(), so re-running this cell afterwards would otherwise train
# in evaluation mode
model.train()

# weight of the moving averages used to track the loss and standard deviation
w = 0.9
avg_loss = 0.
avg_output_std = 0.
for e in range(epochs):
    for (x0, x1), _, _ in tqdm(dataloader_train_simsiam):
        x0 = x0.to(device)
        x1 = x1.to(device)

        # run the model on both transforms of the images
        # we get projections (z0 and z1) and
        # predictions (p0 and p1) as output
        z0, p0 = model(x0)
        z1, p1 = model(x1)

        # apply the symmetric negative cosine similarity
        # and run backpropagation
        loss = 0.5 * -1*(criterion(z0, p1).mean() + criterion(z1, p0).mean())
        loss.backward()
        # mark a step after the backward pass and again after the optimizer
        # update, as recommended for Habana lazy mode
        habana()
        optimizer.step()
        habana()
        optimizer.zero_grad()

        # calculate the per-dimension standard deviation of the outputs
        # we can use this later to check whether the embeddings are collapsing
        output = p0.detach().cpu()
        output = torch.nn.functional.normalize(output, dim=1)
        output_std = torch.std(output, 0)
        output_std = output_std.mean()

        # use moving averages to track the loss and standard deviation
        avg_loss = w * avg_loss + (1 - w) * loss.item()
        avg_output_std = w * avg_output_std + (1 - w) * output_std.item()

    # the level of collapse is large if the standard deviation of the l2
    # normalized output is much smaller than 1 / sqrt(dim)
    collapse_level = max(0., 1 - math.sqrt(out_dim) * avg_output_std)

    # print intermediate results
    print(f'[Epoch {e:3d}] '
        f'Loss = {avg_loss:.2f} | '
        f'Collapse Level: {collapse_level:.2f} / 1.00')


100%|██████████| 237/237 [04:46<00:00,  1.21s/it]


[Epoch   0] Loss = -0.85 | Collapse Level: 0.21 / 1.00


100%|██████████| 237/237 [04:34<00:00,  1.16s/it]


[Epoch   1] Loss = -0.91 | Collapse Level: 0.26 / 1.00


100%|██████████| 237/237 [04:38<00:00,  1.17s/it]


[Epoch   2] Loss = -0.92 | Collapse Level: 0.22 / 1.00


100%|██████████| 237/237 [04:41<00:00,  1.19s/it]


[Epoch   3] Loss = -0.93 | Collapse Level: 0.19 / 1.00


100%|██████████| 237/237 [04:40<00:00,  1.18s/it]

[Epoch   4] Loss = -0.93 | Collapse Level: 0.19 / 1.00





### Test Dataloaders

In [12]:
# Create Embeddings:
# after training, the dataset is embedded with a fixed torchvision pipeline:
# images are resized to the input size used during training and the color
# channels are normalized with statistics from imagenet
imagenet_mean = (0.485, 0.456, 0.406)
imagenet_std = (0.229, 0.224, 0.225)
embedding_steps = [
    torchvision.transforms.Resize((input_size, input_size)),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(imagenet_mean, imagenet_std),
]
test_transforms = torchvision.transforms.Compose(embedding_steps)

# lightly dataset used for embedding (same images, test-time transforms)
dataset_test = lightly.data.LightlyDataset(input_dir=path_to_data, transform=test_transforms)

# dataloader for embedding: fixed order, every image kept
dataloader_test = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,
    drop_last=False,
)


embeddings = []
filenames = []

# switch to evaluation mode and disable gradients for faster calculations
model.eval()
with torch.no_grad():
    for x, _, fnames in tqdm(dataloader_test):
        # move the images to the device the model lives on
        x = x.to(device)
        # embed the images with the trained backbone
        y = model.encoder(x).flatten(start_dim=1)
        # copy each batch to host memory right away so that the embeddings of
        # the whole dataset do not accumulate on the device
        embeddings.append(y.cpu())
        # extend in place instead of rebuilding the list for every batch
        filenames.extend(fnames)

print("Concatenating the embeddings and converting to numpy")
embeddings = torch.cat(embeddings, dim=0)

# np.save and open() below do not create missing directories
os.makedirs('output', exist_ok=True)

print("Saving embeddings...")
np.save('output/embeddings', embeddings.numpy())

print("Saving filenames...")
# the filenames are written as the string form of the Python list
with open("output/filenames.txt", "w") as file:
    file.write(str(filenames))

100%|██████████| 238/238 [00:38<00:00,  6.20it/s]


Concatenating the embeddings and converting to numpy
Saving embeddings...
Saving filenames...
