# Two-layer custom CNN from MOSAIKS

## Prepare GeoLife data loader

In [1]:
import os
import sys
import time
import inspect
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
import timm

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.image import extract_patches_2d

CURR_DIR = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
PARENT_DIR = os.path.dirname(CURR_DIR)
sys.path.insert(0, "/home/mila/s/sara.ebrahim-elkafrawy/scratch/ecosystem_project/remote_sensing")

from dataset.pytorch_dataset import GeoLifeCLEF2022Dataset
from torch.utils.data import random_split, DataLoader

random_state = np.random.RandomState(0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_dir = "/network/scratch/s/sara.ebrahim-elkafrawy/small_geo_data"  # "/network/scratch/s/sara.ebrahim-elkafrawy/" 
split = "train"
use_ffcv_loader = False
num_species= 17037
bands = ["rgb"] 
batch_size = 1
num_workers = 0

In [3]:
geo_train_dataset = GeoLifeCLEF2022Dataset(
                root=data_dir,
                subset=split,
                use_ffcv_loader=use_ffcv_loader,
                region="both",
                patch_data=bands,
                use_rasters=False,
                patch_extractor=None,
                transform=None,
                target_transform=None,
                opts=None,
            )

geo_train_loader = DataLoader(
                geo_train_dataset,
                batch_size=batch_size,
                num_workers=num_workers,
                shuffle=True,
                pin_memory=True,
            )

## define custom CNN

In [4]:
# class CustomCNN(nn.Module):
#     def __init__(self):
#         super(CustomCNN, self).__init__()
#         self.add_module('conv1', nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding='same', bias=True))
#         self.add_module(nn.ReLU())
#         self.add_module(nn.MaxPool2d(2, stride=2))
        
#         self.add_module('conv2', nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding='same', bias=True))
#         self.add_module(nn.ReLU())
#         self.add_module(nn.MaxPool2d(2, stride=2))

#         self.add_module(nn.Flatten())
#         self.add_module(nn.Dropout(0.5))
model = nn.Sequential(
      nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding='same', bias=True),
      nn.ReLU(),
      nn.MaxPool2d(2, stride=2),

      nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding='same', bias=True),
      nn.ReLU(),
      nn.MaxPool2d(2, stride=2),

      nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding='same', bias=True),
      nn.ReLU(),
      nn.MaxPool2d(2, stride=2),

      nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding='same', bias=True),
      nn.ReLU(),
      nn.MaxPool2d(2, stride=2),

      nn.Flatten(),
      nn.Dropout(0.5),
      nn.Linear(65536, 512), #50176
      nn.ReLU(),
      nn.Linear(512, num_species)
      ) 
model(torch.rand((1, 3, 256, 256))).shape

torch.Size([1, 17037])

## initialize weights for the first layer with KMeans

In [5]:
patch_size = (3, 3)
num_feats = 32  # also number of patches
num_iters = 6   # The online learning part: cycle over the whole dataset 6 times
max_patches = 50

In [6]:
kmeans = MiniBatchKMeans(n_clusters=num_feats, 
                         random_state=random_state,
#                          max_no_improvement=5,
#                          tol=0.01,
#                          max_iter=5,
#                          batch_size = kmeans_bs,
                         verbose=True)

In [7]:
t0 = time.time()

index = 0
buffer = []

for _ in range(num_iters):
    for batch in geo_train_loader:
        patches, target, meta = batch
        img_np = patches['rgb'].numpy()
        img_np = img_np.squeeze(0)
        img_np = np.einsum('ijk->jki', img_np)

        data = extract_patches_2d(img_np, patch_size, max_patches=max_patches, random_state=random_state)
        data = np.reshape(data, (len(data), -1))
        buffer.append(data)
        index += 1
        if index % int(len(geo_train_loader)/10) == 0:
            data = np.concatenate(buffer, axis=0)
            data -= np.mean(data, axis=0)
            data /= np.std(data, axis=0)
            kmeans.partial_fit(data)
            buffer = []
    #             print(f'inertia: {kmeans.inertia_}')
        if index % 10000 == 0:
            print("Partial fit of %4i out of %i" % (index, num_iters * len(geo_train_loader)))

dt = time.time() - t0
print("done in %.2fs." % dt)

Partial fit of 10000 out of 12024
done in 41.43s.


In [8]:
img_np.shape, kmeans.cluster_centers_.shape

((256, 256, 3), (32, 27))

In [9]:
kmeans.cluster_centers_.min(), kmeans.cluster_centers_.max()

(-1.8748642, 2.6060042)

## slicing up the network

In [10]:
model

Sequential(
  (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (1): ReLU()
  (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (4): ReLU()
  (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (6): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (7): ReLU()
  (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (9): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (10): ReLU()
  (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (12): Flatten(start_dim=1, end_dim=-1)
  (13): Dropout(p=0.5, inplace=False)
  (14): Linear(in_features=65536, out_features=512, bias=True)
  (15): ReLU()
  (16): Linear(in_features=512, out_features=17037, bias=True)
)

In [11]:
conv_lyrs = [3, 6, 9]
act_lyrs = [1, 4, 7, 10]

In [29]:
list(model.named_parameters())[4][1].data.shape, model[4]

(torch.Size([128, 64, 3, 3]), ReLU())

In [12]:
idx = 0
for name, param in model.named_parameters():
    print(name, '---------------\t', param.shape)

0.weight ---------------	 torch.Size([32, 3, 3, 3])
0.bias ---------------	 torch.Size([32])
3.weight ---------------	 torch.Size([64, 32, 3, 3])
3.bias ---------------	 torch.Size([64])
6.weight ---------------	 torch.Size([128, 64, 3, 3])
6.bias ---------------	 torch.Size([128])
9.weight ---------------	 torch.Size([256, 128, 3, 3])
9.bias ---------------	 torch.Size([256])
14.weight ---------------	 torch.Size([512, 65536])
14.bias ---------------	 torch.Size([512])
16.weight ---------------	 torch.Size([17037, 512])
16.bias ---------------	 torch.Size([17037])


In [13]:
# set the weights for hte first layer
list(model.named_parameters())[0][1].data = torch.from_numpy(kmeans.cluster_centers_.reshape(32, 3, 3, 3))

In [19]:
x = torch.from_numpy(kmeans.cluster_centers_.reshape(32, 3, 3, 3))
norm_param = (x - x.mean())/(x.std())
model[0].weight.data = norm_param

In [20]:
list(model.named_parameters())[0][1].data.min(), list(model.named_parameters())[0][1].data.max()

(tensor(-1.9434), tensor(2.5193))

In [21]:
kmeans.cluster_centers_.min(), kmeans.cluster_centers_.max()

(-1.8748642, 2.6060042)

## with hooks

In [22]:
features_dim = {}
def get_features(name):
    def hook(model, input, output):
        features_dim[name] = output.detach()
    return hook

In [23]:
model[1].register_forward_hook(get_features('relu_layer_1'))
model[4].register_forward_hook(get_features('relu_layer_4'))
model[7].register_forward_hook(get_features('relu_layer_7'))
model[10].register_forward_hook(get_features('relu_layer_10'))

<torch.utils.hooks.RemovableHandle at 0x7f2cdd7773a0>

In [30]:
for layer_idx, relu_idx in enumerate(act_lyrs):

    if layer_idx+1 == len(act_lyrs):
        break
        
    print(f'kmeans for output of relu act layer#{relu_idx}')
    
    curr_param_idx = conv_lyrs[layer_idx]
    curr_param_sz = model[curr_param_idx].weight.data.shape

#     curr_param_sz = list(model.named_parameters())[curr_param_idx][1].data.shape
    print(f'current parameter size: {curr_param_sz}')
    print(f'current parameter index: {curr_param_idx}')
    
#     curr_feat_dim = list(model.named_parameters())[relu_idx][1].shape
    
#     print(f'current feature layer size: {curr_feat_dim}')
    
    num_feats = curr_param_sz[0]
    num_ch = curr_param_sz[1]
    patch_size = (curr_param_sz[2], curr_param_sz[3])
    num_iters = 3   # The online learning part: cycle over the whole dataset 6 times
    max_patches = int(num_feats/10)

#     print(f'current feature map dim: {curr_feat_dim}')
    print(f'Initializing parameter#{curr_param_idx} with size: {curr_param_sz}')

    print(f'num_feats:{num_feats}, num_ch:{num_ch}, patch_size:{patch_size}')
    kmeans = MiniBatchKMeans(n_clusters=num_feats, 
                             random_state=random_state,
    #                          max_no_improvement=5,
    #                          tol=0.01,
    #                          max_iter=5,
    #                          batch_size = kmeans_bs,
                             verbose=True)

    geo_train_loader = DataLoader(
                    geo_train_dataset,
                    batch_size=batch_size,
                    num_workers=num_workers,
                    shuffle=True,
                    pin_memory=True,
                )

    index = 0
    buffer = []

    for _ in range(num_iters):
        for batch in geo_train_loader:
            patches, target, meta = batch

            output = model(patches['rgb'])
            curr_feats = features_dim[f'relu_layer_{relu_idx}'].numpy() #.cpu().numpy())
            curr_feats = curr_feats.squeeze(0)
            curr_feats = np.einsum('ijk->jki', curr_feats)

            data = extract_patches_2d(curr_feats, patch_size, max_patches=max_patches, random_state=random_state)


            data = np.reshape(data, (len(data), -1))
            buffer.append(data)
            index += 1
            if index % int(len(geo_train_loader)/10) == 0:
        #         print(data)
                data = np.concatenate(buffer, axis=0)
                data -= np.mean(data, axis=0)
                data /= np.std(data, axis=0)
                if np.any(np.isnan(data)):
                    data = np.nan_to_num(data)
                kmeans.partial_fit(data)
                buffer = []
            if index % 10000 == 0:
                print("Partial fit of %4i out of %i" % (index, num_iters * len(geo_train_loader)))


    
    # change the weights of the corresponding conv layer
    x = torch.from_numpy(kmeans.cluster_centers_.reshape(
                                            num_feats, 
                                            num_ch, 
                                            patch_size[0], 
                                            patch_size[1])
                                        )
    norm_param = (x - x.mean())/(x.std())
    model[curr_param_idx].weight.data = norm_param

#     list(model.named_parameters())[curr_param_idx][1].data = torch.from_numpy(
#         kmeans.cluster_centers_.reshape(
#                                             num_feats, 
#                                             num_ch, 
#                                             patch_size[0], 
#                                             patch_size[1])
#                                         )

    # save the model
    PATH = "/home/mila/s/sara.ebrahim-elkafrawy/scratch/ecosystem_project/ckpts/custom_mosaiks_kmeans.pt"
    torch.save(model.state_dict(), PATH)


kmeans for output of relu act layer#1
current parameter size: torch.Size([64, 32, 3, 3])
current parameter index: 3
Initializing parameter#3 with size: torch.Size([64, 32, 3, 3])
num_feats:64, num_ch:32, patch_size:(3, 3)


RuntimeError: Given weight of size [64, 32, 3, 3], expected bias to be 1-dimensional with 64 elements, but got bias of size [64, 32, 3, 3] instead