In [1]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
import json 
from torch.utils.data import Dataset, DataLoader
from PIL import Image, ImageDraw

from torchvision import transforms
from tqdm import tqdm
import cv2
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage.filters import gaussian_filter
import sys 
sys.path.append('../..')
import src.utils as utils
import src.clip as clip 
import yaml
import math 
from tqdm import tqdm  
# from src.clip_led.dataset import LEDDataset
# from src.clip_led.model import LEDModel 
# from src.clip_led.engine import train_model, eval_model
import src.fusion as fusion
from src.blocks import Up, ConvBlock, IdentityBlock

In [2]:
import json 
import torch 
from torch.utils.data import Dataset 
from torchvision import transforms
from PIL import Image, ImageDraw
from scipy.ndimage.filters import gaussian_filter
import numpy as np 
import src.clip as clip 

class LEDDataset(Dataset):
    def __init__(self, data_path, image_dir, config):

        # Gather train_data from {train/val/test}_data.json
        self.data_path = data_path 
        self.data_file = open(self.data_path)
        self.data = json.load(self.data_file)

        # Extract the mode (train, valSeen, valUnseen) from the data_path 
        self.mode = self.data_path.split('/')[-1][:-5].split('_')[0]

        # Store access to floorplans directory 
        self.image_dir = image_dir 

        # Save the global config 
        self.config = config 

        # Calculate parameters to adjust location based on scaling and cropping
        self.crop_translate_x = (self.config['original_image_size'][2] - self.config['cropped_image_size'][2])/2 
        self.crop_translate_y = (self.config['original_image_size'][1] - self.config['cropped_image_size'][1])/2 

        self.resize_scale_x = self.config['scaled_image_size'][2] / self.config['cropped_image_size'][2]
        self.resize_scale_y = self.config['scaled_image_size'][1] / self.config['cropped_image_size'][1]

        # mesh2meters
        self.mesh2meters_path = self.config['mesh2meters']
        self.mesh2meters_file = open(self.mesh2meters_path)
        self.mesh2meters = json.load(self.mesh2meters_file)

        # transform required for CLIP 
        def convert_image_to_rgb(image):
            return image.convert("RGB")

        self.preprocess = transforms.Compose([
            transforms.CenterCrop((self.config['cropped_image_size'][1], self.config['cropped_image_size'][2])),
            transforms.Resize(size=(self.config['scaled_image_size'][1], self.config['scaled_image_size'][2]), interpolation=transforms.InterpolationMode.BICUBIC, max_size=None, antialias=None),
            convert_image_to_rgb,
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
        ])
        self.preprocess_visualize = transforms.Compose([
            transforms.CenterCrop((self.config['cropped_image_size'][1], self.config['cropped_image_size'][2])),
            transforms.Resize(size=(self.config['scaled_image_size'][1], self.config['scaled_image_size'][2]), interpolation=transforms.InterpolationMode.BICUBIC, max_size=None, antialias=None),
            convert_image_to_rgb,
        ]
        )


    def gather_all_floors(self, index):
        ''' Collect images for all floor for a given index
            Ouptut: Maps(max_floors, (image_size)), Conversions(max_floors, 1)
        '''

        # Create empty tensors to hold maps and conversions 
        all_maps = torch.zeros(
            self.config['max_floors'],
            self.config["image_size"][0],
            self.config["image_size"][1],
            self.config["image_size"][2],
        )
        all_conversions = torch.zeros(self.config["max_floors"], 1)

        # Extract scan_names and which floors that scan has from the data 
        scan_name = self.data[index]['scanName']
        floors = self.mesh2meters[scan_name].keys()

        # Iterate through each floor of a scan, open the image, preprocess it and convert it to a tensor 
        for enum, floor in enumerate(floors):
            img = Image.open(f'{self.image_dir}floor_{floor}/{scan_name}_{floor}.png').convert('RGB')
            if "train" in self.mode:
                all_maps[enum, :, :, :] = self.preprocess(img)[:3, :, :]
            else:
                all_maps[enum, :, :, :] = self.preprocess(img)[:3, :, :]
            all_conversions[enum, :] = self.mesh2meters[scan_name][floor]["threeMeterRadius"] / 3.0
        return all_maps, all_conversions

    def gather_correct_floor(self, index):
        scan_name = self.data[index]['scanName']
        x, y, floor = self.scale_location(index)
        img = Image.open(f'{self.image_dir}floor_{floor}/{scan_name}_{floor}.png').convert('RGB')
        
        map = self.preprocess(img)
        conversion = torch.tensor(self.mesh2meters[scan_name][str(floor)]["threeMeterRadius"] / 3.0).float()

        return map, conversion

    def scale_location(self, index):
        if "test" in self.mode:
            return [0, 0, 0]

        floor = self.data[index]['finalLocation']["floor"]
        x, y = self.data[index]['finalLocation']["pixel_coord"]    

        return [int((x - self.crop_translate_x) * self.resize_scale_x), 
                int((y - self.crop_translate_y) * self.resize_scale_y), 
                floor] 
    

    def create_target(self, index, x, y, floor):

        scan_name = self.data[index]['scanName']
        mesh_conversion =(self.mesh2meters[scan_name][str(floor)]["threeMeterRadius"] / 3.0)*(self.config['conversion_scale'])
        gaussian_target = np.zeros(
            (self.config['max_floors'], self.config['image_size'][1], self.config['image_size'][2])
        )
        gaussian_target[floor, y, x] = 1 # y, x because y -> rows and x -> columns
        gaussian_target[floor, :, :] = gaussian_filter(
            gaussian_target[floor, :, :],
            sigma=(mesh_conversion),
        )
        gaussian_target[floor, :, :] = (
            gaussian_target[floor, :, :]
            / gaussian_target[floor, :, :].sum()
        )
        gaussian_target = torch.tensor(gaussian_target)
        return gaussian_target
        
    def join_dialog(self, index):
        dialogArray = self.data[index]['dialogArray']
        return " ".join(dialogArray)
    
    def visualize_target(self, index):
        x, y, floor = self.scale_location(index)
        
        scan_name = self.data[index]['scanName']
        img = Image.open(f'{self.image_dir}floor_{floor}/{scan_name}_{floor}.png').convert('RGB')

        img_vis = self.preprocess_visualize(img)
        draw = ImageDraw.Draw(img_vis)
        draw.ellipse((x-10, y-10, x+10, y+10), 'red')
        print(self.join_dialog(index))
        img_vis.show()


    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        target_x, target_y, target_floor = self.scale_location(index)
        # maps, conversions = self.gather_correct_floor(index)
        maps, conversions = self.gather_all_floors(index)
        dialogs = clip.tokenize(self.join_dialog(index), truncate=True)
        targets = self.create_target(index, target_x, target_y, target_floor)
        scan_names = self.data[index]['scanName']
        episode_ids = self.data[index]['episodeId']
        true_viewpoints = self.data[index]['finalLocation']['viewPoint']

        return {
            'maps': maps,
            'target_maps': targets, 
            'conversions': conversions,
            'dialogs': dialogs,
            'scan_names': scan_names, 
            'episode_ids': episode_ids,
            'true_viewpoints': true_viewpoints,
        }
            

In [3]:
import torch.nn as nn 
import torch.nn.functional as F 
import torch 

import src.clip as clip 
import src.fusion as fusion
from src.blocks import Up, ConvBlock, IdentityBlock

class LEDModel(nn.Module):
    """ CLIP RN50 with U-Net skip connections """
    def __init__(self, config):
        super(LEDModel, self).__init__()
        self.config = config 
        self.up_factor = 2 if self.config['bilinear'] else 1
        self.clip_rn50, self.preprocess = clip.load("RN50")
        self.clip_rn50 = self.clip_rn50.to(config['device'])

        # Freezing the CLIP model
        for param in self.clip_rn50.parameters():
            param.requires_grad = False

        self._build_decoder()


    def _build_decoder(self):
        # language
        self.lang_fuser1 = fusion.names[self.config['lang_fusion_type']](input_dim=self.config['num_post_clip_channels'] // 2)
        self.lang_fuser2 = fusion.names[self.config['lang_fusion_type']](input_dim=self.config['num_post_clip_channels'] // 4)
        self.lang_fuser3 = fusion.names[self.config['lang_fusion_type']](input_dim=self.config['num_post_clip_channels'] // 8)

        # CLIP encoder output -> 1024
        self.proj_input_dim = 512 if 'word' in self.config['lang_fusion_type'] else 1024
        self.lang_proj1 = nn.Linear(self.proj_input_dim, 1024)
        self.lang_proj2 = nn.Linear(self.proj_input_dim, 512)
        self.lang_proj3 = nn.Linear(self.proj_input_dim, 256)

        # vision
        self.conv1 = nn.Sequential(
            nn.Conv2d(self.config['num_post_clip_channels'], 1024, kernel_size=3, stride=1, padding=1, bias=False),
            nn.ReLU(True)
        )
        self.up1 = Up(2048, 1024 // self.up_factor, self.config['bilinear'])

        self.up2 = Up(1024, 512 // self.up_factor, self.config['bilinear'])

        self.up3 = Up(512, 256 // self.up_factor, self.config['bilinear'])

        self.layer1 = nn.Sequential(
            ConvBlock(128, [64, 64, 64], kernel_size=3, stride=1, batchnorm=self.config['batch_norm']),
            IdentityBlock(64, [64, 64, 64], kernel_size=3, stride=1, batchnorm=self.config['batch_norm']),
            nn.UpsamplingBilinear2d(scale_factor=2),
        )

        self.layer2 = nn.Sequential(
            ConvBlock(64, [32, 32, 32], kernel_size=3, stride=1, batchnorm=self.config['batch_norm']),
            IdentityBlock(32, [32, 32, 32], kernel_size=3, stride=1, batchnorm=self.config['batch_norm']),
            nn.UpsamplingBilinear2d(scale_factor=2),
        )

        self.layer3 = nn.Sequential(
            ConvBlock(32, [16, 16, 16], kernel_size=3, stride=1, batchnorm=self.config['batch_norm']),
            IdentityBlock(16, [16, 16, 16], kernel_size=3, stride=1, batchnorm=self.config['batch_norm']),
            nn.UpsamplingBilinear2d(scale_factor=1),
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(16, self.config['num_output_channels'], kernel_size=1)
        )

    def encode_image(self, img):
        with torch.no_grad():
            # The default CLIP function has been updated to be able to get intermediate prepools 
            img_encoding, img_im = self.clip_rn50.visual.prepool_im(img)
        return img_encoding, img_im

    def encode_text(self, x):
        with torch.no_grad():
            text_feat = self.clip_rn50.encode_text(x)
            text_feat = torch.repeat_interleave(text_feat, self.config['max_floors'], 0)
        text_mask = torch.where(x.int()==0, x.int(), torch.tensor(1).int().cuda())  # [1, max_token_len]
        return text_feat, text_mask



    def forward(self, x, l):
        B, num_maps, C, H, W = x.size()
        x = x.view(B*num_maps, C, H, W)
        in_type = x.dtype
        in_shape = x.shape
        x = x[:,:3]  # select RGB
        x, im = self.encode_image(x)
        x = x.to(in_type)

        # encode text
        l_enc, l_mask = self.encode_text(l)
        l_input = l_enc
        l_input = l_input.to(dtype=x.dtype)

        # # encode image
        assert x.shape[1] == self.config['num_post_clip_channels']
        # print('after CLIP encoding: ', x.size())
        x = self.conv1(x)

        # print('after convolution after CLIP encoding: ', x.size())


        x = self.lang_fuser1(x, l_input, x2_mask=l_mask, x2_proj=self.lang_proj1)
        # print('after lang_fuser 1: ', x.size())
        x = self.up1(x, im[-2])
        # print('after up after lang_fuser 1: ', x.size())

        x = self.lang_fuser2(x, l_input, x2_mask=l_mask, x2_proj=self.lang_proj2)
        # print('after lang_fuser 2: ', x.size())
        x = self.up2(x, im[-3])
        # print('after up after lang_fuser 2: ', x.size())

        x = self.lang_fuser3(x, l_input, x2_mask=l_mask, x2_proj=self.lang_proj3)
        # print('after lang_fuser 3: ', x.size())
        x = self.up3(x, im[-4])
        # print('after up after lang_fuser 3: ', x.size())

        for enum, layer in enumerate([self.layer1, self.layer2, self.layer3, self.conv2]):
            x = layer(x)
            # print(f'after layer {enum} after all lang_fusions', x.size())
        
        h, w = x.size()[-2], x.size()[-1]
        x = x.squeeze(1)
        x = x.view(B, num_maps, x.size()[-2], x.size()[-1])
        x = F.log_softmax(x.view(B, -1), 1).view(B, num_maps, h, w)
        return x


In [4]:
from tqdm import tqdm 
from src.utils import distance_from_pixels, accuracy
import numpy as np 

def train_model(model, loader, loss_fn, optimizer, scaler, config):
    acc5m = []
    acc3m = []
    acc0m = []
    losses = []
    localization_errors = []
    for enum, data in enumerate(tqdm(loader)):
        optimizer.zero_grad()

        maps = data['maps'].float().to(config['device'])
        target_maps = data['target_maps'].float().to(config['device'])
        conversions = data['conversions'].float()
        dialogs = data['dialogs'].squeeze(1).to(config['device']) # The squeeze removes extra dimension in (BATCH_SIZE, 1, NUM_TOKENS)
        # Data Required to calculate Localization Accuracy (Geodesic)
        scan_names = data['scan_names']
        true_viewpoints = data['true_viewpoints']
        episode_ids = data['episode_ids']
        with torch.autocast('cuda'):
            preds = model(maps, dialogs)
            loss = loss_fn(preds, target_maps)

        scaler.scale(loss).backward()

        scaler.step(optimizer)

        optimizer.step()

        scaler.update()

        le, ep = distance_from_pixels(
            config, preds.detach().cpu(), conversions, scan_names, true_viewpoints, episode_ids, 'train', )
        losses.append(loss.item())
        acc5m.append(accuracy(le, 5))
        acc3m.append(accuracy(le, 3))
        acc0m.append(accuracy(le, 0))
        localization_errors.extend(le)
    return {
        'loss': np.mean(losses),
        'acc5m': np.mean(np.asarray(acc5m)),
        'acc3m': np.mean(np.asarray(acc3m)),
        'acc0m': np.mean(np.asarray(acc0m)),
    }

def eval_model(model, loader, loss_fn, config, mode):
    acc5m = []
    acc3m = []
    acc0m = []
    losses = []
    localization_errors = []
    for enum, data in enumerate(tqdm(loader)):
        maps = data['maps'].float().to(config['device'])
        target_maps = data['target_maps'].float().to(config['device'])
        conversions = data['conversions'].float()
        dialogs = data['dialogs'].squeeze(1).to(config['device']) # The squeeze removes extra dimension in (BATCH_SIZE, 1, NUM_TOKENS)
        # Data Required to calculate Localization Accuracy (Geodesic)
        scan_names = data['scan_names']
        true_viewpoints = data['true_viewpoints']
        episode_ids = data['episode_ids']

        # with torch.autocast('cpu'):
        preds = model(maps, dialogs)
        loss = loss_fn(preds, target_maps)


        le, ep = distance_from_pixels(
            config, preds.detach().cpu(), conversions, scan_names, true_viewpoints, episode_ids, mode)
        losses.append(loss.item())
        acc5m.append(accuracy(le, 5))
        acc3m.append(accuracy(le, 3))
        acc0m.append(accuracy(le, 0))
        localization_errors.extend(le)
    return {
        'loss': np.mean(losses),
        'acc5m': np.mean(np.asarray(acc5m)),
        'acc3m': np.mean(np.asarray(acc3m)),
        'acc0m': np.mean(np.asarray(acc0m)),
    }

In [5]:
# Change this to YAML
config = {
    # Data Paths
    'train_path' : '../../data/way_splits/train_data.json',
    'valid_seen_path' : '../../data/way_splits/valSeen_data.json',
    'valid_unseen_path': '../../data/way_splits/valUnseen_data.json',
    'mesh2meters': '../../data/floorplans/pix2meshDistance.json',
    'image_dir': '../../data/floorplans/',
    'geodistance_file': '../../data/geodistance_nodes.json',
    'save_path': '../../logs/checkpoints',

    'device': 'cuda:0',

    # Hyper Parameters
    'max_floors': 5,

    # Image Parameters
    'image_size': [3, 448, 448],
    # 'image_size': [3, 700, 1200],
    'original_image_size': [3, 700, 1200],
    'cropped_image_size': [3, 700, 800],
    'scaled_image_size': [3, 448, 448],


    'crop_translate_x': 200,
    'crop_translate_y': 0,
    'resize_scale_x': 448/800,
    'resize_scale_y': 448/700,
    'conversion_scale': 448/800,


    'lang_fusion_type': 'mult',
    'num_post_clip_channels': 2048, 
    'bilinear': True,
    'batch_norm': True, 
    'num_output_channels': 1,

    'lr': 0.001,
}



In [6]:
# Training Loop 


def training_loop(train_loader, valid_seen_loader, valid_unseen_loader, epochs, model, loss_fn, optimizer, scaler, scheduler, config):

    # Metrics 
    metrics = {
        'train_loss': 0,
        'valid_seen_loss': 0,
        'valid_unseen_loss': 0,
        'train_acc_5m': 0, 
        'train_acc_3m': 0, 
        'train_acc_0m': 0, 
        'valid_seen_acc_5m': 0, 
        'valid_seen_acc_3m': 0, 
        'valid_seen_acc_0m': 0, 
        'valid_unseen_acc_5m': 0,
        'valid_unsseen_acc_3m': 0,
        'valid_unsseen_acc_0m': 0,
    }
    best_loss = float('inf')
    # Training 
    for e in range(epochs): 

        model.train()
        train_metrics = train_model(model, train_loader, loss_fn, optimizer, scaler, config)
        
        print(f'Train Loss: {train_metrics["loss"]}')
        print(f'Train Acc0m: {train_metrics["acc0m"]}')
        print(f'Train Acc3m: {train_metrics["acc3m"]}')
        print(f'Train Acc5m: {train_metrics["acc5m"]}')
        
        utils.assign_metrics(metrics, train_metrics, 'train')

        model.eval()

        valid_seen_metrics = eval_model(model, valid_seen_loader, loss_fn, config, 'valid_seen')

        print(f'Valid Seen Loss: {valid_seen_metrics["loss"]}')
        print(f'Valid Seen Acc0m: {valid_seen_metrics["acc0m"]}')
        print(f'Valid Seen Acc3m: {valid_seen_metrics["acc3m"]}')
        print(f'Valid Seen Acc5m: {valid_seen_metrics["acc5m"]}')

        utils.assign_metrics(metrics, valid_seen_metrics, 'valid_seen')

        valid_unseen_metrics = eval_model(model, valid_seen_loader, loss_fn, config, 'valid_unseen')

        print(f'Valid Unseen Loss: {valid_seen_metrics["loss"]}')
        print(f'Valid Unseen Acc0m: {valid_seen_metrics["acc0m"]}')
        print(f'Valid Unseen Acc3m: {valid_seen_metrics["acc3m"]}')
        print(f'Valid Unseen Acc5m: {valid_seen_metrics["acc5m"]}')

        utils.assign_metrics(metrics, valid_unseen_metrics, 'valid_unseen')

        print(metrics)

        if metrics['valid_unseen_loss'] < best_loss:
            save_dict = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            save_path = config['save_path'] + f'_epoch_{e}_loss_{metrics["valid_unseen_loss"]}.pth'
            torch.save(save_dict, save_path)
            best_loss = metrics['valid_unseen_loss']
        
        scheduler.step(metrics['valid_unseen_loss'])


In [7]:
train_dataset = LEDDataset(config['train_path'], config['image_dir'], config)
valid_seen_dataset = LEDDataset(config['train_path'], config['image_dir'], config)
valid_unseen_dataset = LEDDataset(config['train_path'], config['image_dir'], config)
print("Created Datasets, Creating DataLoaders")
train_loader = DataLoader(train_dataset, batch_size=3)
valid_seen_loader = DataLoader(valid_seen_dataset, batch_size=6)
valid_unseen_loader = DataLoader(valid_unseen_dataset, batch_size=6)

Created Datasets, Creating DataLoaders


In [8]:
print("Created DataLoaders, Instantiating Model")
led_clip = LEDModel(config)

Created DataLoaders, Instantiating Model


In [9]:
led_clip.to(config['device']); pass

In [10]:
print("Instantiated Model, Configuring Training Parameters")
loss_fn = nn.KLDivLoss(reduction="batchmean")
optimizer = torch.optim.AdamW(led_clip.parameters(), lr=config['lr'], betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
scaler = torch.cuda.amp.GradScaler()

Instantiated Model, Configuring Training Parameters


In [11]:
training_loop(train_loader, valid_seen_loader, valid_unseen_loader, 10, led_clip, loss_fn, optimizer, scaler, scheduler, config) 


  1%|          | 16/1350 [00:30<41:55,  1.89s/it]


IndexError: index 490 is out of bounds for axis 2 with size 448

In [77]:
a = ["some weird text", "some more odd text" ]

In [126]:
tokens = clip.tokenize(a, truncate=True).to('cuda:0')

# text_feat = self.clip_rn50.encode_text(tokens)
# text_feat = torch.repeat_interleave(text_feat, self.num_maps, 0)

text_mask = torch.where(tokens==0, tokens, 1)  # [1, max_token_len]

RuntimeError: expected scalar type int but found long int

In [128]:
a = torch.tensor(1).int().cuda()

1

In [134]:
a_tok.dtype

torch.int32

torch.float32

In [140]:
a_tok

tensor([[-16130,    836,   5613,   4160, -16129,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0],
        [-16130,    836,    750,  11387,   4160, -16129,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,   

In [142]:
torch.where(a_tok==0, a_tok, )

tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0]], device='cuda:0', dtype=torch.int32)