In [18]:
from torch.utils.data import DataLoader

import numpy as np
import time
import sys
import os
import torch

from models import utils, caption
import spacy
spacy_eng = spacy.load("en")
def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]
from datasets import coco
from configuration import Config
from engine import train_one_epoch, evaluate

In [1]:
import numpy as np
np.long()

0

In [2]:
def main(config):
    device = torch.device(config.device)
    print(f'Initializing Device: {device}')

    seed = config.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)

    model, criterion = caption.build_model(config)
    model.to(device)

    n_parameters = sum(p.numel()
                       for p in model.parameters() if p.requires_grad)
    print(f"Number of params: {n_parameters}")

    param_dicts = [
        {"params": [p for n, p in model.named_parameters(
        ) if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad],
            "lr": config.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(
        param_dicts, lr=config.lr, weight_decay=config.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, config.lr_drop)

    dataset_train = coco.build_dataset(config, mode='training')
    dataset_val = coco.build_dataset(config, mode='validation')
    print(f"Train: {len(dataset_train)}")
    print(f"Valid: {len(dataset_val)}")

    sampler_train = torch.utils.data.RandomSampler(dataset_train)
    sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, config.batch_size, drop_last=True
    )

    data_loader_train = DataLoader(
        dataset_train, batch_sampler=batch_sampler_train, num_workers=config.num_workers)
    data_loader_val = DataLoader(dataset_val, config.batch_size,
                                 sampler=sampler_val, drop_last=False, num_workers=config.num_workers)

    if os.path.exists(config.checkpoint):
        print("Loading Checkpoint...")
        checkpoint = torch.load(config.checkpoint, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        config.start_epoch = checkpoint['epoch'] + 1

    print("Start Training..")
    for epoch in range(config.start_epoch, config.epochs):
        print(f"Epoch: {epoch}")
        epoch_loss = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch, config.clip_max_norm)
        lr_scheduler.step()
        print(f"Training Loss: {epoch_loss}")

        torch.save({
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch,
        }, config.checkpoint)

        validation_loss = evaluate(model, criterion, data_loader_val, device)
        print(f"Validation Loss: {validation_loss}")

        print()


In [8]:
from configuration import Config
config=Config()
# main(config)

### Dataset Creation

In [48]:
from datasets.utils import read_json
import re
file_name = os.path.join(config.dir, 'annotations', 'captions_val2017.json')
ann = read_json(file_name)

In [49]:
def _process(image_id):
    val = str(image_id).zfill(12)
    return val + '.jpg'

In [50]:
annot = [(_process(val['image_id']), re.sub(' +', ' ', val['caption'].replace(".", " ").replace(",", " ").replace(":", " "))) for val in ann['annotations']]

In [51]:
annot[0]

('000000179765.jpg', 'A black Honda motorcycle parked in front of a garage ')

In [52]:
room_words = ['bedroom', 'room', 'table', 'chair', 'recliner', 'pillow', 'cupboard', 'wardrobe', 'dining', 'sofa', 'kitchen', 'clock', 'tv', 'television', 'curtain', 'telephone', 'kitchen', 'fan', 'lamp', 'carpet', 'beanbag', 'fireplace', 'book', 'bookshelf', 'speaker', 'drape', 'plant', 'pot', 'desk', 'mirror', 'bulb', 'fridge', 'refrigerator', 'bathroom']


In [53]:
i=0
annot_txt = open("val_captions.txt", "w")
for id, caption in annot:
    # if any room_word is present in caption, write it in the file
    for word in room_words:
        if word in caption:
            line = id+ "\t"+  caption
            annot_txt.write(line + "\n")
            break
annot_txt.close()

In [51]:
# Read the file containing all caption-image pairs
with open('train_captions.txt', 'r') as file:
    annotations = file.read()

# Store captions and image names in vectors
all_captions = []
all_imgs = []

# splitting the file contents by line
for annot in annotations.split("\n"):
        # Skip empty lines
        if len(annot)<1:
            continue
        caption = annot.split()[1:]
        try:
            image_id = annot.split()[0]
        except:
            print(image_id, ":", caption)

        all_imgs.append(image_id)
        all_captions.append(caption)

In [56]:
train_ids = sorted(set(all_imgs))
len(train_ids)

40054

In [57]:
# Read the file containing all caption-image pairs
with open('val_captions.txt', 'r') as file:
    annotations = file.read()

# Store captions and image names in vectors
all_captions = []
all_imgs = []

# splitting the file contents by line
for annot in annotations.split("\n"):
        # Skip empty lines
        if len(annot)<1:
            continue
        caption = annot.split()[1:]
        try:
            image_id = annot.split()[0]
        except:
            print(image_id, ":", caption)

        all_imgs.append(image_id)
        all_captions.append(caption)

val_ids = sorted(set(all_imgs))
len(val_ids)

1711

Observation:
- **121508 image caption pairs** for training set with 40054 unique images.
- **5090 image caption** pairs for validation set with 1711 unique images.

In [4]:
import spacy
from torchtext.data import Field
import torch
import numpy as np
from configuration import Config

In [21]:
caption = "The room has a sofa"

In [22]:
# Read the file containing all caption-image pairs
with open('train_captions.txt', 'r') as file:
    annotations = file.read()

# Store captions and image names in vectors
all_words = []

# splitting the file contents by line
for annot in annotations.split("\n"):
        # Skip empty lines
        if len(annot)<1:
            continue
        caption = annot.split()[1:]
        caption = " ".join(caption)
        caption = english.tokenize(caption)
        all_words += caption 

all_words = list(set(all_words))
len(all_words)

13838

In [5]:
spacy_eng = spacy.load("en")
def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]
english = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>")

In [24]:
english.build_vocab([all_words])
len(english.vocab.stoi.keys())

13842

In [10]:
torch.save(english, "english.pth")

In [6]:
import torch
english = torch.load('english.pth')

In [15]:
tokens = english.tokenize("Hello this is shailesh's phone")
tokens = ['<sos>'] + tokens + ['<eos>']
numbs = english.numericalize([tokens])

In [26]:
tokens, numbs.T

(['<sos>', 'Hello', 'this', 'is', 'shailesh', "'s", 'phone', '<eos>'],
 tensor([[    2,  1151, 12635,  7719,     0,    13,  9728,     3]]))

In [17]:
toks = numbs.numpy().T[0]
toks

array([    2,  1151, 12635,  7719,     0,    13,  9728,     3])

In [18]:
config=Config()
caption = np.zeros(config.max_position_embeddings)
caption[:len(toks)] = toks
caption

array([2.0000e+00, 1.1510e+03, 1.2635e+04, 7.7190e+03, 0.0000e+00,
       1.3000e+01, 9.7280e+03, 3.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00])

In [28]:
english.vocab.itos[2]

'<sos>'

In [20]:
cap_mask = np.ones(config.max_position_embeddings)
cap_mask[:len(toks)] = 0
cap_mask

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [None]:
##################################

In [4]:
# Code to successfully run coco.build_dataset
import spacy
from configuration import Config
from datasets import coco

config = Config()
spacy_eng = spacy.load("en")
def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]
data = coco.build_dataset(config, mode="training")

In [11]:
image, mask, caption, cap_mask = data[10]

In [25]:
cap_mask

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [None]:
############################################

In [2]:
# Transfer learning

In [1]:
import torch
model = torch.hub.load('facebookresearch/detr', 'detr_resnet50', pretrained=True)

Using cache found in /home/shailesh/.cache/torch/hub/facebookresearch_detr_master


In [2]:
torch.save(model.backbone.state_dict(), "pretrained_wts/backbone.pth")

In [1]:
from torch.utils.data import DataLoader

import numpy as np
import time
import sys
import os
import torch

from models import utils, caption
import spacy
spacy_eng = spacy.load("en")
def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]
from datasets import coco
from configuration import Config
from engine import train_one_epoch, evaluate

In [2]:
my_model, _ = caption.build_model(config=Config())

In [3]:
my_model.backbone.load_state_dict(torch.load("pretrained_wts/backbone.pth"), strict = False)

<All keys matched successfully>

In [1]:
import torch
model = torch.hub.load('saahiluppal/catr', 'v2', pretrained=True)
# Use V2 for sine

Using cache found in /home/shailesh/.cache/torch/hub/saahiluppal_catr_master


In [7]:
torch.save(model.transformer.encoder.state_dict(), "pretrained_wts/trans-encoder.pth")

In [9]:
torch.save(model.transformer.decoder.state_dict(), "pretrained_wts/trans-decoder.pth")

In [3]:
torch.save(model.mlp.layers[0].state_dict(), "pretrained_wts/mlp-layer0.pth")
torch.save(model.mlp.layers[1].state_dict(), "pretrained_wts/mlp-layer1.pth")

In [2]:
(model.mlp.layers[0])

3

In [3]:
my_model.backbone.load_state_dict(torch.load("pretrained_wts/others/backbone.pth"), strict = False)

<All keys matched successfully>

In [4]:
my_model.transformer.encoder.load_state_dict(torch.load("pretrained_wts/others/trans-encoder.pth"), strict = False)

<All keys matched successfully>

In [5]:
my_model.transformer.decoder.load_state_dict(torch.load("pretrained_wts/others/trans-decoder.pth"), strict = False)

<All keys matched successfully>

In [3]:
my_model.load_state_dict(torch.load("pretrained_wts/my_model.pth"))

<All keys matched successfully>

In [6]:
my_model.mlp.layers[0].load_state_dict(torch.load("pretrained_wts/mlp-layer0.pth"))

<All keys matched successfully>

In [7]:
my_model.mlp.layers[1].load_state_dict(torch.load("pretrained_wts/mlp-layer1.pth"))

<All keys matched successfully>

In [8]:
torch.save(my_model.state_dict(), "pretrained_wts/my_model.pth")

In [9]:
with open("val_captions.txt") as file:
    text = file.read()

In [10]:
for i, line in enumerate(text.split("\n")):
    if ".jpg" not in line:
        print(i)

In [None]:
# ################
# Model checkpointing notes:
# 1. Manually look for the latest models/
# 2. Set the model number in ("Loading Checkpoints")

In [1]:
with open("train_captions.txt") as file:
    text = file.read()

In [7]:
longest=0
for line in text.split('\n'):
    caption = line.split()[1:]
    caption = " ".join(caption)
    caption = english.tokenize(caption)
    caption = ['<sos>'] + caption + ['<eos>']
    longest = max(len(caption), longest)

In [8]:
longest

52

In [3]:
tokens = english.tokenize("Hello this is shailesh's phone")
tokens = ['<sos>'] + tokens + ['<eos>']
numbs = english.numericalize([tokens])

49

In [1]:
#########################
import torch
from torch.utils.data import DataLoader

import numpy as np
import time
import sys
import os

from models import utils, caption
from configuration import Config
from engine import train_one_epoch, evaluate

import spacy
spacy_eng = spacy.load("en")
def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]
from datasets import coco

In [2]:
config = Config()

In [3]:
device = torch.device(config.device)
print(f'Initializing Device: {device}')
seed = config.seed + utils.get_rank()
torch.manual_seed(seed)
np.random.seed(seed)
model, criterion = caption.build_model(config)
model.load_state_dict(torch.load("pretrained_wts/my_model.pth"))
model.to(device)
print("Model Loaded")
n_parameters = sum(p.numel()
                   for p in model.parameters() if p.requires_grad)
print(f"Number of params: {n_parameters}")
param_dicts = [
    {"params": [p for n, p in model.named_parameters(
    ) if "backbone" not in n and p.requires_grad]},
    {
        "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad],
        "lr": config.lr_backbone,
    },
]
optimizer = torch.optim.AdamW(
    param_dicts, lr=config.lr, weight_decay=config.weight_decay)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, config.lr_drop)
    
dataset_train = coco.build_dataset(config, mode='training')
dataset_val = coco.build_dataset(config, mode='validation')
print(f"Train: {len(dataset_train)}")
print(f"Valid: {len(dataset_val)}")
sampler_train = torch.utils.data.RandomSampler(dataset_train)
sampler_val = torch.utils.data.SequentialSampler(dataset_val)
batch_sampler_train = torch.utils.data.BatchSampler(
    sampler_train, config.batch_size, drop_last=True
)
data_loader_train = DataLoader(
    dataset_train, batch_sampler=batch_sampler_train, num_workers=config.num_workers)
data_loader_val = DataLoader(dataset_val, config.batch_size,
                             sampler=sampler_val, drop_last=False, num_workers=config.num_workers)
if os.path.exists(config.checkpoint + "12"):
    print("Loading Checkpoint...")
    checkpoint = torch.load(config.checkpoint + "12", map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
    config.start_epoch = checkpoint['epoch'] + 1

Initializing Device: cuda
Model Loaded
Number of params: 52173842
Train: 121086
Valid: 5073
Loading Checkpoint...


In [4]:
print("Loaded Checkpoint:", config.checkpoint + "12")

Loaded Checkpoint: ./checkpoints/checkpoint.pth12


In [5]:
checkpoint['epoch'] + 1

12