In [1]:
import itertools
import torch
import wandb

from transformers import DistilBertTokenizer

import sys
sys.path.append('../src')
from data_utils import preprocess, make_train_valid_dfs, build_loaders
from config import default_config
from clip_utils import CLIPModel
from train_eval_utils import train_epoch, valid_epoch

In [2]:
# Experiment identifiers: `project_name` is the W&B project and `exp_name` names
# the model artifact / checkpoint file used throughout this notebook.
project_name = 'image-captioning-CLIP'
exp_name = 'exp_1a'
config = default_config

# Honour the device requested in the config instead of hard-coding it, but fall
# back to the CPU when a CUDA device is requested and none is usable here.
device = config['device']
if device.startswith('cuda') and not torch.cuda.is_available():
    device = 'cpu'

print(f'{project_name=}\n{exp_name=}\n{device=}')
print(f'{config=}')

project_name='image-captioning-CLIP'
exp_name='exp_1a'
device='cpu'
config={'raw_file_path': '../input/raw/flickr30k/results.csv', 'clean_file_path': '../input/clean/flickr30k/captions.csv', 'image_path': '../input/raw/flickr30k/Images', 'train_size': 0.8, 'batch_size': 32, 'num_workers': 4, 'image_encoder_lr': 0.0001, 'text_encoder_lr': 1e-05, 'projection_head_lr': 0.001, 'weight_decay': 0.001, 'patience': 1, 'factor': 0.8, 'epochs': 2, 'device': 'cuda:0', 'image_size': 224, 'text_tokenizer': 'distilbert-base-uncased', 'max_length': 200, 'image_encoder': 'resnet50', 'text_encoder': 'distilbert-base-uncased', 'pretrained': True, 'trainable': True, 'image_embedding': 2048, 'text_embedding': 768, 'projection_dim': 256, 'dropout': 0.1, 'temperature': 1}


In [3]:
# Produce the clean captions CSV from the raw results file.
# NOTE(review): the meaning of the second positional argument (1) is defined in
# data_utils.preprocess, which is not visible here — confirm before changing it.
preprocess(config['raw_file_path'], 1)

# Use the configured split fraction rather than repeating the literal 0.8.
train_df, valid_df = make_train_valid_dfs(config["clean_file_path"],
                                          config['train_size'])

# Keep only a small slice of each split so this notebook runs quickly.
train_df, valid_df = train_df[:128], valid_df[:64]

tokenizer = DistilBertTokenizer.from_pretrained(config['text_tokenizer'])
train_loader = build_loaders(train_df, tokenizer, mode="train", config=config)
# The validation loader was previously built with mode="train".
# NOTE(review): assumes build_loaders treats a non-"train" mode such as "valid"
# as evaluation mode — confirm against data_utils.build_loaders.
valid_loader = build_loaders(valid_df, tokenizer, mode="valid", config=config)

Created clean csv file ../input/clean/flickr30k/captions.csv


In [4]:
# Short-lived run whose only job is to fetch the latest model artifact for
# this experiment; it is finished as soon as the download completes.
run = wandb.init()
artifact = run.use_artifact(f'richzhu/{project_name}/{exp_name}:latest',
                            type='model')
artifact_dir = artifact.download()
run.finish()

# Run used for the training below. Take the project from `project_name` so it
# cannot drift from the project referenced in the artifact path above.
run = wandb.init(project=project_name, config=config)

[34m[1mwandb[0m: Currently logged in as: [33mrichzhu[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact exp_1a:latest, 346.44MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.2


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016723432633333365, max=1.0…

In [5]:
# Rebuild the model and restore the weights downloaded from the W&B artifact.
model = CLIPModel(config)
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a CUDA device can still be loaded on a CPU-only machine.
state_dict = torch.load(f'{artifact_dir}/{exp_name}.pt', map_location=device)
model.load_state_dict(state_dict)
model.to(device)

# One parameter group per sub-module so each gets its own learning rate.
# Only the projection heads override the optimizer-level weight decay of 0.
params = [
    {"params": model.image_encoder.parameters(),
     "lr": config['image_encoder_lr']},
    {"params": model.text_encoder.parameters(),
     "lr": config['text_encoder_lr']},
    {"params": itertools.chain(
        model.image_projection.parameters(), model.text_projection.parameters()
    ), "lr": config['projection_head_lr'],
        "weight_decay": config["weight_decay"]
    }
]
optimizer = torch.optim.AdamW(params, weight_decay=0.)
# Multiplies the learning rates by `factor` once the monitored value has not
# decreased for more than `patience` scheduler steps.
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", patience=config['patience'], factor=config['factor']
)
# Passed through to train_epoch together with the scheduler.
# NOTE(review): presumably selects per-epoch vs per-batch scheduler stepping —
# confirm in train_eval_utils.train_epoch.
step = "epoch"

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Train for the configured number of epochs. Whenever the validation loss
# improves, save the weights locally and log them to W&B as a model artifact.
checkpoint_path = f'../models/{exp_name}.pt'
best_loss = float('inf')

try:
    for epoch in range(config['epochs']):
        print(f"Epoch: {epoch + 1}")
        model.train()
        # train_loss is not used by the loop itself; it stays available in the
        # kernel for inspection after the cell has run.
        train_loss = train_epoch(model, train_loader, optimizer, lr_scheduler,
                                 step, device)
        model.eval()
        with torch.no_grad():
            valid_loss = valid_epoch(model, valid_loader, device)

        if valid_loss.avg < best_loss:
            best_loss = valid_loss.avg
            torch.save(model.state_dict(), checkpoint_path)
            artifact = wandb.Artifact(exp_name, type='model')
            artifact.add_file(checkpoint_path)
            run.log_artifact(artifact)
            print("Saved Best Model!")

        # The plateau scheduler is driven by the validation loss of each epoch.
        lr_scheduler.step(valid_loss.avg)
finally:
    # Close the W&B run even if training raises or is interrupted, so no run
    # is left open in the kernel.
    run.finish()

Epoch: 1


  0%|          | 0/4 [00:00<?, ?it/s]

[W NNPACK.cpp:51] Could not initialize NNPACK! Reason: Unsupported hardware.


  0%|          | 0/2 [00:00<?, ?it/s]

Saved Best Model!
Epoch: 2


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Saved Best Model!
