In [51]:
# All imports
import argparse
import itertools
from tensorboardX import SummaryWriter
import torch
from torch import nn, optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from tqdm import tqdm
import yaml
from bisect import bisect

# All File Imports
import import_ipynb
from dataset import VisDialDataset
from encoder import LateFusionEncoder
from decoder import DiscriminativeDecoder
from utils.metrics import SparseGTMetrics, NDCG
from model import EncoderDecoderModel
# checkpointing file missing (Resolve later)
from utils.checkpointing import CheckpointManager, load_checkpoint


In [4]:
# Command-line interface for the training run.
# Options are split into three sections in `--help`: data/config paths,
# runtime options that do not affect reproducibility, and checkpointing.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--config-yml",
    default="configs/lf_disc_faster_rcnn_x101.yml",
    help="Path to a config file listing reader, model and solver parameters.",
)
parser.add_argument(
    "--train-json",
    default="data/visdial_1.0_train.json",
    help="Path to json file containing VisDial v1.0 training data.",
)
parser.add_argument(
    "--val-json",
    default="data/visdial_1.0_val.json",
    help="Path to json file containing VisDial v1.0 validation data.",
)
parser.add_argument(
    "--val-dense-json",
    default="data/visdial_1.0_val_dense_annotations.json",
    help="Path to json file containing VisDial v1.0 validation dense ground "
    "truth annotations.",
)


# `add_argument_group` returns the group object; arguments must be added to
# that object (not to `parser`) for them to be listed under the group title.
runtime_group = parser.add_argument_group(
    "Arguments independent of experiment reproducibility"
)
runtime_group.add_argument(
    "--gpu-ids",
    nargs="+",
    type=int,
    # With nargs="+" a parsed value is always a list, so the default is a
    # list too; downstream code can index it without special-casing an int.
    default=[0],
    help="List of ids of GPUs to use.",
)
runtime_group.add_argument(
    "--cpu-workers",
    type=int,
    default=4,
    help="Number of CPU workers for dataloader.",
)
runtime_group.add_argument(
    "--overfit",
    action="store_true",
    help="Overfit model on 5 examples, meant for debugging.",
)
runtime_group.add_argument(
    "--validate",
    action="store_true",
    help="Whether to validate on val split after every epoch.",
)
runtime_group.add_argument(
    "--in-memory",
    action="store_true",
    help="Load the whole dataset and pre-extracted image features in memory. "
    "Use only in presence of large RAM, atleast few tens of GBs.",
)


checkpoint_group = parser.add_argument_group("Checkpointing related arguments")
checkpoint_group.add_argument(
    "--save-dirpath",
    default="checkpoints/",
    help="Path of directory to create checkpoint directory and save "
    "checkpoints.",
)
checkpoint_group.add_argument(
    "--load-pthpath",
    default="",
    help="To continue training, path to .pth file of saved checkpoint.",
)

_StoreAction(option_strings=['--load-pthpath'], dest='load_pthpath', nargs=None, const=None, default='', type=None, choices=None, help='To continue training, path to .pth file of saved checkpoint.', metavar=None)

In [6]:
# Reproducibility setup for the training configuration.
# Request deterministic cuDNN algorithms and switch cuDNN benchmarking off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Seed the default generator and every CUDA generator with the same value.
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(0)

In [16]:
# Parse an empty argument list so every option takes its default; inside a
# notebook there is no command line to read. Pass explicit strings here
# (e.g. ["--gpu-ids", "-1"]) to override a default.
args = parser.parse_args([])

# Read the experiment configuration. The context manager closes the file, and
# `safe_load` builds only plain Python objects and does not depend on the
# `Loader` argument that a bare `yaml.load(stream)` warns about or requires.
with open(args.config_yml) as config_file:
    config = yaml.safe_load(config_file)

  after removing the cwd from sys.path.


In [29]:
# Select the compute device from `--gpu-ids`.
# A bare int is wrapped into a list so the first id can be indexed uniformly.
if isinstance(args.gpu_ids, int):
    args.gpu_ids = [args.gpu_ids]

if args.gpu_ids[0] >= 0:
    device = torch.device("cuda", args.gpu_ids[0])
    # Only a CUDA device may be made current; calling `set_device` with a CPU
    # device raises, which previously made the negative-id (CPU) path unusable.
    torch.cuda.set_device(device)
else:
    # A negative first id selects the CPU.
    device = torch.device("cpu")

# Print config and args.
print(yaml.dump(config, default_flow_style=False))
for arg in vars(args):
    print("{:<20}: {}".format(arg, getattr(args, arg)))


dataset:
  concat_history: true
  image_features_test_h5: data/features_faster_rcnn_x101_test.h5
  image_features_train_h5: data/features_faster_rcnn_x101_train.h5
  image_features_val_h5: data/features_faster_rcnn_x101_val.h5
  img_norm: 1
  max_sequence_length: 20
  vocab_min_count: 5
  word_counts_json: data/visdial_1.0_word_counts_train.json
model:
  decoder: disc
  dropout: 0.5
  encoder: lf
  img_feature_size: 2048
  lstm_hidden_size: 512
  lstm_num_layers: 2
  word_embedding_size: 300
solver:
  batch_size: 128
  initial_lr: 0.01
  lr_gamma: 0.1
  lr_milestones:
  - 4
  - 7
  - 10
  num_epochs: 20
  training_splits: train
  warmup_epochs: 1
  warmup_factor: 0.2

config_yml          : configs/lf_disc_faster_rcnn_x101.yml
train_json          : data/visdial_1.0_train.json
val_json            : data/visdial_1.0_val.json
val_dense_json      : data/visdial_1.0_val_dense_annotations.json
gpu_ids             : [0]
cpu_workers         : 4
overfit             : False
validate            : 

### SETTING UP DATASET, DATALOADER, MODEL, CRITERION, OPTIMIZER, SCHEDULER

In [18]:
# Training split. The two decoder-dependent flags are opposites of each other:
# with the "disc" decoder answer options are returned and boundary tokens are
# not added; with any other decoder it is the reverse.
uses_disc_decoder = config["model"]["decoder"] == "disc"
train_dataset = VisDialDataset(
    config["dataset"],
    args.train_json,
    overfit=args.overfit,
    in_memory=args.in_memory,
    num_workers=args.cpu_workers,
    return_options=uses_disc_decoder,
    add_boundary_toks=not uses_disc_decoder,
)

  0%|          | 0/376083 [00:00<?, ?it/s]

[train] Tokenizing questions...


100%|██████████| 376083/376083 [00:26<00:00, 14408.26it/s]
  0%|          | 847/337528 [00:00<00:40, 8320.41it/s]

[train] Tokenizing answers...


100%|██████████| 337528/337528 [00:20<00:00, 16489.59it/s]


[train] Tokenizing captions...


100%|██████████| 123287/123287 [00:08<00:00, 15041.58it/s]


In [21]:
# Shuffled training batches: batch size comes from the solver config and the
# worker count from `--cpu-workers`.
train_dataloader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=config["solver"]["batch_size"],
    num_workers=args.cpu_workers,
)

In [23]:
# Validation split, built with the dense annotations file as well. Answer
# options are always returned here; boundary tokens are added only when the
# decoder is not "disc".
needs_boundary_toks = config["model"]["decoder"] != "disc"
val_dataset = VisDialDataset(
    config["dataset"],
    args.val_json,
    args.val_dense_json,
    overfit=args.overfit,
    in_memory=args.in_memory,
    num_workers=args.cpu_workers,
    return_options=True,
    add_boundary_toks=needs_boundary_toks,
)

  4%|▍         | 1724/45238 [00:00<00:02, 17227.32it/s]

[val2018] Tokenizing questions...


100%|██████████| 45238/45238 [00:02<00:00, 20000.75it/s]
  5%|▍         | 1634/34822 [00:00<00:02, 16330.98it/s]

[val2018] Tokenizing answers...


100%|██████████| 34822/34822 [00:01<00:00, 19314.28it/s]
100%|██████████| 2064/2064 [00:00<00:00, 18206.69it/s]


[val2018] Tokenizing captions...


In [26]:
# Validation batches are not shuffled. The configured batch size is used only
# for the "disc" decoder; every other decoder gets a batch size of 5.
if config["model"]["decoder"] == "disc":
    val_batch_size = config["solver"]["batch_size"]
else:
    val_batch_size = 5

val_dataloader = DataLoader(
    val_dataset,
    batch_size=val_batch_size,
    num_workers=args.cpu_workers,
)

In [27]:
# Both modules receive the model config together with the training
# vocabulary, which they need to construct their embedding layers.
model_config = config["model"]
encoder = LateFusionEncoder(model_config, train_dataset.vocabulary)
decoder = DiscriminativeDecoder(model_config, train_dataset.vocabulary)

# Report the encoder/decoder names recorded in the config.
print("Encoder: {}".format(model_config["encoder"]))
print("Decoder: {}".format(model_config["decoder"]))

Encoder: lf
Decoder: disc


In [28]:
# Share the word embedding between encoder and decoder: the decoder's
# `word_embed` attribute is rebound to the encoder's, so both refer to the
# same object from here on.
decoder.word_embed = encoder.word_embed

In [31]:
# Combine encoder and decoder into one trainable module and move it to the
# selected device.
model = EncoderDecoderModel(encoder, decoder)
model = model.to(device)

# A -1 among the GPU ids means CPU execution, so DataParallel is skipped then.
runs_on_gpu = -1 not in args.gpu_ids
if runs_on_gpu:
    model = nn.DataParallel(model, device_ids=args.gpu_ids)

# Loss function, chosen by decoder type. Both supported decoders use
# cross-entropy; the "gen" variant additionally ignores targets equal to the
# vocabulary's PAD index. Any other decoder name is rejected.
decoder_type = config["model"]["decoder"]
if decoder_type == "gen":
    criterion = nn.CrossEntropyLoss(
        ignore_index=train_dataset.vocabulary.PAD_INDEX
    )
elif decoder_type == "disc":
    criterion = nn.CrossEntropyLoss()
else:
    raise NotImplementedError

# Training iterations per epoch.
# Counted from the dataloaders so the value equals the number of batches the
# training loop actually consumes (the loop chains the train and val loaders
# for "trainval"). Dividing dataset sizes by the batch size, as before, was
# inconsistent between the two branches and produced 0 whenever the dataset
# held fewer examples than one batch, which breaks the LR schedule's
# `current_iteration / iterations`.
if config["solver"]["training_splits"] == "trainval":
    iterations = len(train_dataloader) + len(val_dataloader)
else:
    iterations = len(train_dataloader)

In [32]:
# The learning rate is not constant over training: this function maps the
# current iteration to a multiplier. The LambdaLR scheduler created right
# after this function uses it as `lr_lambda`, scaling the `initial_lr` given
# to the Adamax optimizer.

def lr_lambda_fun(current_iteration: int) -> float:
    """Return the learning-rate multiplier for ``current_iteration``.

    Until ``warmup_epochs`` the multiplier rises linearly from
    ``warmup_factor`` to 1.0; afterwards it is ``lr_gamma`` raised to the
    number of entries in ``lr_milestones`` that the current (fractional)
    epoch has reached.

    Reads the module-level ``config["solver"]`` mapping and ``iterations``
    (iterations per epoch).

    Args:
        current_iteration: Global iteration count since the start of training.

    Returns:
        The factor by which the optimizer's base learning rate is multiplied.
    """
    solver = config["solver"]
    warmup_epochs = solver["warmup_epochs"]

    # Guard against a zero iterations-per-epoch value, which would otherwise
    # raise ZeroDivisionError; treat it as one iteration per epoch.
    steps_per_epoch = max(iterations, 1)
    current_epoch = float(current_iteration) / steps_per_epoch

    # `warmup_epochs > 0` skips the warm-up branch entirely when warm-up is
    # disabled; without it, iteration 0 would compute 0 / 0.
    if warmup_epochs > 0 and current_epoch <= warmup_epochs:
        alpha = current_epoch / float(warmup_epochs)
        return solver["warmup_factor"] * (1.0 - alpha) + alpha

    # `bisect` counts the milestones that are <= the current epoch.
    idx = bisect(solver["lr_milestones"], current_epoch)
    return pow(solver["lr_gamma"], idx)

# Adamax over all model parameters, starting from the configured learning rate.
initial_lr = config["solver"]["initial_lr"]
optimizer = optim.Adamax(model.parameters(), lr=initial_lr)

# LambdaLR multiplies that base rate by whatever `lr_lambda_fun` returns.
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda_fun)

### Setting Up Before the Training Loop

In [47]:
# TensorBoard writer and checkpoint manager both write under `--save-dirpath`.
summary_writer = SummaryWriter(log_dir=args.save_dirpath)
checkpoint_manager = CheckpointManager(
    model, optimizer, args.save_dirpath, config=config
)

# Metric accumulators.
sparse_metrics = SparseGTMetrics()
ndcg = NDCG()

# Resume from a checkpoint when `--load-pthpath` is given; otherwise start
# from epoch 0.
if args.load_pthpath != "":
    # The starting epoch is read from the file name: the text after the last
    # underscore, with the final four characters (the extension) removed.
    epoch_token = args.load_pthpath.rsplit("_", 1)[-1]
    start_epoch = int(epoch_token[:-4])

    model_state_dict, optimizer_state_dict = load_checkpoint(args.load_pthpath)
    # Under DataParallel the weights belong to the wrapped module.
    target_module = model.module if isinstance(model, nn.DataParallel) else model
    target_module.load_state_dict(model_state_dict)
    optimizer.load_state_dict(optimizer_state_dict)
    print("Loaded model from {}".format(args.load_pthpath))
else:
    start_epoch = 0

### Training Loop

In [55]:
# Global step counter used for TensorBoard logging and the LR schedule; it is
# offset by the epochs already completed when resuming.
global_iteration_step = start_epoch * iterations

for epoch in range(start_epoch, config["solver"]["num_epochs"]):

    # Build this epoch's batch stream: the training loader alone, or the
    # training loader followed by the validation loader for "trainval".
    epoch_loaders = [train_dataloader]
    if config["solver"]["training_splits"] == "trainval":
        epoch_loaders.append(val_dataloader)
    combined_dataloader = itertools.chain(*epoch_loaders)

    print(f"\nTraining for epoch {epoch}:")
    for step_in_epoch, batch in enumerate(tqdm(combined_dataloader)):
        # Move every entry of the batch to the target device.
        for key, value in batch.items():
            batch[key] = value.to(device)

        # Clear gradients left over from the previous step, then run the model.
        optimizer.zero_grad()
        output = model(batch)

        # The "disc" decoder is trained against `ans_ind`; any other decoder
        # against `ans_out`.
        if config["model"]["decoder"] == "disc":
            target = batch["ans_ind"]
        else:
            target = batch["ans_out"]

        # Flatten to (rows, last output dim) vs. a flat target vector before
        # computing the loss.
        flat_output = output.view(-1, output.size(-1))
        flat_target = target.view(-1)
        batch_loss = criterion(flat_output, flat_target)

        # Backpropagate and update the parameters.
        batch_loss.backward()
        optimizer.step()

        # Log the loss and the current learning rate for this step.
        summary_writer.add_scalar(
            "train/loss", batch_loss, global_iteration_step
        )
        current_lr = optimizer.param_groups[0]["lr"]
        summary_writer.add_scalar(
            "train/lr", current_lr, global_iteration_step
        )

        # Advance the LR schedule to this step, then count the step.
        scheduler.step(global_iteration_step)
        global_iteration_step += 1

    # Release cached GPU memory at the end of each epoch.
    torch.cuda.empty_cache()

        

0it [00:00, ?it/s]


Training for epoch 0:


0it [00:15, ?it/s]


TypeError: view() received an invalid combination of arguments - got (dict, int, int, int), but expected one of:
 * (tuple of ints size)
 * (torch.dtype dtype)
