In [None]:
from data_loader import *
from evaluate_captions import *
import csv
from build_vocab import *
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import time

In [None]:
# Dataset locations, relative to the notebook's working directory.
train_json = './data/annotations/captions_train2014.json'
test_json = './data/annotations/captions_val2014.json'
train_root = './data/images/train/'
test_root = './data/images/test/'

# The vocabulary is built from the training annotation file.
vocab = build_vocab(train_json)


def load_image_ids(csv_path):
    """Return the image ids stored on the first row of a CSV file.

    Only the first row is read; every cell on it is converted with ``int``.

    Args:
        csv_path: path of the CSV file holding the ids.

    Returns:
        A list of ints, in file order.

    Raises:
        ValueError: if the file contains no rows, or a cell is not an integer.
    """
    # newline='' is what the csv module asks for so that it can handle line
    # endings (and newlines embedded in quoted fields) itself.
    with open(csv_path, 'r', newline='') as f:
        first_row = next(csv.reader(f), None)
    if first_row is None:
        raise ValueError('no image ids found in {!r}'.format(csv_path))
    return [int(cell) for cell in first_row]


all_train_ids = load_image_ids('TrainImageIds.csv')

# Hold out the trailing ceil(20%) of the training ids for validation. This is
# the same split the expression `ids[-len(ids)//5:]` produces (floor division
# of a negative number rounds the hold-out size up), just spelled explicitly.
n_val = (len(all_train_ids) + 4) // 5
split_at = len(all_train_ids) - n_val
valIds = all_train_ids[split_at:]
trainIds = all_train_ids[:split_at]

testIds = load_image_ids('TestImageIds.csv')

In [None]:
# Per-channel normalisation constants.
# NOTE(review): these look like the usual ImageNet statistics used with
# torchvision's pretrained backbones - confirm against the encoder in use.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Every image is resized to 300x300, converted to a tensor, then normalised.
_preprocess_steps = [
    transforms.Resize(size=(300, 300)),
    transforms.ToTensor(),
    normalize,
]
tsfm = transforms.Compose(_preprocess_steps)

# Options shared by all three loaders.
# NOTE(review): shuffle=True is also applied to the validation and test
# loaders; consider disabling it there if a fixed evaluation order is wanted.
_loader_opts = dict(transform=tsfm, batch_size=10, shuffle=True, num_workers=4)

train_loader = get_loader(train_root, train_json, trainIds, vocab, **_loader_opts)
# The validation loader reads from the training image root / annotation file,
# restricted to valIds.
val_loader = get_loader(train_root, train_json, valIds, vocab, **_loader_opts)
test_loader = get_loader(test_root, test_json, testIds, vocab, **_loader_opts)

In [None]:
# Model hyperparameters.
embed_dim = 100
# NOTE(review): `hiddem_dim` is a misspelling of "hidden_dim"; the name is kept
# as-is because other cells may reference it.
hiddem_dim = 100
vocab_size = vocab.idx

# Baseline captioner: encoder from model.res50_encoder feeding a vanilla nn.RNN.
image_encoder = model.res50_encoder(embed_dim)
baseline = model.Img_Caption(
    encoder=image_encoder,
    rnn=nn.RNN,
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hiddem_dim,
)

# Adam with a small L2 penalty; cross-entropy as the training loss.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(baseline.parameters(), lr=1e-4, weight_decay=1e-5)

In [None]:

# Run on the GPU whenever CUDA is available; otherwise leave the model where it is.
use_gpu = torch.cuda.is_available()
baseline = baseline.cuda() if use_gpu else baseline


def train(mod, epochs):
    """Train ``mod`` on ``train_loader`` and checkpoint the best epoch.

    Relies on notebook globals defined in other cells: ``train_loader``,
    ``optimizer``, ``criterion``, ``use_gpu`` and ``val``.

    Args:
        mod: model invoked as ``mod(imgs, caps, lengths)``; its output is fed
            to ``criterion`` together with the packed captions.
        epochs: number of passes over ``train_loader``.

    Side effects:
        Prints the loss every 100 iterations and the wall time of each epoch.
        After each epoch ``val(epoch)`` is called (presumably returning the
        validation loss - confirm against its definition); whenever the
        returned value improves on the best seen so far, the whole model
        object is written to ``'best_model'`` with ``torch.save``.
        The model is left in training mode on return.
    """
    best_loss = float('inf')
    for epoch in range(epochs):
        # Make sure training mode is active even if the caller handed us a
        # model that was last used for evaluation.
        mod.train()
        ts = time.time()
        for i, (imgs, caps, lengths) in enumerate(train_loader):
            optimizer.zero_grad()

            if use_gpu:
                # Only the tensors fed to the model are moved; `lengths` is
                # left untouched on purpose.
                imgs = imgs.cuda()
                caps = caps.cuda()

            outputs = mod(imgs, caps, lengths)
            # Pack the captions with their lengths and take the packed data
            # tensor as the target for the criterion.
            targets = nn.utils.rnn.pack_padded_sequence(caps, lengths, batch_first=True)[0]
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                # Report the loop index `i` (not the builtin `iter`).
                print("epoch{}, iter{}, loss: {}".format(epoch, i, loss.item()))

        print("Finish epoch {}, time elapsed {}".format(epoch, time.time() - ts))

        epoch_loss = val(epoch)
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save(mod, 'best_model')
        # val() may have switched the model to eval mode; restore training
        # mode so the state on return does not depend on it.
        mod.train()

In [None]:
# Number of full passes over the training set for the baseline run.
epochs = 10
train(mod=baseline, epochs=epochs)