In [1]:
import numpy as np
import torch
import torch.nn as nn
from torchvision.models import resnet50
import pkbar
from tqdm import tqdm
import json
from model import LSTM
import dataset_factory
import coco_dataset
import vocab
import warnings
warnings.filterwarnings("ignore")

# Load the experiment configuration. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open for the whole
# kernel session; `f` stays bound (to the closed file) as before.
with open("./default.json") as f:
    config = json.load(f)

In [2]:
# Pick the compute device once: use CUDA when PyTorch reports it as
# available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


## Prepare Data

In [3]:
# Instantiate the project's Vocabulary with its default constructor arguments;
# it is handed to CocoDataset in the next cell.
vocab_wrapper = vocab.Vocabulary()

In [4]:
# Build a CocoDataset from the training annotation / ids paths and the image
# size listed under the "dataset" section of the config.
_dataset_cfg = config["dataset"]
coco_train_obj = coco_dataset.CocoDataset(
    _dataset_cfg["images_root_dir"],
    _dataset_cfg["training_annotation_file_path"],
    _dataset_cfg["training_ids_file_path"],
    vocab_wrapper,
    _dataset_cfg["img_size"],
)

loading annotations into memory...
Done (t=1.06s)
creating index...
index created!


In [5]:
# Let the project's dataset factory build everything from the config: it
# returns five objects, unpacked here in the order the factory yields them.
(
    coco_test,
    vocabulary,
    train_data_loader,
    val_data_loader,
    test_data_loader,
) = dataset_factory.get_datasets(config)

loading annotations into memory...
Done (t=0.81s)
creating index...
index created!
loading annotations into memory...
Done (t=0.61s)
creating index...
index created!
Using the saved vocab.
loading annotations into memory...
Done (t=0.86s)
creating index...
index created!
loading annotations into memory...
Done (t=0.91s)
creating index...
index created!
loading annotations into memory...
Done (t=0.45s)
creating index...
index created!


## ResNet Encoder

In [None]:
def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=config["experiment"]["num_epochs"]):
    """Train a classifier-style model and keep the best checkpoint by validation accuracy.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free evaluation pass over ``val_loader``.

    Args:
        model: Module called as ``model(batch)``; its output is scored along
            dimension 1 to obtain predicted class indices.
        criterion: Loss callable invoked as ``criterion(output, labels)``.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` drive training.
        train_loader: Iterable of ``(data, labels)`` batches exposing ``.dataset``.
        val_loader: Iterable of ``(data, labels)`` batches exposing ``.dataset``.
        epochs: Number of epochs; the default is read from ``config`` once,
            when this function is defined.

    Returns:
        Tuple ``(train_losses, val_losses, train_accs, val_accs, best_model)``.
        The four lists hold one value per epoch. ``best_model`` is a deep copy
        of ``model`` taken at the epoch with the highest validation accuracy
        (``None`` when ``epochs`` is 0).

    Note:
        Losses are the sum of the per-batch criterion values divided by the
        number of samples in the dataset, exactly as before; with a
        mean-reducing criterion this is not a per-sample average.
    """
    # Local import: the notebook's import cell does not import `copy`, which
    # the best-checkpoint snapshot below needs.
    import copy

    # Bar to keep track of training progress
    bar = pkbar.Pbar(name="Training in progress.", target=epochs)

    # Values to return
    best_model = None
    best_val_acc = None
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []

    for epoch in range(epochs):
        # Re-enter train mode every epoch: the validation pass below switches
        # the model to eval mode, which would otherwise persist.
        model.train()
        epoch_loss = 0
        train_correct = 0
        for data, labels in train_loader:
            # Move the batch to the compute device
            data, labels = data.to(device), labels.to(device)
            optimizer.zero_grad()
            # Forward pass, then backprop
            y_hat = model(data)
            loss = criterion(y_hat, labels)
            loss.backward()
            optimizer.step()
            # Tally loss and number of correct predictions
            epoch_loss += loss.item()
            preds = y_hat.detach().argmax(dim=1)
            train_correct += (preds == labels).sum().item()

        # Validation pass: no gradients, eval-mode layers
        model.eval()
        val_loss_value = 0
        val_correct = 0
        with torch.no_grad():
            for val_data, val_labels in val_loader:
                val_data, val_labels = val_data.to(device), val_labels.to(device)
                val_y_hat = model(val_data)
                val_loss_value += criterion(val_y_hat, val_labels).item()
                # Score the *validation* outputs against the *validation*
                # labels (previously the last training batch was reused here).
                val_preds = val_y_hat.argmax(dim=1)
                val_correct += (val_preds == val_labels).sum().item()

        # Snapshot the model whenever validation accuracy strictly improves
        last_val_acc = val_correct / len(val_loader.dataset)
        if best_val_acc is None or best_val_acc < last_val_acc:
            best_val_acc = last_val_acc
            best_model = copy.deepcopy(model)

        # Record per-epoch metrics
        train_accs.append(train_correct / len(train_loader.dataset))
        train_losses.append(epoch_loss / len(train_loader.dataset))
        val_losses.append(val_loss_value / len(val_loader.dataset))
        val_accs.append(last_val_acc)

        # Update progress bar
        bar.update(epoch)
    return train_losses, val_losses, train_accs, val_accs, best_model

In [8]:
# Instantiate torchvision's ResNet-50, requesting pretrained weights.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour
# of the `weights=` argument -- confirm against the installed version.
resnet = resnet50(pretrained=True)

In [9]:
# Freeze every existing ResNet weight so no gradients are computed for them.
for param in resnet.parameters():
    param.requires_grad_(False)

# Replace the final fully connected layer with a new linear projection from
# the backbone's feature width to the configured hidden size. The new layer
# is created after the freeze above, so its parameters keep the default
# requires_grad=True.
in_features = resnet.fc.in_features
resnet.fc = nn.Linear(
    in_features=in_features,
    out_features=config["model"]["hidden_size"],
)

In [None]:
# Move the encoder to the selected device (trailing `;` suppresses the repr).
resnet.to(device);

# Push every training batch through the encoder; the output is discarded, so
# this only exercises the data pipeline and the forward pass.
# NOTE(review): this runs with autograd enabled and in whatever train/eval
# mode `resnet` is currently in -- consider `resnet.eval()` plus
# `torch.no_grad()` if this is meant as a pure smoke test.
for data in tqdm(train_data_loader):
    # assumes the first element of each batch is the image tensor -- TODO confirm
    d = data[0].to(device)
    resnet(d)

In [None]:
# Build the project's LSTM from three values in the "model" config section and
# move it to the selected device (trailing `;` suppresses the repr).
# NOTE(review): this cell reads config["model"]["neurons"], while Model.fit
# reads config["model"]['num_neurons'] -- confirm which key the config file
# actually defines.
lstm = LSTM(config["model"]["hidden_size"], config["model"]["neurons"], config["model"]["layers"])
lstm.to(device);

In [None]:
class Model:
    """
    """

    def __init__(self, encoder, decoder):
        """Bundle an encoder and a decoder together with their loss and optimizer.

        Both modules are moved to the notebook-level ``device`` before the
        optimizer is built. The optimizer covers every decoder parameter plus
        any encoder parameter that is still trainable (``requires_grad=True``),
        so a freshly replaced encoder head is actually updated during training
        while frozen encoder weights are left out. With a fully frozen encoder
        this is identical to optimising the decoder alone.

        Args:
            encoder: Module exposing ``.to()`` and ``.parameters()``; stored as
                ``self.encoder``.
            decoder: Module exposing ``.to()`` and ``.parameters()``; stored as
                ``self.decoder``.
        """
        self.encoder = encoder
        self.decoder = decoder
        # Place both modules on the device first so the optimizer is created
        # over parameters that already live where training will happen.
        self.encoder.to(device)
        self.decoder.to(device)

        self.criterion = nn.CrossEntropyLoss()

        # All decoder parameters (as before) + trainable encoder parameters.
        # De-duplicate by identity in case the two modules share a parameter.
        trainable_params = list(self.decoder.parameters())
        already_listed = {id(p) for p in trainable_params}
        trainable_params.extend(
            p for p in self.encoder.parameters()
            if p.requires_grad and id(p) not in already_listed
        )
        self.optimizer = torch.optim.AdamW(trainable_params, lr=config["experiment"]['learning_rate'])
        # TODO: optional LR scheduler, e.g.
        # self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min', patience=3, factor=0.8, min_lr=config["experiment"]['learning_rate'] * 0.1, verbose=True)

    def fit(self, x, y, epochs=config["experiment"]["num_epochs"]):
        """Unfinished training/validation loop for the encoder-decoder pair.

        NOTE(review): this method cannot currently run to completion. Large
        parts of the body read names that are not defined in the visible part
        of this notebook (``validation_losses``, ``model``, ``train_songs``,
        ``val_songs``, ``computing_device``, ``MyDataset``, bare ``criterion``)
        and flat config keys (``config['num_layers']``, ``config['num_neurons']``,
        ``config['chunk_length']``) that differ from the nested
        ``config["model"][...]`` lookups used elsewhere. It presumably started
        life as a song/character-level training script -- confirm, then port it
        to ``self.encoder`` / ``self.decoder`` / ``self.criterion`` /
        ``self.optimizer`` before use.

        Args:
            x: Passed to ``self.encoder`` after the epoch loop.
            y: Not referenced in the visible body.
            epochs: Number of iterations of the outer loop. The default is read
                from ``config`` once, when the class body is executed.
        """
        for epoch in tqdm(range(epochs)):
            # NOTE(review): this formats `epochs + 1` (the total, plus one), not
            # the loop variable `epoch` -- almost certainly meant `epoch + 1`.
            print("Training Epoch {}".format(epochs + 1))
            # A new progress bar is created every epoch; nothing visible updates it.
            bar = pkbar.Pbar(name='Training in Process', target=epochs)
            
            # NOTE(review): these trackers are re-initialised on every epoch, so
            # nothing accumulates across epochs as written.
            total_loss = 0
            best_model = None
            train_losses = []
            val_losses = []
            train_accs = []
            val_accs = []
            best_val_acc = None

            # Previous validation loss is the smallest loss
            # NOTE(review): `validation_losses` is not defined in view (the list
            # created above is named `val_losses`).
            prev_val_loss = np.min(validation_losses)
            

            # Train
            # NOTE(review): `model` is not an attribute or argument here.
            model.train()
            # Iterates the notebook-level `train_data_loader`; the batch `i` is
            # not used by the live code below.
            for i in train_data_loader:
                # Fresh zero-valued (hidden, cell) state pair for every batch
                hidden_state = torch.zeros((config["model"]['layers'], 1, config["model"]['num_neurons'])).to(device)
                hidden_state = hidden_state.float()
                cell_state = torch.zeros((config["model"]['layers'], 1, config["model"]['num_neurons'])).to(device)
                cell_state = cell_state.float()
                hidden = (hidden_state, cell_state)

                loss = 0
                # The triple-quoted block below is a bare string expression used
                # to disable the old per-song training code; it is evaluated and
                # discarded, so no forward/backward pass happens in this loop.
                '''
                # Encode the characters to their respective one hot encoding
                dataset = MyDataset(song, config['chunk_length'])
                num_minibatches = len(dataset)
                for i in range(num_minibatches):
                    model.zero_grad()
                    chunk, targets = dataset[i]

                    if len(chunk) == 0:
                        break

                    # Send minibatch to computing device
                    chunk = chunk.to(computing_device)
                    targets = targets.to(computing_device)

                    # Forward pass
                    output, hidden = model(chunk, hidden)

                    # Compute loss
                    targets = targets.argmax(dim=1)
                    loss = criterion(output, targets)
                    song_loss += loss

                    # Backwards pass
                    loss.backward()
                    optimizer.step()

                    # Detach hidden state from LSTM for next TBPTT chunk
                    hidden = (hidden[0].detach(), hidden[1].detach())

                total_loss += song_loss / num_minibatches
            '''
            # NOTE(review): `train_songs` is not defined in view, and
            # `total_loss` is still the int 0 here, so `.cpu()` below would fail.
            average_epoch_loss = total_loss / len(train_songs)
            print(f"Epoch {str(epoch + 1)} with training error {str(average_epoch_loss.cpu().item())}")

            
            
            total_loss = 0
            # Validation
            # NOTE(review): everything in this section still uses names from
            # the song-based script (`val_songs`, `computing_device`,
            # `MyDataset`, bare `model` / `criterion`) and flat config keys.
            model.eval()
            with torch.no_grad():
                for song in val_songs:
                    # Set states to 0 at the beginning of each song
                    hidden_state = torch.zeros((config['num_layers'], 1, config['num_neurons'])).to(computing_device)
                    hidden_state = hidden_state.float()
                    cell_state = torch.zeros((config['num_layers'], 1, config['num_neurons'])).to(computing_device)
                    cell_state = cell_state.float()
                    hidden = (hidden_state, cell_state)

                    song_loss = 0

                    # Encode the characters to their respective one hot encoding
                    dataset = MyDataset(song, config['chunk_length'])
                    num_minibatches = len(dataset)
                    for i in range(num_minibatches):
                        chunk, targets = dataset[i]
                        if len(chunk) == 0:
                            break

                        # Send minibatch to computing device
                        chunk = chunk.to(computing_device)
                        targets = targets.to(computing_device)

                        # Forward pass
                        output, hidden = model(chunk, hidden)

                        # Compute loss
                        targets = targets.argmax(dim=1)
                        loss = criterion(output, targets)
                        song_loss += loss

                    # Mean loss over this song's minibatches
                    total_loss += song_loss / num_minibatches

            average_val_epoch_loss = total_loss / len(val_songs)
            print(f"Epoch {str(epoch + 1)} with validation error {str(average_val_epoch_loss.cpu().item())}")

        # NOTE(review): the result of `.to(device)` is discarded; if `x` is a
        # torch.Tensor the call is not in-place, so `x` stays where it was --
        # confirm and assign the result if a device move is intended.
        x.to(device)
        encoder_output = self.encoder(x)  # output of convolutional network