# Starting out

In [None]:
# Mount Google Drive under /content/drive (Colab only); later cells copy the
# zipped data from it and write the trained model checkpoints back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the data archive from Drive into the working directory, extract it
# quietly, then delete the local copy of the archive.
!cp /content/drive/MyDrive/Kool/Ülikool/'3. aasta'/Lõputöö/data.zip .
!unzip -q data.zip
!rm data.zip

# Same three steps for the test_val archive.
!cp /content/drive/MyDrive/Kool/Ülikool/'3. aasta'/Lõputöö/test_val.zip .
!unzip -q test_val.zip
!rm test_val.zip

In [None]:
# Remove any previous checkout so the clone below starts from a clean directory.
!rm -rf line-chart-captioning/
# Clone the captioning repository.
!git clone https://github.com/snemvalts/line-chart-captioning
# Empty the repository's data folder, then move the contents of both extracted
# archives (data/ and test_val/) into it.
!rm -rf line-chart-captioning/data/*
!mv data/** line-chart-captioning/data/
!mv test_val/** line-chart-captioning/data/

Cloning into 'line-chart-captioning'...
remote: Enumerating objects: 129, done.[K
remote: Counting objects: 100% (129/129), done.[K
remote: Compressing objects: 100% (93/93), done.[K
remote: Total 129 (delta 40), reused 101 (delta 22), pack-reused 0[K
Receiving objects: 100% (129/129), 22.30 KiB | 7.43 MiB/s, done.
Resolving deltas: 100% (40/40), done.


In [None]:
import json

# Question types written to the preprocessing config. Smoothest/roughest and
# MIN_AUC/MAX_AUC seem to always appear -- presumably why they are not listed.
question_types = ["GREATER", "LESS", "INTERSECT"]
synthetic_json = dict(questions=question_types)

# Serialise the config next to the cloned repository's sources.
with open('line-chart-captioning/synthetic.json', 'w') as f:
  f.write(json.dumps(synthetic_json))

In [None]:
# Run the repository's question-type preprocessing script on data/figureqa/train1,
# passing the synthetic.json config and the --unroll flag.
!cd line-chart-captioning && \
python3 src/synthetic/preprocess-question-types.py \
--synthetic-config synthetic.json \
--unroll \
data/figureqa/train1

parsing QA...
parsing annotations...
processing plots...
copying images...
writing csv...


## creating transforms & dataset

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from PIL import Image
from skimage import transform


# adapted from https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
class SyntheticImageDataset(Dataset):
  """Chart images paired with the JSON blobs stored in a captions CSV.

  Every CSV row must unpack into exactly three values, in this order: the
  image number, the description blob and the all-subjects blob. The image for
  a row is read from ``<images_dir>/<image number>.png``.

  Args:
    csv_file: path of the captions CSV, loaded eagerly with pandas.
    images_dir: directory that holds the PNG files.
    transform: optional callable applied to each sample dict.
  """

  def __init__(self, csv_file, images_dir, transform=None):
    self.transform = transform
    self.images_dir = images_dir
    self.charts_captions = pd.read_csv(csv_file)

  def __len__(self):
    # One sample per CSV row.
    return len(self.charts_captions)

  def __getitem__(self, idx):
    row = self.charts_captions.iloc[idx]
    image_number, description_blob, all_subjects_blob = row

    image_file = os.path.join(self.images_dir, str(image_number) + '.png')
    pixels = np.array(Image.open(image_file))

    sample = dict(
        image=pixels,
        description_blob=description_blob,
        all_subjects_blob=all_subjects_blob,
    )

    # Without a (truthy) transform the raw sample is returned untouched.
    if not self.transform:
      return sample
    return self.transform(sample)

In [None]:
class ResizeImage(object):
  """Sample transform that resizes ``sample['image']`` to ``output_size``.

  The resizing itself is delegated to ``transform.resize``; all other keys of
  the sample are passed through unchanged.
  """

  def __init__(self, output_size):
    # Only an int or a tuple is accepted as the target size.
    assert isinstance(output_size, (int, tuple))
    self.output_size = output_size

  def __call__(self, sample):
    updated = dict(sample)
    updated['image'] = transform.resize(sample['image'], self.output_size)
    return updated

class StripImageTransparency(object):
  """Sample transform that keeps only the first three entries of the image's
  last (third) axis, e.g. dropping a fourth transparency channel."""

  def __call__(self, sample):
    updated = dict(sample)
    updated['image'] = sample['image'][:, :, 0:3]
    return updated


class NormalizeImage(object):
  """Sample transform computing ``(image - mean) / std`` on ``sample['image']``.

  ``mean`` and ``std`` are stored as given and combined with the image through
  plain arithmetic operators.
  """

  def __init__(self, mean, std):
    self.mean, self.std = mean, std

  def __call__(self, sample):
    centered = sample['image'] - self.mean
    return {**sample, 'image': centered / self.std}


class ImageToTensor(object):
  """Sample transform that moves the image's last axis to the front
  (axes ``(2, 0, 1)``) and converts it to a float torch tensor."""

  def __call__(self, sample):
    channels_first = sample['image'].transpose((2, 0, 1))
    as_tensor = torch.tensor(channels_first).float()
    return {**sample, 'image': as_tensor}


class FirstQuestionTypeToOneHotTensor(object):
  """Sample transform adding ``'first_description_type'``: a float one-hot
  tensor over the module-level ``question_types`` list.

  The type is read from element 0 of the first entry of the JSON-decoded
  ``sample['description_blob']``.
  """

  def __call__(self, sample):
    first_description = json.loads(sample['description_blob'])[0]
    type_index = question_types.index(first_description[0])

    one_hot = [
        1 if position == type_index else 0
        for position in range(len(question_types))
    ]

    return {**sample, 'first_description_type': torch.tensor(one_hot).float()}


class IncludeSubjectLengths(object):
  """Sample transform adding ``'subject_lengths'``: the number of entries in
  the JSON-decoded ``sample['all_subjects_blob']``."""

  def __call__(self, sample):
    subjects = json.loads(sample['all_subjects_blob'])
    return {**sample, 'subject_lengths': len(subjects)}


class FirstQuestionSubjectsToOneHotTensor(object):
  """Sample transform encoding the two subjects of the first description.

  Elements 1 and 2 of the first description are looked up in the JSON-decoded
  ``all_subjects_blob``. For each, the sample gains the list position and a
  long one-hot tensor of length ``max_len`` marking that position.
  """

  def __init__(self, max_len):
    self.max_len = max_len

  def _one_hot(self, position):
    # Plain Python list of zeros with a single 1 at `position`.
    encoded = [0] * self.max_len
    encoded[position] = 1
    return encoded

  def __call__(self, sample):
    descriptions = json.loads(sample['description_blob'])
    all_subjects = json.loads(sample['all_subjects_blob'])

    _, first_subject, second_subject = descriptions[0][:3]

    first_index = all_subjects.index(first_subject)
    first_onehot = self._one_hot(first_index)

    second_index = all_subjects.index(second_subject)
    second_onehot = self._one_hot(second_index)

    enriched = dict(sample)
    enriched['first_description_subject_onehot'] = torch.tensor(first_onehot).long()
    enriched['first_description_subject_index'] = first_index
    enriched['second_description_subject_onehot'] = torch.tensor(second_onehot).long()
    enriched['second_description_subject_index'] = second_index
    return enriched



In [None]:
import torch

def get_dataset(images_dir = None, csv_file = None):
  """Build the SyntheticImageDataset together with its transform pipeline.

  The captions CSV is read once up front to find the largest number of
  subjects in any row; that value sizes the subject one-hot vectors and has to
  be known before the transforms (and therefore the dataset) are constructed.

  Args:
    images_dir: directory holding the chart PNG files.
    csv_file: path of the captions CSV.

  Returns:
    Tuple ``(dataset, max_subjects_len)``.
  """
  captions_csv = pd.read_csv(csv_file)
  subject_counts = captions_csv['all_subjects_blob'].apply(lambda blob: len(json.loads(blob)))
  max_subjects_len = subject_counts.max()

  # Image transforms run first, then the label/subject encodings.
  pipeline = transforms.Compose([
      ResizeImage((224, 224)),
      StripImageTransparency(),
      NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
      ImageToTensor(),
      FirstQuestionTypeToOneHotTensor(),
      FirstQuestionSubjectsToOneHotTensor(max_subjects_len),
      IncludeSubjectLengths(),
  ])

  dataset = SyntheticImageDataset(csv_file=csv_file,
                                  images_dir=images_dir,
                                  transform=pipeline)

  return dataset, max_subjects_len


# Training dataset built from the preprocessed train1-types images and captions.
train_dataset, max_subjects_len = get_dataset(
    csv_file='line-chart-captioning/data/processed_synthetic/train1-types/captions.csv',
    images_dir='line-chart-captioning/data/processed_synthetic/train1-types/images',
)

batch_size = 32

# Shuffled batches, a single worker process, pinned host memory.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
)


## encoder and elements model

In [None]:
import torchvision
from torch import nn

class Encoder(nn.Module):
  """Frozen ResNet-50 feature extractor followed by adaptive average pooling.

  Args:
    encoder_shape: output size handed to ``nn.AdaptiveAvgPool3d``. The
      flattened feature length returned by ``forward`` is the product of its
      entries (e.g. ``(2048, 3, 3)`` -> 18432 features per image).
  """

  def __init__(self, encoder_shape):
    super(Encoder, self).__init__()

    base_resnet = torchvision.models.resnet50(pretrained=True)
    # Keep every child module except the last two (presumably the final
    # pooling and fully connected layers of torchvision's ResNet).
    resnet_without_fc = nn.Sequential(*(list(base_resnet.children())[:-2]))

    # Freeze the ResNet weights so no optimizer updates them.
    for parameter in resnet_without_fc.parameters():
      parameter.requires_grad = False

    self.resnet = resnet_without_fc
    self.pool = nn.AdaptiveAvgPool3d(encoder_shape)

  def forward(self, x):
    """Return one flat feature row per element of the leading dimension of ``x``."""
    x = self.resnet(x)
    x = self.pool(x)
    # Flatten everything after the leading dimension. This used to be a
    # reshape to the hard-coded width 2048*3*3, which only matched the
    # encoder_shape (2048, 3, 3); flattening gives the same result for that
    # shape and stays correct for any other encoder_shape.
    return torch.flatten(x, start_dim=1)

class SubjectsModel(nn.Module):
  """Predicts one subject of a description from image features, the question
  type and the other subject.

  The three inputs are each projected to ``projection_dim`` features,
  concatenated, passed through five ReLU hidden layers and mapped to
  ``subject_max_len`` scores, to which a softmax over dim 1 is applied.

  NOTE(review): ``forward`` returns softmax probabilities, and the training
  code in this notebook feeds them to ``nn.CrossEntropyLoss``. Per the PyTorch
  docs that loss expects unnormalized logits -- confirm this is intended.

  Args:
    encoder_shape: shape of the encoder output; its product is the size of the
      flattened encoder input.
    category_count: length of the question-type one-hot vector.
    subject_max_len: length of the subject one-hot vector and of the output.
    cut_off: when true, ``forward`` zeroes predictions beyond each sample's
      subject count and at the other subject's index.
    projection_dim: width of each input projection (hidden width is three
      times this). Defaults to 1024, the previously hard-coded value.
  """

  def __init__(self, encoder_shape,
               category_count,
               subject_max_len,
               cut_off=True,
               projection_dim=1024):
    super(SubjectsModel, self).__init__()

    # Flattened encoder feature length = product of the encoder shape.
    encoder_dim = 1
    for dim in encoder_shape:
      encoder_dim *= dim

    self.question_type_dense = nn.Linear(category_count, projection_dim)
    self.other_subject_dense = nn.Linear(subject_max_len, projection_dim)
    self.encoder_dense = nn.Linear(encoder_dim, projection_dim)

    # The three projections are concatenated along the feature dimension.
    hidden_dim = projection_dim * 3

    self.hidden1 = nn.Linear(hidden_dim, hidden_dim)
    self.hidden2 = nn.Linear(hidden_dim, hidden_dim)
    self.hidden3 = nn.Linear(hidden_dim, hidden_dim)
    self.hidden4 = nn.Linear(hidden_dim, hidden_dim)
    self.hidden5 = nn.Linear(hidden_dim, hidden_dim)

    self.output = nn.Linear(hidden_dim, subject_max_len)

    self.relu = nn.ReLU()

    self.cut_off = cut_off
    # Explicit dim: the implicit choice is deprecated; for the 2-D input used
    # here it resolved to dim 1, so results are unchanged.
    self.softmax = nn.Softmax(dim=1)
    self.init_weights()

  def init_weights(self):
    """Xavier-initialise the input projections and hidden layers.

    ``self.output`` is not in the list and keeps PyTorch's default init.
    """
    for m in [self.question_type_dense, self.other_subject_dense, self.encoder_dense,
              self.hidden1, self.hidden2, self.hidden3, self.hidden4, self.hidden5]:
      # In-place variant; the name without the trailing underscore is deprecated.
      torch.nn.init.xavier_uniform_(m.weight)
      m.bias.data.fill_(0.01)

  def forward(self, encoder_out, question_type_onehot, other_subject_onehot,
              other_subject_indices=None, subject_lengths=None):
    """Return a ``(batch, subject_max_len)`` tensor of (optionally masked) probabilities.

    Args:
      encoder_out: flattened encoder features, one row per sample.
      question_type_onehot: float one-hot question types.
      other_subject_onehot: float one-hot of the other subject.
      other_subject_indices: per-sample index of the other subject; zeroed in
        the output when ``cut_off`` is set. Omitted/None means none.
      subject_lengths: per-sample number of valid subjects; required (one per
        sample) when ``cut_off`` is set.

    Raises:
      AssertionError: ``cut_off`` is set and ``subject_lengths`` does not have
        one entry per sample.
    """
    # None replaces the former mutable ``[]`` defaults; behaviour when the
    # arguments are omitted is unchanged.
    if other_subject_indices is None:
      other_subject_indices = ()
    if subject_lengths is None:
      subject_lengths = ()

    batch_size = encoder_out.shape[0]

    x_category = self.question_type_dense(question_type_onehot)
    x_subject = self.other_subject_dense(other_subject_onehot)
    x_encoder = self.encoder_dense(encoder_out)

    x = torch.cat((x_category, x_subject, x_encoder), dim=1)

    for hidden in (self.hidden1, self.hidden2, self.hidden3, self.hidden4, self.hidden5):
      x = self.relu(hidden(x))

    x = self.softmax(self.output(x))

    # Mask on a copy so the softmax output itself is not modified in place.
    x_cloned = x.clone()
    # if cut off is enabled, use subject lengths to set any unnecessary predictions to 0
    if self.cut_off:
      assert len(subject_lengths) == batch_size, 'Subject lengths not provided'
      for i, length in enumerate(subject_lengths):
        x_cloned[i, length:] = 0.0

      # reset the other subject prediction to 0 as well
      for i, other_subject_index in enumerate(other_subject_indices):
        x_cloned[i][other_subject_index] = 0.0

    return x_cloned

# training the elements model

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence
from nltk.translate.bleu_score import corpus_bleu
import random
import gc

# Release PyTorch's cached CUDA memory and run Python's garbage collector
# before the models below are created.
torch.cuda.empty_cache()
gc.collect()

def accuracy(prediction_scores, target):
  """Percentage of rows whose highest-scoring column equals the target index.

  Args:
    prediction_scores: 2-D tensor of scores, one row per sample.
    target: 1-D tensor of expected column indices.

  Returns:
    A 0-dim tensor holding the percentage (0-100).
  """
  predicted_indices = prediction_scores.argmax(dim=1)
  hits = (predicted_indices == target).sum()
  sample_count = target.shape[0]
  return (hits / sample_count) * 100.0


def model_step(model,
               optimizer,
               imgs,
               description_type,
               target_subjects_index,
               opposite_subject_onehot,
               opposite_subject_index,
               subject_lengths,
               criterion):
  """Run one forward/backward/optimizer step for a single subject model.

  The model is conditioned on the *opposite* subject: the model that predicts
  the first subject is given the second subject's one-hot and index, and vice
  versa.

  Returns:
    Tuple ``(prediction scores, loss)`` for this batch.
  """
  # The one-hot is cast to float before it is handed to the model.
  opposite_onehot_float = opposite_subject_onehot.float()
  scores = model(imgs,
                 description_type,
                 opposite_onehot_float,
                 opposite_subject_index,
                 subject_lengths=subject_lengths)

  step_loss = criterion(scores, target_subjects_index)

  # Standard update: clear old gradients, backpropagate, apply the step.
  optimizer.zero_grad()
  step_loss.backward()
  optimizer.step()

  return scores, step_loss

def train_epoch(iter_cap=50):
  """Run one pass over ``train_loader``, training both subject models per batch.

  Relies on module-level state: ``train_loader``, ``device``, ``encoder``, the
  two subject models with their optimizers and criteria, and the ``losses`` /
  ``accuracies`` lists, which receive one entry per batch. The global
  ``min_loss`` is updated in place and therefore carries over between calls.

  Args:
    iter_cap: stop the pass early after this many consecutive batches whose
      combined loss was above ``min_loss``.
  """
  global min_loss
  # Local counter: it restarts at 0 on every call, unlike the global min_loss.
  iter_without_improvement = 0

  for i, batch in enumerate(train_loader):
    # Free cached CUDA memory and collect garbage on every batch.
    torch.cuda.empty_cache()
    gc.collect()

    imgs = batch['image'].to(device)
    description_type = batch['first_description_type'].to(device)
    # Not moved to the device; the models only use it to slice their output.
    subject_lengths = batch['subject_lengths']

    first_description_first_subject = batch['first_description_subject_onehot'].to(device)
    first_description_first_subject_index = batch['first_description_subject_index'].to(device)

    first_description_second_subject = batch['second_description_subject_onehot'].to(device)
    first_description_second_subject_index = batch['second_description_subject_index'].to(device)

    # From here on `imgs` holds the encoder's flattened features, not raw images.
    imgs = encoder(imgs)

    # First-subject model: target is the first subject, conditioned on the second.
    first_subject_prediction_scores, loss_first = model_step(first_subject_model,
                       first_subject_model_optimizer,
                       imgs,
                       description_type,
                       target_subjects_index=first_description_first_subject_index,
                       opposite_subject_onehot=first_description_second_subject,
                       opposite_subject_index=first_description_second_subject_index,
                       subject_lengths=subject_lengths,
                       criterion=first_criterion
                       )
    
    # Second-subject model: target is the second subject, conditioned on the first.
    second_subject_prediction_scores, loss_second = model_step(second_subject_model,
                       second_subject_model_optimizer,
                       imgs,
                       description_type,
                       target_subjects_index=first_description_second_subject_index,
                       opposite_subject_onehot=first_description_first_subject,
                       opposite_subject_index=first_description_first_subject_index,
                       subject_lengths=subject_lengths,
                       criterion=second_criterion
                       )
    
    #encoder_optimizer.zero_grad()
    #encoder_optimizer.step()


    # Combined loss of both models, detached and converted to a NumPy value.
    loss_score = (loss_first + loss_second).cpu().detach().numpy()

    # Early-stopping bookkeeping works on single-batch losses, not an epoch average.
    if (loss_score <= min_loss):
      min_loss = loss_score
      iter_without_improvement = 0
    else:
      iter_without_improvement += 1

    accuracy_score_first_subject = accuracy(first_subject_prediction_scores, first_description_first_subject_index)
    accuracy_score_second_subject = accuracy(second_subject_prediction_scores, first_description_second_subject_index)

    # The accuracy values are appended exactly as returned by accuracy().
    accuracies[0].append(accuracy_score_first_subject)
    accuracies[1].append(accuracy_score_second_subject)
    losses.append(loss_score)

    print(f"Batch #{i}/{len(train_loader)}: Loss is {loss_score:.3f}")
    print(f"Accuracy for first subject is {accuracy_score_first_subject:.2f}%, for second subject {accuracy_score_second_subject:.2f}%")

    if (iter_without_improvement >= iter_cap):
      print(f"{iter_cap} iterations without improvement. stopping")
      break



# Pooled feature shape: passed to the Encoder's adaptive pooling layer and used
# by SubjectsModel to size its encoder input layer.
encoder_shape = (2048, 3, 3)
category_count = len(question_types)

# Learning rate shared by both subject-model optimizers.
subjects_model_lr = 6e-5

epochs = 5

# Per-batch history filled by train_epoch: accuracies[0] is for the first
# subject model, accuracies[1] for the second.
accuracies = [[], []]
losses = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the encoder is never switched with .eval() or .train() here, so
# it stays in the module's default mode -- confirm that is what is wanted for
# the frozen ResNet.
encoder = Encoder(encoder_shape=encoder_shape).to(device)
#encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
#                                             lr=encoder_lr)

# Two independent models with identical architecture, one per subject slot,
# each with its own Adam optimizer over its trainable parameters.
first_subject_model = SubjectsModel(encoder_shape, category_count, max_subjects_len).to(device)
first_subject_model_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, first_subject_model.parameters()),
                                             lr=subjects_model_lr)

second_subject_model = SubjectsModel(encoder_shape, category_count, max_subjects_len).to(device)
second_subject_model_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, second_subject_model.parameters()),
                                             lr=subjects_model_lr)

first_criterion = nn.CrossEntropyLoss().to(device)
second_criterion = nn.CrossEntropyLoss().to(device)

# Best combined batch loss seen so far; train_epoch updates it through `global`,
# so it persists across epochs.
min_loss = float('inf')
# NOTE(review): train_epoch declares its own local counter with this name, so
# this module-level value is not read by it.
iter_without_improvement = 0

first_subject_model.train()
second_subject_model.train()

# Each epoch may end early once 300 consecutive batches fail to reach min_loss.
for epoch in range(epochs):
  print(f"Epoch {epoch + 1} of {epochs}")
  train_epoch(iter_cap = 300)

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth


HBox(children=(FloatProgress(value=0.0, max=102502400.0), HTML(value='')))






Epoch 1 of 5




Batch #0/1433: Loss is 3.764
Accuracy for first subject is 43.75%, for second subject 37.50%
Batch #1/1433: Loss is 3.755
Accuracy for first subject is 37.50%, for second subject 31.25%
Batch #2/1433: Loss is 3.758
Accuracy for first subject is 28.12%, for second subject 31.25%
Batch #3/1433: Loss is 3.735
Accuracy for first subject is 31.25%, for second subject 37.50%
Batch #4/1433: Loss is 3.752
Accuracy for first subject is 28.12%, for second subject 31.25%
Batch #5/1433: Loss is 3.861
Accuracy for first subject is 18.75%, for second subject 18.75%
Batch #6/1433: Loss is 3.758
Accuracy for first subject is 37.50%, for second subject 21.88%
Batch #7/1433: Loss is 3.745
Accuracy for first subject is 34.38%, for second subject 34.38%
Batch #8/1433: Loss is 3.765
Accuracy for first subject is 34.38%, for second subject 15.62%
Batch #9/1433: Loss is 3.726
Accuracy for first subject is 34.38%, for second subject 25.00%
Batch #10/1433: Loss is 3.524
Accuracy for first subject is 37.50%, fo

In [None]:
# Optional conversions of the recorded histories, left disabled:
#losses = [loss.item() for loss in losses]
#accuracies = [[accuracy[0].cpu().numpy(), accuracy[1].cpu().numpy()] for accuracy in accuracies]

from matplotlib.pyplot import plot

# Plot the combined loss recorded for every training batch, in recording order.
plt.xlabel('batch', fontsize=12)
plt.ylabel('loss', fontsize=12)

plot(list(range(len(losses))), losses)

## saving logic

In [None]:
import secrets

# Directory on the mounted Drive that receives the checkpoints.
checkpoint_dir = "/content/drive/MyDrive/Kool/Ülikool/3. aasta/Lõputöö"

# Random per-run tag shared by both files so the pair can be matched later.
# 4 bytes (8 hex characters) instead of 1: a single byte allows only 256
# distinct tags, so repeated runs could silently overwrite earlier checkpoints.
identifier_string = secrets.token_hex(nbytes=4)

first_subject_model_name = f'subject_model_first_{identifier_string}_state.pth'
second_subject_model_name = f'subject_model_second_{identifier_string}_state.pth'

# Only the state dicts of the two subject models are saved; the encoder is not.
torch.save(first_subject_model.state_dict(), f"{checkpoint_dir}/{first_subject_model_name}")
torch.save(second_subject_model.state_dict(), f"{checkpoint_dir}/{second_subject_model_name}")