# Starting out

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the data archive from Drive into the working directory, extract it
# quietly (-q) and delete the local copy of the archive.
!cp /content/drive/MyDrive/Kool/Ülikool/'3. aasta'/Lõputöö/data.zip .
!unzip -q data.zip
!rm data.zip

# Same three steps for the test/validation archive.
!cp /content/drive/MyDrive/Kool/Ülikool/'3. aasta'/Lõputöö/test_val.zip .
!unzip -q test_val.zip
!rm test_val.zip

In [None]:
# Remove any earlier checkout so the clone below does not fail on an existing directory.
!rm -rf line-chart-captioning/
# Clone the captioning repository.
!git clone https://github.com/snemvalts/line-chart-captioning
# Clean out the repository's data folder and move the extracted raw data into it.
!rm -rf line-chart-captioning/data/*
!mv data/** line-chart-captioning/data/
# Put the validation1 split under data/figureqa/ inside the repository.
!mv test_val/validation1 line-chart-captioning/data/figureqa/

Cloning into 'line-chart-captioning'...
remote: Enumerating objects: 129, done.[K
remote: Counting objects: 100% (129/129), done.[K
remote: Compressing objects: 100% (93/93), done.[K
remote: Total 129 (delta 40), reused 101 (delta 22), pack-reused 0[K
Receiving objects: 100% (129/129), 22.30 KiB | 111.00 KiB/s, done.
Resolving deltas: 100% (40/40), done.


In [None]:
import json

# Question types written to the synthetic config.
# Smoothest and roughest seem to always appear. Same for MIN_AUC and MAX_AUC
question_types = ["GREATER", "LESS", "INTERSECT"]
synthetic_json = dict(questions=question_types)

# Serialise the config to the file later passed as --synthetic-config.
with open('line-chart-captioning/synthetic.json', 'w') as config_file:
  config_file.write(json.dumps(synthetic_json))

In [None]:
# Run the repository's preprocess-question-types.py on the train1 split,
# configured by the synthetic.json file.
# NOTE(review): "unroll" presumably refers to an option of the preprocessing
# script that is deliberately not passed here - confirm against the script.
#VERY IMPORTANT THAT WE DON'T UNROLL
!cd line-chart-captioning && \
python3 src/synthetic/preprocess-question-types.py \
--synthetic-config synthetic.json \
data/figureqa/train1

parsing QA...
parsing annotations...
processing plots...
copying images...
writing csv...


In [None]:
# Run the repository's preprocess-question-types.py on the validation1 split,
# configured by the synthetic.json file.
# NOTE(review): "unroll" presumably refers to an option of the preprocessing
# script that is deliberately not passed here - confirm against the script.
#VERY IMPORTANT THAT WE DON'T UNROLL
!cd line-chart-captioning && \
python3 src/synthetic/preprocess-question-types.py \
--synthetic-config synthetic.json \
data/figureqa/validation1

parsing QA...
parsing annotations...
processing plots...
copying images...
writing csv...


## Creating transforms & dataset

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from PIL import Image
from skimage import transform


# adapted from https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
class SyntheticImageDataset(Dataset):
  """Dataset pairing chart images with the rows of a captions CSV.

  Every CSV row must unpack into exactly three values, in order: the image
  number (used as the PNG file name), a description blob and an all-subjects
  blob.

  Args:
    csv_file: path or file-like object handed to pandas.read_csv.
    images_dir: directory that holds the '<image number>.png' files.
    transform: optional callable applied to each sample dict before it is
      returned.
  """

  def __init__(self, csv_file, images_dir, transform=None):
    self.images_dir = images_dir
    self.transform = transform
    self.charts_captions = pd.read_csv(csv_file)

  def __len__(self):
    # One sample per CSV row.
    return len(self.charts_captions)

  def __getitem__(self, idx):
    # Unpacking raises if the row does not hold exactly three values.
    image_number, description_blob, all_subjects_blob = self.charts_captions.iloc[idx]

    png_name = str(image_number) + '.png'
    pixels = np.array(Image.open(os.path.join(self.images_dir, png_name)))

    sample = dict(
        image=pixels,
        description_blob=description_blob,
        all_subjects_blob=all_subjects_blob,
    )

    return self.transform(sample) if self.transform else sample

In [None]:
class ResizeImage(object):
  """Transform that resizes sample['image'] to a fixed output size.

  Args:
    output_size: int or tuple forwarded to skimage's transform.resize.
  """

  def __init__(self, output_size):
    # Anything other than an int or a tuple is rejected up front.
    assert isinstance(output_size, (int, tuple))
    self.output_size = output_size

  def __call__(self, sample):
    # Copy the sample so the caller's dict is left untouched.
    resized_sample = dict(sample)
    resized_sample['image'] = transform.resize(sample['image'], self.output_size)
    return resized_sample

class StripImageTransparency(object):
  """Transform that keeps only the first three channels of sample['image'].

  The image is indexed as a three-axis array; any entries past index 2 on the
  last axis (e.g. an alpha channel) are dropped.
  """

  def __call__(self, sample):
    first_three_channels = sample['image'][:, :, :3]
    # Copy the sample so the caller's dict is left untouched.
    stripped_sample = dict(sample)
    stripped_sample['image'] = first_three_channels
    return stripped_sample


class NormalizeImage(object):
  """Transform that standardises sample['image'] as (image - mean) / std.

  Args:
    mean: value(s) subtracted from the image.
    std: value(s) the centred image is divided by.
  """

  def __init__(self, mean, std):
    self.mean, self.std = mean, std

  def __call__(self, sample):
    centred = sample['image'] - self.mean
    return {**sample, 'image': centred / self.std}


class ImageToTensor(object):
  """Transform that turns sample['image'] into a float torch tensor.

  The last axis of the array is moved to the front before conversion
  (axis order 2, 0, 1).
  """

  def __call__(self, sample):
    last_axis_first = sample['image'].transpose(2, 0, 1)
    as_tensor = torch.tensor(last_axis_first).float()
    return {**sample, 'image': as_tensor}


class QuestionTypesToOneHotTensor(object):
  """Transform that encodes the question types present in a sample.

  sample['description_blob'] is parsed as JSON and the first element of every
  parsed entry is taken as that entry's question type. The result is stored
  under 'question_types' as a float tensor with one slot per known type,
  1.0 where the type is present and 0.0 elsewhere (several slots may be set).

  Args:
    known_types: optional ordered list of type names that defines the slot
      order. When omitted, the module-level `question_types` list is looked
      up at call time, which is the original behaviour.

  Raises:
    ValueError: if a description uses a type that is not a known type.
  """

  def __init__(self, known_types=None):
    self.known_types = known_types

  def __call__(self, sample):
    # Resolve the global lazily so redefining `question_types` later still
    # takes effect, exactly as before.
    known_types = question_types if self.known_types is None else self.known_types

    descriptions = json.loads(sample['description_blob'])
    all_description_types = [0] * len(known_types)

    for description in descriptions:
      # list.index raises ValueError for an unknown type instead of silently
      # dropping it.
      all_description_types[known_types.index(description[0])] = 1

    return {**sample, 'question_types': torch.tensor(all_description_types).float()}

#TODO: how do we encapsulate subjects?

In [None]:
import torch

def get_dataset(images_dir = None, csv_file = None):
  """Build a SyntheticImageDataset with the notebook's preprocessing pipeline.

  The pipeline, in order: resize to 224x224, keep the first three channels,
  normalise with per-channel mean/std, convert the image to a tensor and add
  the question-type target tensor.

  Args:
    images_dir: directory with the chart images.
    csv_file: captions CSV that belongs to those images.

  Returns:
    The configured SyntheticImageDataset.
  """
  preprocessing_steps = [
      ResizeImage((224, 224)),
      StripImageTransparency(),
      NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
      ImageToTensor(),
      QuestionTypesToOneHotTensor(),
  ]

  return SyntheticImageDataset(
      csv_file = csv_file,
      images_dir = images_dir,
      transform = transforms.Compose(preprocessing_steps),
  )


# Datasets built from the preprocessed train1 / validation1 outputs.
train_dataset = get_dataset(
    csv_file = 'line-chart-captioning/data/processed_synthetic/train1-types/captions.csv',
    images_dir = 'line-chart-captioning/data/processed_synthetic/train1-types/images',
)

validation_dataset = get_dataset(
    csv_file = 'line-chart-captioning/data/processed_synthetic/validation1-types/captions.csv',
    images_dir = 'line-chart-captioning/data/processed_synthetic/validation1-types/images',
)

batch_size=64

# Both loaders shuffle, use a single worker process and pin host memory.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
)

validation_loader = DataLoader(
    validation_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
)

## Encoder and category model

In [None]:
import torchvision
from torch import nn

class Encoder(nn.Module):
  """Frozen pretrained ResNet-50 feature extractor with adaptive pooling.

  Args:
    encoder_shape: output size handed to nn.AdaptiveAvgPool3d. forward()
      returns one flat vector of prod(encoder_shape) values per input image.
  """

  def __init__(self, encoder_shape):
    super(Encoder, self).__init__()

    base_resnet = torchvision.models.resnet50(pretrained=True)
    # Keep everything except the last two child modules of the ResNet.
    resnet_without_fc = nn.Sequential(*(list(base_resnet.children())[:-2]))

    # freeze weights of resnet
    for parameter in resnet_without_fc.parameters():
      parameter.requires_grad = False

    self.resnet = resnet_without_fc
    self.pool = nn.AdaptiveAvgPool3d(encoder_shape)

    # requires_grad=False does not stop BatchNorm layers from normalising with
    # batch statistics and updating their running averages; eval mode does.
    self.resnet.eval()

  def train(self, mode=True):
    """Set the training mode, but always keep the frozen ResNet in eval mode."""
    super(Encoder, self).train(mode)
    self.resnet.eval()
    return self

  def forward(self, x):
    x = self.resnet(x)
    # No squeeze() here: it would drop the batch axis for a batch of one and
    # break the pooling and reshape below.
    # The 4-D feature map goes into the 3-D pool on purpose, so the channel
    # axis is pooled to encoder_shape[0] as well.
    x = self.pool(x)
    # Flatten per image from the actual shape instead of a hard-coded
    # 2048*3*3, so the result follows encoder_shape.
    x = torch.reshape(x, (x.shape[0], -1))
    return x

class CategoryModel(nn.Module):
  """Fully connected classifier from flat encoder features to category scores.

  Six ReLU-activated hidden layers of width hidden_dim are followed by a linear
  output layer and a softmax, so the scores along the last axis are
  non-negative and sum to one.

  Args:
    encoder_shape: shape of the encoder output for one image; the product of
      its entries is the input width of the first layer.
    hidden_dim: width of every hidden layer.
    category_count: number of output categories.
    dropout_p: kept for interface compatibility; not used by the model.
  """

  def __init__(self, encoder_shape,
               hidden_dim,
               category_count,
               dropout_p=0.5):
    super(CategoryModel, self).__init__()

    encoder_dim = 1
    for dim in encoder_shape:
      encoder_dim *= dim

    # Attribute names are part of the state_dict keys of saved checkpoints,
    # so the layers stay individually named.
    self.hidden1 = nn.Linear(encoder_dim, hidden_dim)
    self.hidden2 = nn.Linear(hidden_dim, hidden_dim)
    self.hidden3 = nn.Linear(hidden_dim, hidden_dim)
    self.hidden4 = nn.Linear(hidden_dim, hidden_dim)
    self.hidden5 = nn.Linear(hidden_dim, hidden_dim)
    self.hidden6 = nn.Linear(hidden_dim, hidden_dim)

    self.output = nn.Linear(hidden_dim, category_count)

    self.relu = nn.ReLU()

    # Explicit dim: the implicit choice is deprecated. For the 2-D
    # (batch, category) input used here, -1 is the axis it used to pick.
    self.softmax = nn.Softmax(dim=-1)
    self.init_weights()

  def _hidden_layers(self):
    """Return the hidden layers in the order they are applied."""
    return (self.hidden1, self.hidden2, self.hidden3,
            self.hidden4, self.hidden5, self.hidden6)

  def init_weights(self):
    """Xavier-initialise the hidden weights and set their biases to 0.01."""
    for m in self._hidden_layers():
      # xavier_uniform_ is the in-place replacement for the deprecated
      # xavier_uniform.
      torch.nn.init.xavier_uniform_(m.weight)
      m.bias.data.fill_(0.01)

  def forward(self, encoder_out):
    x = encoder_out
    for hidden in self._hidden_layers():
      x = self.relu(hidden(x))

    x = self.output(x)
    x = self.softmax(x)
    return x

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence
from nltk.translate.bleu_score import corpus_bleu
import random

def accuracy(prediction_scores, target, pred_threshold=0.33):
  """Percentage of individual label slots predicted correctly.

  A slot counts as predicted when its score is strictly greater than
  pred_threshold; it is correct when that 0/1 prediction equals the target.

  Args:
    prediction_scores: 2-D tensor of scores.
    target: 2-D tensor of 0/1 labels with the same shape.
    pred_threshold: score above which a label is predicted as present.

  Returns:
    A scalar tensor holding the percentage (0-100) of matching slots.
  """
  predicted_labels = (prediction_scores > pred_threshold).int()
  matching_slots = (predicted_labels == target).sum()
  rows, columns = target.shape[0], target.shape[1]
  return (matching_slots / (rows * columns)) * 100.0


#Maybe needed
#def calculate_loss(prediction_scores, targets, criterion, pred_threshold=0.5):
#  losses = []
#  for i, prediction in enumerate(prediction_scores):
#    true_prediction_indices = ((prediction >= pred_threshold).nonzero(as_tuple=True))
#    for index in true_prediction_indices:
#      losses.append(criterion(prediction[index], targets[i]))
  
#  return sum(losses)


def train_epoch(iter_cap=50):
  """Run one pass over train_loader, updating only the category model.

  Uses module-level state: train_loader, validation_loader, encoder,
  category_model, category_model_optimizer, criterion, device, pred_threshold
  and the global min_loss, which tracks the lowest batch loss seen so far.

  Args:
    iter_cap: stop the epoch after this many consecutive batches whose loss
      did not reach or beat min_loss.
  """
  global min_loss

  batches_without_improvement = 0
  batch_total = len(train_loader)

  for batch_index, batch in enumerate(train_loader):
    target_types = batch['question_types'].to(device)
    encoded_images = encoder(batch['image'].to(device))
    prediction_scores = category_model(encoded_images)

    # Calculate loss
    loss = criterion(prediction_scores, target_types)

    # Only the category model has an optimizer; the encoder is not stepped.
    category_model_optimizer.zero_grad()
    loss.backward()
    category_model_optimizer.step()

    loss_score = loss.detach().cpu().numpy()
    improved = loss_score <= min_loss
    if improved:
      min_loss = loss_score
    batches_without_improvement = 0 if improved else batches_without_improvement + 1

    accuracy_score = accuracy(prediction_scores, target_types, pred_threshold=pred_threshold)

    print(f"Batch #{batch_index}/{batch_total}: Loss is {loss_score:.3f}, Accuracy is {accuracy_score:.1f}%")

    # Every 15th batch, check accuracy on one batch drawn from the validation loader.
    if batch_index % 15 == 0:
      validate(next(iter(validation_loader)))

    if batches_without_improvement >= iter_cap:
      print(f"{iter_cap} iterations without improvement. stopping")
      break


def validate(batch):
  """Print the accuracy of the current models on one batch, without gradients.

  Uses the module-level encoder, category_model, device and pred_threshold.

  Args:
    batch: dict with an 'image' tensor and a 'question_types' target tensor.
  """
  # NOTE(review): these two globals are declared but never read or assigned
  # in this function.
  global total_count
  global predicted_present_count

  with torch.no_grad():
    target_types = batch['question_types'].to(device)
    encoded_images = encoder(batch['image'].to(device))

    category_prediction_scores = category_model(encoded_images)
    accuracy_score = accuracy(category_prediction_scores, target_types, pred_threshold=pred_threshold)
    print(f"Validation accuracy: {accuracy_score:.1f}%")


# Model configuration.
encoder_shape = (2048, 3, 3)
hidden_dim = 2048
category_count = len(question_types)
# A category counts as predicted when its score exceeds the uniform share.
pred_threshold = 1/category_count

# Learning rates; encoder_lr is only needed by the commented-out encoder optimizer.
category_model_lr = 3e-5
encoder_lr = 3e-4

epochs = 5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(encoder_shape=encoder_shape).to(device)
#encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
#                                             lr=encoder_lr)


category_model = CategoryModel(encoder_shape, hidden_dim, category_count).to(device)
category_model_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, category_model.parameters()),
                                             lr=category_model_lr)

# CategoryModel.forward already ends in a softmax, so its outputs are
# probabilities, not logits. BCEWithLogitsLoss would apply a sigmoid on top of
# them, squeezing every prediction into roughly [0.5, 0.73] and leaving the
# loss almost flat. BCELoss takes probabilities directly.
criterion = nn.BCELoss().to(device)

# Lowest batch loss seen so far; updated by train_epoch via `global min_loss`.
min_loss = float('inf')
iter_without_improvement = 0

category_model.train()

for epoch in range(epochs):
  print(f"Epoch {epoch + 1} of {epochs}")
  train_epoch(iter_cap = 300)



Epoch 1 of 5




Batch #0/717: Loss is 0.762, Accuracy is 48.4%
Validation accuracy: 46.9%
Batch #1/717: Loss is 0.761, Accuracy is 53.1%
Batch #2/717: Loss is 0.765, Accuracy is 46.4%
Batch #3/717: Loss is 0.762, Accuracy is 47.9%
Batch #4/717: Loss is 0.765, Accuracy is 45.8%
Batch #5/717: Loss is 0.764, Accuracy is 44.8%
Batch #6/717: Loss is 0.765, Accuracy is 42.7%
Batch #7/717: Loss is 0.756, Accuracy is 47.9%
Batch #8/717: Loss is 0.768, Accuracy is 47.4%
Batch #9/717: Loss is 0.762, Accuracy is 52.6%
Batch #10/717: Loss is 0.758, Accuracy is 55.7%
Batch #11/717: Loss is 0.761, Accuracy is 53.1%
Batch #12/717: Loss is 0.764, Accuracy is 50.5%
Batch #13/717: Loss is 0.764, Accuracy is 46.9%
Batch #14/717: Loss is 0.761, Accuracy is 52.6%
Batch #15/717: Loss is 0.763, Accuracy is 49.0%
Validation accuracy: 45.3%
Batch #16/717: Loss is 0.760, Accuracy is 46.4%
Batch #17/717: Loss is 0.763, Accuracy is 48.4%
Batch #18/717: Loss is 0.756, Accuracy is 50.0%
Batch #19/717: Loss is 0.759, Accuracy is 46

KeyboardInterrupt: ignored

## Saving logic

In [None]:
import secrets
# Random one-byte (two hex digit) tag that tells saved runs apart.
identifier_string = secrets.token_hex(nbytes=1)

category_model_name = 'category_model_{}_state.pth'.format(identifier_string)
category_encoder_name = 'category_encoder_{}_state.pth'.format(identifier_string)

# Only the category model's weights are written; category_encoder_name is not used below.
torch.save(
    category_model.state_dict(),
    f"/content/drive/MyDrive/Kool/Ülikool/3. aasta/Lõputöö/{category_model_name}",
)