In [None]:
import os
import nltk
import pandas as pd
import numpy as np
import torch

import torch.nn as nn
from datasets import Dataset as ds
from torch.utils.data import Dataset, DataLoader

from transformers import T5Tokenizer
import nbimporter
from custom_t5 import T5ForConditionalGeneration

from rich import box
from rich.table import Column, Table
from rich.console import Console
# record=True keeps a copy of everything printed through this console so that
# T5Trainer can dump it to disk with console.save_text().
console = Console(record=True)

# Table that train() appends a row to each time it reports progress:
# one centred column per reported value.
training_logger = Table(
    *(Column(heading, justify="center") for heading in ("Epoch", "Steps", "Loss")),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

# Notebook-level device used by T5Trainer (a later cell rebinds this name).
device = torch.device("cpu")
# Do not truncate long text columns when displaying DataFrames.
pd.set_option("display.max_colwidth", None)

In [None]:
# Notebook-level t5-small tokenizer; compute_metrics() reads this global.
tokenizer = T5Tokenizer.from_pretrained(
    "t5-small",
    model_max_length=512,
)

## Fetching data files from S3

In [None]:
# Only the first 10,000 rows of the processed text pairs are used below.
df = pd.read_csv("processed-files/text_pairs.csv").head(10000)

In [None]:
# Quick look at the first five rows (rich DataFrame display).
df.head(n=5)

## Splitting the data into train, validation and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out the trailing 15% of rows (no shuffling) as the test set ...
train_df, test_df = train_test_split(df, test_size=0.15, shuffle=False)
# ... then carve a shuffled 15% validation set out of what is left.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same train/validation membership (42 matches model_params["SEED"] below).
train_df, val_df = train_test_split(
    train_df, test_size=0.15, shuffle=True, random_state=42
)
train_df.shape, test_df.shape, val_df.shape

In [None]:
# Wrap each pandas split in a Hugging Face `datasets.Dataset`.
train_dataset, val_dataset, test_dataset = (
    ds.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

In [None]:
class CaptionDataset(Dataset):
    """Torch dataset that tokenizes (description, caption) pairs.

    Every row of ``dataset`` must provide a ``'descriptions'`` entry (model
    input) and a ``'captions'`` entry (generation target).

    Args:
        dataset: Indexable collection of rows supporting ``len()`` and ``[i]``.
        tokenizer: Tokenizer that is called with a list of strings plus the
            ``max_length`` / ``truncation`` / ``padding`` / ``return_tensors``
            keyword arguments and returns ``input_ids`` and ``attention_mask``.
        max_len: Length every input and target is padded / truncated to.
        personality_index: Class index that is set in the ``"personality"``
            one-hot vector returned with every item.
        num_personalities: Width of the ``"personality"`` one-hot vector.
    """

    def __init__(self, dataset, tokenizer, max_len=30, personality_index=35,
                 num_personalities=217):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.personality_index = personality_index
        self.num_personalities = num_personalities

    def __len__(self):
        return len(self.dataset)

    def _encode(self, text):
        """Tokenize one string to a fixed-length batch of size 1."""
        # padding="max_length" already does what the deprecated
        # pad_to_max_length flag did, so only the former is passed.
        return self.tokenizer(
            [text],
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Return the tokenized pair at ``index`` as a dict of tensors.

        ``input_ids``, ``input_mask`` and ``target_ids`` are 1-tuples holding
        a ``(max_len,)`` tensor; ``target_mask`` is a bare ``(max_len,)``
        tensor; ``personality`` is a ``(num_personalities,)`` one-hot tensor.
        """
        row = self.dataset[index]  # fetch the row once, not once per column
        tokenized_inputs = self._encode(row['descriptions'])
        tokenized_targets = self._encode(row['captions'])

        # NOTE: the three 1-tuples below are intentional. train() and
        # validate() read these fields as data[...][0]; returning bare tensors
        # here would make that [0] select a single sample instead.
        input_ids = (tokenized_inputs['input_ids'].squeeze(0),)
        input_mask = (tokenized_inputs['attention_mask'].squeeze(0),)
        target_ids = (tokenized_targets['input_ids'].squeeze(0),)
        target_mask = tokenized_targets['attention_mask'].squeeze(0)

        # One-hot of a single index; avoids building a full
        # (num_personalities x num_personalities) matrix for every item.
        personality = torch.nn.functional.one_hot(
            torch.tensor(self.personality_index),
            num_classes=self.num_personalities,
        )

        return {
            "input_ids": input_ids,
            "input_mask": input_mask,
            "target_ids": target_ids,
            "target_mask": target_mask,
            "personality": personality,
        }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one training epoch over ``loader``.

    Args:
        epoch: Epoch number, used only for the progress table.
        tokenizer: Tokenizer whose ``pad_token_id`` marks padding in targets.
        model: Model called with ``personality``, ``input_ids``,
            ``attention_mask``, ``decoder_input_ids`` and ``labels``; the
            first element of its output is used as the loss.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches with the keys ``target_ids``,
            ``input_ids``, ``input_mask`` and ``personality``.
        optimizer: Optimizer stepped once per batch.

    Every 500 steps a row is appended to the notebook-level
    ``training_logger`` table, which is then printed through ``console``.
    """
    model.train()
    for step, data in enumerate(loader):
        # CaptionDataset returns these three fields as 1-tuples, so each
        # collated entry is a 1-element sequence; [0] unwraps the batch tensor.
        y = data["target_ids"][0].to(device)
        ids = data["input_ids"][0].to(device)
        mask = data["input_mask"][0].to(device)
        label = data["personality"].to(device)

        # Shifted targets: the decoder is fed tokens [0, n-1) and is scored
        # against tokens [1, n).
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        # Padding positions get -100 (the default ignore_index of PyTorch's
        # cross-entropy loss) — assumes custom_t5 keeps that convention.
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100

        outputs = model(
            personality=label,
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
        )
        loss = outputs[0]

        if step % 500 == 0:
            # Log the scalar value, not the tensor repr (which would also
            # include the grad_fn).
            training_logger.add_row(str(epoch), str(step), f"{loss.item():.4f}")
            console.print(training_logger)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


In [None]:
def validate(epoch, tokenizer, model, device, loader):
    """Generate text for every batch in ``loader`` and decode it.

    Args:
        epoch: Unused by the body; kept for the caller's signature.
        tokenizer: Used to decode generated and target token ids.
        model: Model exposing ``eval()`` and ``generate()``.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches with the keys ``target_ids``,
            ``input_ids``, ``input_mask`` and ``personality``.

    Returns:
        ``(predictions, actuals)``: two lists of decoded strings, in loader
        order.
    """
    # Sampling-based beam search settings, identical for every batch.
    generation_kwargs = dict(
        max_length=30,
        num_beams=5,
        temperature=1.8,
        top_k=50,
        top_p=0.95,
        use_cache=True,
        do_sample=True,
        repetition_penalty=2.5,
        early_stopping=True,
    )

    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for batch_idx, data in enumerate(loader):
            # [0] unwraps the 1-element sequences produced by CaptionDataset.
            y = data['target_ids'][0].to(device, dtype=torch.long)
            # NOTE(review): moved to the device but never passed to
            # generate() — confirm whether custom_t5 needs it at inference.
            label = data["personality"].to(device)
            ids = data['input_ids'][0].to(device, dtype=torch.long)
            mask = data['input_mask'][0].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                **generation_kwargs,
            )

            preds = []
            for generated in generated_ids:
                preds.append(tokenizer.decode(generated, skip_special_tokens=True))
            target = []
            for reference in y:
                target.append(tokenizer.decode(reference, skip_special_tokens=True))

            if batch_idx % 100 == 0:
                console.print(f'Completed {batch_idx}')

            predictions += preds
            actuals += target
    return predictions, actuals


In [None]:
def T5Trainer(model_params, output_dir="./outputs/"):
    """Fine-tune the model, save it, then generate on the validation split.

    Reads the notebook-level ``train_dataset``, ``val_dataset``, ``device``
    and ``console``.

    Args:
        model_params: Dict with the keys ``MODEL``, ``TRAIN_BATCH_SIZE``,
            ``VALID_BATCH_SIZE``, ``TRAIN_EPOCHS``, ``VAL_EPOCHS``,
            ``LEARNING_RATE`` and ``SEED``.
        output_dir: Directory that receives ``model_files/``,
            ``predictions.csv`` and ``logs.txt``.
    """
    # Output locations, computed once and reused for saving and reporting.
    model_dir = os.path.join(output_dir, "model_files")
    predictions_path = os.path.join(output_dir, "predictions.csv")
    logs_path = os.path.join(output_dir, "logs.txt")

    # Seed torch and numpy before anything random happens.
    seed = model_params["SEED"]
    torch.manual_seed(seed)
    np.random.seed(seed)

    checkpoint = model_params["MODEL"]
    console.log(f"[Model]: Loading {checkpoint}...\n")

    # Tokenizer and model both come from the checkpoint named in model_params;
    # the model is then moved to the notebook-level device.
    tokenizer = T5Tokenizer.from_pretrained(checkpoint, model_max_length=512)
    model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    model = model.to(device)

    console.log("[Data]: Reading data...\n")

    # Tokenizing datasets over the notebook-level train / validation splits.
    training_set = CaptionDataset(train_dataset, tokenizer)
    val_set = CaptionDataset(val_dataset, tokenizer)

    # Training batches are shuffled; validation batches keep dataset order.
    training_loader = DataLoader(
        training_set,
        batch_size=model_params["TRAIN_BATCH_SIZE"],
        shuffle=True,
        num_workers=0,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=model_params["VALID_BATCH_SIZE"],
        shuffle=False,
        num_workers=0,
    )

    optimizer = torch.optim.AdamW(
        params=model.parameters(),
        lr=model_params["LEARNING_RATE"],
    )

    console.log("[Initiating Fine Tuning]...\n")
    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    console.log("[Saving Model]...\n")
    model.save_pretrained(model_dir)

    # Generate on the validation loader; each pass overwrites predictions.csv.
    console.log("[Initiating Validation]...\n")
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame(
            {"Generated Text": predictions, "Actual Text": actuals}
        )
        final_df.to_csv(predictions_path)

    # The recorded console output is written here, i.e. before the closing
    # messages below are emitted.
    console.save_text(logs_path)

    console.log("[Validation Completed.]\n")
    console.print(f"[Model] Model saved @ {model_dir}\n")
    console.print(
        f"[Validation] Generation on Validation data saved @ {predictions_path}\n"
    )
    console.print(f"[Logs] Logs saved @ {logs_path}\n")


In [None]:
# Hyper-parameters read by T5Trainer.
model_params = dict(
    MODEL="t5-small",       # checkpoint name (alternatives: t5-base / t5-large)
    TRAIN_BATCH_SIZE=8,     # batch size of the training loader
    VALID_BATCH_SIZE=8,     # batch size of the validation loader
    TRAIN_EPOCHS=3,         # passes over the training loader
    VAL_EPOCHS=1,           # passes over the validation loader
    LEARNING_RATE=2e-5,     # AdamW learning rate
    SEED=42,                # seed for torch and numpy
)


In [None]:
# Fine-tune, save and validate; all artifacts are written under ./outputs.
T5Trainer(model_params=model_params, output_dir="outputs")

In [None]:
def compute_metrics(eval_pred):
    """Decode predictions and labels, then score them with ``rouge_metric``.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays; label
            positions equal to -100 are replaced by the pad id before decoding.

    Returns:
        Dict mapping each key of the metric result to its mid F-measure
        (times 100), plus ``"gen_len"`` (mean count of non-pad prediction
        tokens); all values rounded to 4 decimals.

    NOTE(review): uses the notebook-level ``tokenizer`` and a ``rouge_metric``
    object that is not defined anywhere in the visible notebook — confirm it
    exists before calling this.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    def _one_sentence_per_line(text):
        # Rouge expects a newline after each sentence.
        return "\n".join(nltk.sent_tokenize(text.strip()))

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # -100 cannot be decoded, so swap it for the pad id first.
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = list(map(_one_sentence_per_line, decoded_preds))
    decoded_labels = list(map(_one_sentence_per_line, decoded_labels))

    scores = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    result = {}
    for key, value in scores.items():
        result[key] = value.mid.fmeasure * 100

    # Mean number of non-pad tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    rounded = {}
    for name, score in result.items():
        rounded[name] = round(score, 4)
    return rounded

In [None]:
def stylize_text(input_text, tokenizer, model, num_return_sequences):
    """Generate ``num_return_sequences`` sampled outputs for ``input_text``.

    Args:
        input_text: Text (or list of texts) handed to ``tokenizer``.
        tokenizer: Tokenizer used for encoding and for decoding the output.
        model: Model exposing ``generate()``.
        num_return_sequences: Number of sequences requested from ``generate``.

    Returns:
        List of decoded strings with special tokens removed.

    The encoded batch is moved to the notebook-level ``device``.
    """
    encoded = tokenizer(
        input_text,
        truncation=True,
        padding='max_length',
        max_length=40,
        return_tensors="pt",
    )
    batch = encoded.to(device)

    # Beam search with sampling enabled.
    generated = model.generate(
        **batch,
        max_length=25,
        num_beams=5,
        num_return_sequences=num_return_sequences,
        temperature=1.8,
        top_k=50,
        top_p=0.95,
        use_cache=True,
        do_sample=True,
        early_stopping=True,
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

In [None]:
# NOTE: this cell rebinds the notebook-level `device` and `tokenizer` that the
# training cells above also use.
model_name = 't5-base'
# Prefer the GPU, but fall back to the CPU so the cell still runs on machines
# without CUDA instead of failing at .to(device).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = T5Tokenizer.from_pretrained(model_name, model_max_length=512)
# Fine-tuned weights are loaded from the local "YFCC-T5-Base" checkpoint.
stylized_model = T5ForConditionalGeneration.from_pretrained("YFCC-T5-Base").to(device)

In [None]:
import os
from PIL import Image
import random

# Pick a random row that actually exists in the test split. A fixed
# range(20000) can exceed the split's size (15% of at most 10,000 loaded rows)
# and raise an IndexError.
index = random.randrange(len(test_dataset))
# Fetch the row once instead of loading whole columns to read one value each.
example = test_dataset[index]

folder = "images/train_images"
image_name = example["img_name"]

description = example["descriptions"]
print(f"Factual Image Description :- {description} \n")

# Five stylized candidates for the single factual description.
preds = stylize_text([description], tokenizer, stylized_model, 5)

for caption in preds:
    print(f"Stylized Image Captions :- {caption} \n")

# Last expression of the cell: the image is rendered as the cell output.
Image.open(os.path.join(folder, image_name))