In [None]:
import sys

IN_COLAB = 'google.colab' in sys.modules
RUN_TRAINING_CELLS = IN_COLAB

EXPERIMENT_NAME = 'SQuAD-Multitask-QuestionAnswer-Generation/'
DRIVE_FOLDER_LOCATION = '/content/drive/My Drive/Colab Notebooks/My Project Folder/' + EXPERIMENT_NAME

In [None]:
# Mounting google drive
if IN_COLAB:
    from google.colab import drive

    drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Environment setup
Setting up Google drive as working directory and installing packages.

In [None]:
# Using my own Google Drive during the experiment to save all checkpoints and training logs.

if IN_COLAB:
    # Adapted from:  https://robertbrucecarter.com/writing/2020/06/setting-your-working-directory-to-google-drive-in-a-colab-notebook/
    import os

    def create_and_set_working_directory(path: str):
        """Ensure ``path`` exists and make it the current working directory.

        Args:
            path: Folder to use as the working directory. It is created
                (together with any missing parent folders) when absent.

        Raises:
            OSError: If the folder cannot be created or entered.
        """
        if not os.path.isdir(path):
            # makedirs (unlike mkdir) also creates missing intermediate folders,
            # so a fresh Drive without the parent project folder still works.
            os.makedirs(path, exist_ok=True)
            print(path + ' did not exist but was created.')

        # All relative paths used afterwards (CSV files, checkpoints, logs)
        # resolve against this folder.
        os.chdir(path)

        print('Working directory changed to: \n' + path)

    create_and_set_working_directory(DRIVE_FOLDER_LOCATION)
    !pwd

Working directory changed to: 
/content/drive/My Drive/Colab Notebooks/My Project Folder/SQuAD-Multitask-QuestionAnswer-Generation/
/content/drive/My Drive/Colab Notebooks/My Project Folder/SQuAD-Multitask-QuestionAnswer-Generation


In [None]:
# Install packages
if IN_COLAB:
    !pip install --quiet transformers==4.3.0
    !pip install --quiet pytorch-lightning==1.2.10
    !pip install --quiet tokenizers==0.10.3

[K     |████████████████████████████████| 1.8 MB 7.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 44.3 MB/s 
[K     |████████████████████████████████| 880 kB 68.8 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 841 kB 8.3 MB/s 
[K     |████████████████████████████████| 176 kB 70.9 MB/s 
[K     |████████████████████████████████| 829 kB 51.7 MB/s 
[K     |████████████████████████████████| 140 kB 70.4 MB/s 
[K     |████████████████████████████████| 596 kB 72.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 58.0 MB/s 
[K     |████████████████████████████████| 271 kB 76.8 MB/s 
[K     |████████████████████████████████| 144 kB 72.2 MB/s 
[K     |████████████████████████████████| 94 kB 4.4 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone


In [None]:
!pip install torchtext==0.10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 5.7 MB/s 
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.7 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.12.0+cu113
    Uninstalling torch-1.12.0+cu113:
      Successfully uninstalled torch-1.12.0+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.0
    Uninstalling torchtext-0.13.0:
      Successfully uninstalled torchtext-0.13.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.0+

In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 7.6 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [None]:
# Import packages
from typing import List, Dict
import tqdm.notebook as tq
from tqdm.notebook import tqdm
import json
import pandas as pd
import numpy as np

import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
    )
from transformers import AutoModelWithLMHead, AutoTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
pl.seed_everything(42)

Global seed set to 42


42

### Squad
Reading a version of the SQuAD dataset where there is one row for each question in the dataset.

In [None]:
# Download squad files
if IN_COLAB:
    !gdown --id 1bJylzAN7ocPTXp_ow-nLE4-hej6c68Vy #train_df.csv
    !gdown --id 1hNJMOTVVKB--btCf3BLPc3frkcppw6fB #dev_df.csv

Downloading...
From: https://drive.google.com/uc?id=1bJylzAN7ocPTXp_ow-nLE4-hej6c68Vy
To: /content/drive/MyDrive/Colab Notebooks/My Project Folder/T5-base-QuestionAnswer-Generation/train_df.csv
100% 90.0M/90.0M [00:00<00:00, 137MB/s]
Downloading...
From: https://drive.google.com/uc?id=1hNJMOTVVKB--btCf3BLPc3frkcppw6fB
To: /content/drive/MyDrive/Colab Notebooks/My Project Folder/T5-base-QuestionAnswer-Generation/dev_df.csv
100% 11.2M/11.2M [00:00<00:00, 211MB/s]


In [None]:
squad_train_df = pd.read_csv('train_df.csv')
print(squad_train_df.shape)

squad_train_df.head()

(87599, 6)


Unnamed: 0,question,context_para,context_sent,answer_text,answer_start,answer_end
0,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...","It is a replica of the grotto at Lourdes, Fran...",Saint Bernadette Soubirous,515,541
1,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",Immediately in front of the Main Building and ...,a copper statue of Christ,188,213
2,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",Next to the Main Building is the Basilica of t...,the Main Building,279,296
3,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...","Immediately behind the basilica is the Grotto,...",a Marian place of prayer and reflection,381,420
4,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",Atop the Main Building's gold dome is a golden...,a golden statue of the Virgin Mary,92,126


In [None]:
squad_dev_df = pd.read_csv('dev_df.csv')
print(squad_dev_df.shape)

squad_dev_df.head()

(10570, 6)


Unnamed: 0,question,context_para,context_sent,answer_text,answer_start,answer_end
0,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,The American Football Conference (AFC) champio...,Denver Broncos,177,191
1,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,The American Football Conference (AFC) champio...,Carolina Panthers,249,266
2,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,"The game was played on February 7, 2016, at Le...","Santa Clara, California",403,426
3,Which NFL team won Super Bowl 50?,Super Bowl 50 was an American football game to...,The American Football Conference (AFC) champio...,Denver Broncos,177,191
4,What color was used to emphasize the 50th anni...,Super Bowl 50 was an American football game to...,"As this was the 50th Super Bowl, the league em...",gold,488,492


In [None]:
train_df = pd.read_csv('train.csv',encoding='cp1252')
print(train_df.shape)

train_df.head()

(12, 3)


Unnamed: 0,question,context,answer_text
0,What is pulpitis?,"Pulpitis is the inflammation of the pulp, wher...",Pulpitis is the inflammation of the pulp
1,What is apical periodontitis?,"Pulpitis is the inflammation of the pulp, wher...",apical periodontitis is the inflammation of th...
2,What are the types of inflammation?,"Pulpitis is the inflammation of the pulp, wher...",Inflammation can be acute or chronic
3,What are the symptoms of acute inflammation?,Acute inflammation is characterised by redness...,Symptoms of acute inflammation includes rednes...
4,What is the cause of red and heat in an area o...,Acute inflammation is characterised by redness...,The redness and heat produced in an area of ac...


In [None]:
#Using paragraph
context_name = 'context_para'
drop_context = 'context_sent'

In [None]:
# Number of leading rows of the SQuAD training file held out as the test split;
# every remaining row is used for training.
TEST_SPLIT_SIZE = 11877


def _to_model_frame(squad_df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame holding only the columns the model code reads.

    The selected context column (``context_name``) is renamed to ``context`` and
    the unused context column plus the answer offsets are dropped, leaving
    ``question``, ``context`` and ``answer_text``. The input frame is not modified.
    """
    return (
        squad_df
        .rename(columns={context_name: 'context'})
        .drop(columns=[drop_context, 'answer_start', 'answer_end'])
    )


# Train and test must get the same column layout as dev: the dataset class and
# the evaluation cells index rows by 'context', which the raw frame does not have.
df = _to_model_frame(squad_train_df)
# NOTE(review): earlier notes mention one row with a missing answer_text; rows are
# intentionally not dropped here, so that row is still present -- confirm handling.

test_df = df[:TEST_SPLIT_SIZE]
train_df = df[TEST_SPLIT_SIZE:]

## Dev set
dev_df = _to_model_frame(squad_dev_df)

print(train_df.shape, 'train_df')
print(dev_df.shape, 'dev_df')
print(test_df.shape, 'test_df')

train_df.head()

(75722, 6) train_df
(10570, 3) dev_df
(11877, 6) test_df


Unnamed: 0,question,context_para,context_sent,answer_text,answer_start,answer_end
11877,What is heresy mainly at odds with?,Heresy is any provocative belief or theory tha...,Heresy is any provocative belief or theory tha...,established beliefs or customs,77,107
11878,What is a person called is practicing heresy?,Heresy is any provocative belief or theory tha...,A heretic is a proponent of such claims or bel...,A heretic,109,118
11879,What religions and idea of thought is heresy c...,The term is usually used to refer to violation...,It is used in particular in reference to Chris...,"Christianity, Judaism, Islam and Marxism",199,239
11880,What cultures are listed as examples of discip...,"In certain historical Christian, Islamic and J...","In certain historical Christian, Islamic and J...","Christian, Islamic and Jewish",22,51
11881,What language does the term heresy find its ro...,The term heresy is from Greek αἵρεσις original...,The term heresy is from Greek αἵρεσις original...,Greek,24,29


In [None]:
dev_df.iloc[0]['context']

'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.'

In [None]:
SEP_TOKEN = '<sep>'
MASKING_CHANCE = 0.3 #30% chance to replace the answer with '[MASK]'

In [None]:
MODEL_NAME = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
SOURCE_MAX_TOKEN_LEN = 300
TARGET_MAX_TOKEN_LEN = 100

N_EPOCHS = 5
BATCH_SIZE = 1
LEARNING_RATE = 0.0001

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
class QGDataset(Dataset):
    """Torch dataset turning (answer, context, question) rows into T5 tensors.

    Each item encodes ``"<answer> <sep> <context>"`` as the model input and
    ``"<answer> <sep> <question>"`` as the target. With probability
    ``masking_chance`` the answer in the *input* is replaced by ``'[MASK]'``;
    the target always carries the real answer.

    Args:
        data: Frame with ``answer_text``, ``context`` and ``question`` columns.
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        source_max_token_len: Padded/truncated length of the input encoding.
        target_max_token_len: Padded/truncated length of the target encoding.
        sep_token: Separator placed between the two text parts. ``None`` (default)
            uses the notebook-level ``SEP_TOKEN``.
        masking_chance: Probability of masking the input answer. ``None``
            (default) uses the notebook-level ``MASKING_CHANCE``.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: 'T5Tokenizer',
        source_max_token_len: int,
        target_max_token_len: int,
        sep_token: str = None,
        masking_chance: float = None
        ):

        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len
        self.sep_token = sep_token
        self.masking_chance = masking_chance

    def __len__(self):
        return len(self.data)

    def _encode(self, text: str, max_length: int):
        """Tokenize ``text`` to fixed-length PyTorch tensors using this dataset's tokenizer."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
            )

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]

        # Resolve the fallbacks at access time so later changes to the notebook
        # constants are still picked up when no explicit override was given.
        sep_token = SEP_TOKEN if self.sep_token is None else self.sep_token
        masking_chance = MASKING_CHANCE if self.masking_chance is None else self.masking_chance

        if np.random.rand() > masking_chance:
            answer = data_row['answer_text']
        else:
            answer = '[MASK]'

        # Use the tokenizer handed to this dataset, not whatever global happens
        # to be named `tokenizer` in the notebook.
        source_encoding = self._encode(
            '{} {} {}'.format(answer, sep_token, data_row['context']),
            self.source_max_token_len
            )

        target_encoding = self._encode(
            '{} {} {}'.format(data_row['answer_text'], sep_token, data_row['question']),
            self.target_max_token_len
            )

        # Padding positions are set to -100 so the loss ignores them.
        labels = target_encoding['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            answer_text = data_row['answer_text'],
            context = data_row['context'],
            question = data_row['question'],
            input_ids = source_encoding['input_ids'].flatten(),
            attention_mask = source_encoding['attention_mask'].flatten(),
            labels=labels.flatten()
            )

In [None]:
class QGDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train and validation ``QGDataset``s.

    Args:
        train_df: Rows used for training.
        val_df: Rows used for validation.
        tokenizer: Tokenizer forwarded to every ``QGDataset``.
        batch_size: Batch size of the training loader (validation uses 1).
        source_max_token_len: Input encoding length forwarded to the datasets.
        target_max_token_len: Target encoding length forwarded to the datasets.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        tokenizer: 'T5Tokenizer',
        batch_size,
        source_max_token_len: int,
        target_max_token_len: int
        ):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.val_df = val_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self, stage=None):
        """Build the datasets.

        ``stage`` is accepted (and ignored) because Lightning passes the stage
        name when it invokes this hook itself; calling ``setup()`` manually
        without arguments keeps working.
        """
        self.train_dataset = QGDataset(self.train_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
        self.val_dataset = QGDataset(self.val_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(self.train_dataset, batch_size = self.batch_size, shuffle=True, num_workers = 2)

    def val_dataloader(self):
        """Unshuffled, one-example-per-batch loader over the validation dataset."""
        return DataLoader(self.val_dataset, batch_size=1, num_workers=2)

#### Testing DataModule

In [None]:
# data_module = QGDataModule(train_df, dev_df, test_df, AutoTokenizer, 2, 128, 64)
# data_module.setup()

## Hyperparameters

In [None]:
DF_TAKE_PERCENTAGE = 0.30

TAKE_TRAIN = int(len(train_df) * DF_TAKE_PERCENTAGE)
TAKE_DEV = int(len(dev_df) * DF_TAKE_PERCENTAGE)
TAKE_TEST = int(len(test_df) * DF_TAKE_PERCENTAGE)

print('Taking', DF_TAKE_PERCENTAGE * 100, '%')
print(TAKE_TRAIN, 'of', len(train_df))
print(TAKE_DEV, 'of', len(dev_df))
print(TAKE_TEST, 'of', len(test_df))

Taking 30.0 %
22716 of 75722
3171 of 10570
3563 of 11877


#### Setting DataModule

In [None]:
# print(train_df[:TAKE_TRAIN].shape, dev_df[:TAKE_DEV].shape, test_df[:TAKE_TEST].shape)


print('tokenizer len before: ', len(tokenizer))
tokenizer.add_tokens(SEP_TOKEN)
print('tokenizer len after: ', len(tokenizer))
TOKENIZER_LEN = len(tokenizer)

# data_module = QGDataModule(train_df[:10],train_df[10:], tokenizer, BATCH_SIZE, SOURCE_MAX_TOKEN_LEN, TARGET_MAX_TOKEN_LEN)
# data_module.setup()

tokenizer len before:  32100
tokenizer len after:  32101


#### Setting Model

In [None]:
class QGModel(pl.LightningModule):
    """Lightning wrapper around a pretrained T5 model for question/answer generation.

    The underlying model is loaded from ``MODEL_NAME`` and its embedding matrix
    is resized to ``TOKENIZER_LEN`` to account for tokens added to the tokenizer.
    """

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
        self.model.resize_token_embeddings(TOKENIZER_LEN) #resizing after adding new tokens to the tokenizer

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``."""
        output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return output.loss, output.logits

    def _shared_step(self, batch, metric_name: str):
        """Compute the loss for ``batch`` and log it under ``metric_name``."""
        loss, _ = self(batch['input_ids'], batch['attention_mask'], batch['labels'])
        self.log(metric_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._shared_step(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        # Defined so that running the trainer's test loop on this model is possible.
        return self._shared_step(batch, 'test_loss')

    def configure_optimizers(self):
        """Optimize all parameters with AdamW at ``LEARNING_RATE``."""
        return AdamW(self.parameters(), lr=LEARNING_RATE)

#### Setting trainer

In [None]:
if RUN_TRAINING_CELLS:
    checkpoint_callback = ModelCheckpoint(
        dirpath='checkpoints',
        filename='best-checkpoint',
        save_top_k=-1,
        verbose=True,
        monitor='val_loss',
        mode='min'
    )

In [None]:
if RUN_TRAINING_CELLS:
    trainer = pl.Trainer(
        # Callback objects belong in `callbacks`; passing a ModelCheckpoint via
        # `checkpoint_callback` is deprecated in pytorch-lightning.
        callbacks=[checkpoint_callback],
        max_epochs=N_EPOCHS,
        # Train on the GPU when the runtime has one instead of always forcing CPU.
        gpus=1 if torch.cuda.is_available() else 0,
        progress_bar_refresh_rate=1
    )

GPU available: True, used: False
TPU available: False, using: 0 TPU cores


## Training

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir ./lightning_logs

In [None]:
model = QGModel()
best_model = QGModel.load_from_checkpoint('checkpoints/best-checkpoint-v4.ckpt')

# trainer.fit(model, data_module)

In [None]:
trainer.test()

## Evaluate

### Load model

In [None]:
checkpoint_path = 'checkpoints/best-checkpoint-v5.ckpt'

best_model = QGModel.load_from_checkpoint(checkpoint_path)
best_model.freeze()
best_model.eval()

print()






### Common functions

In [None]:
SEP_TOKEN

'<sep>'

In [None]:
def generate(qgmodel: QGModel, answer: str, context: str) -> str:
    """Generate an ``answer <sep> question`` string for ``context``.

    Args:
        qgmodel: Model wrapper whose ``.model`` attribute provides ``generate``.
        answer: Answer text to condition on (the callers pass ``'[MASK]'`` to let
            the model pick the answer itself).
        context: Passage the question should be about.

    Returns:
        The decoded generated sequence(s), concatenated in generation order.
        Special tokens are kept in the text.
    """
    source_encoding = tokenizer(
        '{} {} {}'.format(answer, SEP_TOKEN, context),
        max_length=SOURCE_MAX_TOKEN_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # Inputs must live on the same device as the model weights.
    device = next(qgmodel.model.parameters()).device

    generated_ids = qgmodel.model.generate(
        input_ids=source_encoding['input_ids'].to(device),
        attention_mask=source_encoding['attention_mask'].to(device),
        num_beams=1,
        max_length=TARGET_MAX_TOKEN_LEN,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True
    )

    # A list (not a set) keeps the generation order and does not silently
    # collapse identical sequences.
    preds = [
        tokenizer.decode(generated_id, skip_special_tokens=False, clean_up_tokenization_spaces=True)
        for generated_id in generated_ids
    ]

    return ''.join(preds)

In [None]:
def show_result(generated: str, answer: str, context:str, original_question: str = ''):
    """Print a generated result next to its reference data.

    Args:
        generated: Text produced by the model.
        answer: Reference answer.
        context: Passage the generation was based on.
        original_question: Reference question; its line is omitted when empty.
    """
    print('Generated: ', generated)
    if original_question:
        print('Original : ', original_question)

    print()
    print('Answer: ', answer)
    print('Context: ', context)
    print('-----------------------------')

### View results manually

In [None]:
sample_question = test_df.iloc[42]

generated = generate(best_model, sample_question['answer_text'], sample_question['context'])
show_result(generated, sample_question['answer_text'], sample_question['context'], sample_question['question'])

Generated:  <pad> 39.1%<sep> What percentage of admissions were accepted under the early action plan?</s>
Original :  What percentage of students at Notre Dame participated in the Early Action program?

Answer:  39.1%
Conext:  Notre Dame is known for its competitive admissions, with the incoming class enrolling in fall 2015 admitting 3,577 from a pool of 18,156 (19.7%). The academic profile of the enrolled class continues to rate among the top 10 to 15 in the nation for national research universities. The university practices a non-restrictive early action policy that allows admitted students to consider admission to Notre Dame as well as any other colleges to which they were accepted. 1,400 of the 3,577 (39.1%) were admitted under the early action plan. Admitted students came from 1,311 high schools and the average student traveled more than 750 miles to Notre Dame, making it arguably the most representative university in the United States. While all entering students begin in the Col

In [None]:
context = 'apical periodontitis is the inflammation of the tissues surrounding the apex of the tooth, including the periodontal ligament and the alveolar bone. Inflammation can be acute or chronic.'
answer = 'apical periodontitis is the inflammation of the tissues surrounding the apex of the tooth'

generated = generate(best_model, answer, context)

show_result(generated, answer, context)

Generated:  <pad> apical periodontitis is the inflammation of the tissues surrounding the opex of the tooth<sep> What is the term for apical periodontiti?</s>

Conext:  apical periodontitis is the inflammation of the tissues surrounding the apex of the tooth, including the periodontal ligament and the alveolar bone. Inflammation can be acute or chronic.
-----------------------------


In [None]:
context = 'Pulpitis is the inflammation of the pulp, whereas apical periodontitis is the inflammation of the tissues surrounding the apex of the tooth, including the periodontal ligament and the alveolar bone. Inflammation can be acute or chronic.'
answer = 'Pulpitis is the inflammation of the pulp'
input_answer = '[MASK]'

generated = generate(best_model, input_answer, context)

show_result(generated, answer, context)

Generated:  <pad> apical periodontitis<sep> What is the inflammation of the tissues surrounding the apex of the tooth?</s>

Conext:  Pulpitis is the inflammation of the pulp, whereas apical periodontitis is the inflammation of the tissues surrounding the apex of the tooth, including the periodontal ligament and the alveolar bone. Inflammation can be acute or chronic.
-----------------------------


In [None]:
context = 'Macrophages are among the main effector cells in chronic inflammation.'
answer = 'Macrophages'
input_answer = '[MASK]'

generated = generate(best_model, input_answer, context)

show_result(generated, answer, context)

Generated:  <pad> Macrophages<sep> What is one of the main effector cells in chronic inflammation?</s>

Conext:  Macrophages are among the main effector cells in chronic inflammation.
-----------------------------


#### Answer-aware question generation

In [None]:
for i in range(len(test_df[:10])):
    context = test_df.iloc[i]['context']
    answer = test_df.iloc[i]['answer_text']

    generated = generate(best_model, answer, context)

    show_result(generated, answer, context, test_df.iloc[i]['question'])

Generated:  <pad> Saint Bernadette Soubirous<sep> Who was the Virgin Mary repute to have appeared in 1858?</s>
Original :  To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?

Answer:  Saint Bernadette Soubirous
Conext:  Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
-----------------------------
Generated:  <pad> a copper statue

#### Generating both answer and question

In [None]:
for i in range(len(test_df[:10])):
    context = test_df.iloc[i]['context']
    original_answer = test_df.iloc[i]['answer_text']
    input_answer = '[MASK]'

    generated = generate(best_model, input_answer, context)

    show_result(generated, original_answer, context, test_df.iloc[i]['question'])

Generated:  <pad> the Grotto<sep> What is the name of the Marian place of prayer and reflection?</s>
Original :  To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?

Answer:  Saint Bernadette Soubirous
Conext:  Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
-----------------------------
Generated:  <pad> the Grotto<sep> What is t

# Loading model for evaluation

In [None]:
MODEL_NAME = 't5-small'
SOURCE_MAX_TOKEN_LEN = 300
TARGET_MAX_TOKEN_LEN = 80
SEP_TOKEN = '<sep>'

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_tokens(SEP_TOKEN)
TOKENIZER_LEN = len(tokenizer)

In [None]:
class QGModel(pl.LightningModule):
    """T5 question/answer generation model packaged as a LightningModule.

    Loads the pretrained ``MODEL_NAME`` weights and resizes the token
    embeddings to ``TOKENIZER_LEN`` so tokens added to the tokenizer are covered.
    """

    def __init__(self):
        super().__init__()
        t5 = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
        self.model = t5
        # The tokenizer gained extra tokens, so the embedding table must match its size.
        self.model.resize_token_embeddings(TOKENIZER_LEN)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the ``(loss, logits)`` pair produced by the wrapped model."""
        result = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return result.loss, result.logits

    def _logged_loss(self, batch, tag):
        """Forward ``batch``, log its loss as ``tag`` and hand the loss back."""
        batch_loss, _logits = self(
            batch['input_ids'],
            batch['attention_mask'],
            batch['labels'],
        )
        self.log(tag, batch_loss, prog_bar=True, logger=True)
        return batch_loss

    def training_step(self, batch, batch_idx):
        return self._logged_loss(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        return self._logged_loss(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        return self._logged_loss(batch, 'test_loss')

    def configure_optimizers(self):
        """Single AdamW optimizer over every parameter, using ``LEARNING_RATE``."""
        optimizer = AdamW(self.parameters(), lr=LEARNING_RATE)
        return optimizer

In [None]:
checkpoint_path = 'checkpoints/best-checkpoint-v9.ckpt'

best_model = QGModel.load_from_checkpoint(checkpoint_path)
best_model.freeze()
best_model.eval()

print()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1197.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242065649.0, style=ProgressStyle(descri…





In [None]:
def generate(qgmodel: QGModel, answer: str, context: str) -> str:
    """Generate an ``answer <sep> question`` string for ``context``.

    Args:
        qgmodel: Model wrapper whose ``.model`` attribute provides ``generate``.
        answer: Answer text to condition on.
        context: Passage the question should be about.

    Returns:
        The decoded generated sequence(s) with special tokens stripped,
        concatenated in generation order.
    """
    source_encoding = tokenizer(
        '{} {} {}'.format(answer, SEP_TOKEN, context),
        max_length=SOURCE_MAX_TOKEN_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # Inputs must live on the same device as the model weights.
    device = next(qgmodel.model.parameters()).device

    generated_ids = qgmodel.model.generate(
        input_ids=source_encoding['input_ids'].to(device),
        attention_mask=source_encoding['attention_mask'].to(device),
        num_beams=1,
        max_length=TARGET_MAX_TOKEN_LEN,
        repetition_penalty=1.0,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True
    )

    # A list (not a set) keeps the generation order and does not silently
    # collapse identical sequences.
    preds = [
        tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for generated_id in generated_ids
    ]

    return ''.join(preds)

In [None]:
def show_result(generated: str, answer: str, context:str, original_question: str = ''):
    """Pretty-print one generation together with the data it is compared against.

    Args:
        generated: Model output to display.
        answer: Expected answer.
        context: Source passage.
        original_question: Expected question; skipped in the printout when empty.
    """
    print('Generated: ', generated)
    if original_question:
        print('Original : ', original_question)

    print()
    print('Answer: ', answer)
    print('Context: ', context)
    print('-----------------------------')

In [None]:
sample_question = test_df.iloc[42]

generated = generate(best_model, sample_question['answer_text'], sample_question['context'])
show_result(generated, sample_question['answer_text'], sample_question['context'], sample_question['question'])

Generated:  39.1%<sep> What percentage of admissions were accepted by the early action plan?
Original :  What percentage of students at Notre Dame participated in the Early Action program?

Answer:  39.1%
Conext:  Notre Dame is known for its competitive admissions, with the incoming class enrolling in fall 2015 admitting 3,577 from a pool of 18,156 (19.7%). The academic profile of the enrolled class continues to rate among the top 10 to 15 in the nation for national research universities. The university practices a non-restrictive early action policy that allows admitted students to consider admission to Notre Dame as well as any other colleges to which they were accepted. 1,400 of the 3,577 (39.1%) were admitted under the early action plan. Admitted students came from 1,311 high schools and the average student traveled more than 750 miles to Notre Dame, making it arguably the most representative university in the United States. While all entering students begin in the College of the F