In [None]:
from utils import load_data, improve_question, postprocess_phrase_spoiler, calculate_bleu

from tqdm import tqdm
import pandas as pd

from transformers import QuestionAnsweringPipeline, AutoTokenizer, AutoModelForQuestionAnswering

# Load and Preprocess Data


In [None]:
# Read both dataset splits with the project's loader, training split first.
train, val = map(load_data, ('../data/train.jsonl', '../data/validation.jsonl'))

In [None]:
# Reshape the training split in one chain:
#   1. flatten the list-valued columns into scalars,
#   2. keep only rows whose first tag is 'phrase' and the columns used later,
#   3. reduce spoilerPositions to a two-element list.
train = (
    train
    .assign(
        # Paragraph list -> one space-separated string.
        targetParagraphs=lambda frame: frame.targetParagraphs.apply(
            lambda paragraphs: ' '.join(paragraphs)
        ),
        # First post text entry, with surrounding whitespace removed.
        postText=lambda frame: frame.postText.apply(lambda posts: posts[0].strip()),
        # Spoiler parts -> one newline-separated string.
        spoiler=lambda frame: frame.spoiler.apply(lambda parts: '\n'.join(parts)),
        # Only the first tag is kept.
        tags=lambda frame: frame.tags.apply(lambda labels: labels[0]),
    )
    .loc[
        lambda frame: frame.tags == 'phrase',
        ['spoiler', 'postText', 'targetParagraphs', 'tags', 'spoilerPositions'],
    ]
    .assign(
        # Take index 1 of the first and second entries of the first position item.
        # presumably these are the start/end offsets of the spoiler — TODO confirm
        spoilerPositions=lambda frame: frame.spoilerPositions.apply(
            lambda position: [position[0][0][1], position[0][1][1]]
        ),
    )
)
print(train.shape)
train.head(3)

In [None]:
# Reshape the validation split in one chain:
#   1. flatten the list-valued columns into scalars,
#   2. keep only rows whose first tag is 'phrase' and the columns used later,
#   3. reduce spoilerPositions to a two-element list.
val = (
    val
    .assign(
        # Paragraph list -> one space-separated string.
        targetParagraphs=lambda frame: frame.targetParagraphs.apply(
            lambda paragraphs: ' '.join(paragraphs)
        ),
        # First post text entry, with surrounding whitespace removed.
        postText=lambda frame: frame.postText.apply(lambda posts: posts[0].strip()),
        # Spoiler parts -> one newline-separated string.
        spoiler=lambda frame: frame.spoiler.apply(lambda parts: '\n'.join(parts)),
        # Only the first tag is kept.
        tags=lambda frame: frame.tags.apply(lambda labels: labels[0]),
    )
    .loc[
        lambda frame: frame.tags == 'phrase',
        ['spoiler', 'postText', 'targetParagraphs', 'tags', 'spoilerPositions'],
    ]
    .assign(
        # Take index 1 of the first and second entries of the first position item.
        # presumably these are the start/end offsets of the spoiler — TODO confirm
        spoilerPositions=lambda frame: frame.spoilerPositions.apply(
            lambda position: [position[0][0][1], position[0][1][1]]
        ),
    )
)
print(val.shape)
val.head(3)

In [None]:
# One record per training row: the spoiler text plus the two stored positions,
# emitted as a list of dicts with keys text / answer_start / answer_end.
train_answers = pd.DataFrame(
    {
        'text': train['spoiler'],
        'answer_start': train.spoilerPositions.apply(lambda span: span[0]),
        'answer_end': train.spoilerPositions.apply(lambda span: span[1]),
    }
).to_dict(orient='records')

In [None]:
# One record per validation row: the spoiler text plus the two stored positions,
# emitted as a list of dicts with keys text / answer_start / answer_end.
val_answers = pd.DataFrame(
    {
        'text': val['spoiler'],
        'answer_start': val.spoilerPositions.apply(lambda span: span[0]),
        'answer_end': val.spoilerPositions.apply(lambda span: span[1]),
    }
).to_dict(orient='records')

In [None]:
# Plain Python lists for each split: contexts come from the joined paragraphs,
# questions from the post text.
train_contexts = train['targetParagraphs'].tolist()
train_questions = train['postText'].tolist()
val_contexts = val['targetParagraphs'].tolist()
val_questions = val['postText'].tolist()

## Improve questions

In [None]:
# Display the first validation question exactly as stored in val_questions.
val_questions[0]

In [None]:
# Display what improve_question returns for the first validation question.
improve_question(val_questions[0])

In [None]:
# Run improve_question over every validation question, preserving order.
improved_val_questions = list(map(improve_question, val_questions))

# Load Model and Tokenizer

In [None]:
# NOTE(review): `transformers` itself is not referenced in the visible cells
# (the classes used below are imported at the top); kept in case other cells rely on it.
import transformers

# Single source of truth for the checkpoint, so the model and the tokenizer
# can never be loaded from two different names by accident.
MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"

# Load the pre-trained model and its matching tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
# use_fast=False asks for the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

## Initialize Question Answering Pipeline

In [None]:
# Bundle the loaded model and tokenizer into a question-answering pipeline.
pipeline = QuestionAnsweringPipeline(model=model, tokenizer=tokenizer)

## Example predictions with and without clickbait question preprocessing

In [None]:
# Predict on the first validation example using the question text as stored.
# NOTE(review): `postprocess` is passed as a call-time keyword — confirm the
# installed QuestionAnsweringPipeline actually applies it rather than ignoring it.
pipeline(val_questions[0], val_contexts[0], postprocess=postprocess_phrase_spoiler)

In [None]:
# Predict on the first validation example after passing its question through improve_question.
# NOTE(review): `postprocess` is passed as a call-time keyword — confirm the
# installed QuestionAnsweringPipeline actually applies it rather than ignoring it.
pipeline(improve_question(val_questions[0]), val_contexts[0], postprocess=postprocess_phrase_spoiler)

In [None]:
# Predict an answer for every validation example, pairing each rewritten
# question with its context (order is preserved).
# A zip object has no length, so tqdm is given an explicit `total`; without it
# the bar shows only a running counter, with no percentage or ETA.
# NOTE(review): `postprocess` is passed as a call-time keyword — confirm the
# installed QuestionAnsweringPipeline actually applies it rather than ignoring it.
pred_answers = [
    pipeline(q, c, postprocess=postprocess_phrase_spoiler)
    for q, c in tqdm(
        zip(improved_val_questions, val_contexts),
        total=len(val_contexts),
    )
]

# Evaluate using BLEU Score

In [37]:
# Gold side: the 'text' field of every validation answer record.
val_answers_text = [gold_record['text'] for gold_record in val_answers]
# Predicted side: the 'answer' field of every pipeline result.
pred_answers_text = [prediction['answer'] for prediction in pred_answers]

In [None]:
# Score the predicted answers against the gold spoiler texts.
# NOTE(review): argument order (gold first, predictions second) is taken as-is —
# confirm it matches what utils.calculate_bleu expects.
calculate_bleu(val_answers_text, pred_answers_text)