In [3]:
from utils import load_data, get_passage_spoiler, postprocess_phrase_spoiler, calculate_bleu

from tqdm import tqdm
import pandas as pd

from transformers import QuestionAnsweringPipeline, AutoTokenizer, AutoModelForQuestionAnswering

# Load and Preprocess Data

In [4]:
# Read the training and validation splits (train first, then validation).
train, val = (
    load_data('../data/train.jsonl'),
    load_data('../data/validation.jsonl'),
)

In [5]:
# Collapse the list-valued columns into plain strings:
#   targetParagraphs -> elements joined with a space
#   postText / tags  -> first element only (postText is also stripped)
#   spoiler          -> elements joined with a newline
train = train.assign(
    targetParagraphs=train['targetParagraphs'].apply(' '.join),
    postText=train['postText'].apply(lambda texts: texts[0].strip()),
    spoiler=train['spoiler'].apply('\n'.join),
    tags=train['tags'].apply(lambda labels: labels[0]),
)
# Keep only rows tagged 'passage' and the four columns used downstream.
train = train.loc[
    train['tags'] == 'passage',
    ['spoiler', 'postText', 'targetParagraphs', 'tags'],
]
print(train.shape)
train.head(3)

(1274, 4)


Unnamed: 0,spoiler,postText,targetParagraphs,tags
0,how about that morning we go throw?,"Wes Welker Wanted Dinner With Tom Brady, But P...",It’ll be just like old times this weekend for ...,passage
5,"Apple says that if AirPods are lost or stolen,...",What happens if your new AirPods get lost or s...,One of the biggest surprise announcements at A...,passage
6,"""The more good games I had in them, the more I...",The Reason Why Gabor Kiraly Wears THOSE Tracki...,June 14th 2016 3.3K Shares They may look like ...,passage


In [6]:
# Collapse the list-valued columns into plain strings:
#   targetParagraphs -> elements joined with a space
#   postText / tags  -> first element only (postText is also stripped)
#   spoiler          -> elements joined with a newline
val = val.assign(
    targetParagraphs=val['targetParagraphs'].apply(' '.join),
    postText=val['postText'].apply(lambda texts: texts[0].strip()),
    spoiler=val['spoiler'].apply('\n'.join),
    tags=val['tags'].apply(lambda labels: labels[0]),
)
# Keep only rows tagged 'passage' and the four columns used downstream.
val = val.loc[
    val['tags'] == 'passage',
    ['spoiler', 'postText', 'targetParagraphs', 'tags'],
]
print(val.shape)
val.head(3)

(322, 4)


Unnamed: 0,spoiler,postText,targetParagraphs,tags
0,some of the plot elements are so disturbing th...,Five Nights at Freddy’s Sequel Delayed for Wei...,Five Nights at Freddy’s creator Scott Cawthon ...,passage
4,a man who swallowed a 64GB microSD card and th...,A man swallowed a microSD card and you won't b...,PetaPixel is one of my favorite blogs. The wri...,passage
7,McGonagall was appointed as Dumbledore’s assis...,"You won't believe this stunning ""Harry Potter""...","From reading J.K. Rowling‘s Harry Potter saga,...",passage


In [9]:
# One-column frame of reference spoilers, indexed like `train`.
train_answers = pd.DataFrame({'text': train['spoiler']})

In [10]:
# One-column frame of reference spoilers, indexed like `val`.
val_answers = pd.DataFrame({'text': val['spoiler']})

In [11]:
# Plain Python lists for the QA pipeline: the paragraph text is the context,
# the post text is the question.
train_contexts, train_questions = (
    train['targetParagraphs'].tolist(),
    train['postText'].tolist(),
)
val_contexts, val_questions = (
    val['targetParagraphs'].tolist(),
    val['postText'].tolist(),
)

# Load Model and Tokenizer

In [12]:
# Load the pre-trained QA model and its tokenizer from the same checkpoint.
# use_fast=False requests the non-fast tokenizer implementation.
checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)

## Initialize Question Answering Pipeline

In [13]:
# Wrap the loaded model and tokenizer in a question-answering pipeline.
pipeline = QuestionAnsweringPipeline(model=model, tokenizer=tokenizer)

# Get Predictions for Passage Spoilers

In [14]:
# Predict one passage spoiler per validation example.
# Each (question, context) pair from the zip is passed to the helper, so every
# prediction corresponds to its own example rather than to element 0.
# `total=` is supplied because zip() has no length and tqdm could not otherwise
# show progress against the number of examples.
pred_answers = [
    get_passage_spoiler(pipeline=pipeline, question=question, context=context, max_loops=10)
    for question, context in tqdm(
        zip(val_questions, val_contexts), total=len(val_questions)
    )
]

0it [00:00, ?it/s]

AttributeError: 'NumpyOps' object has no attribute 'cblas'

# Evaluate using BLEU Score

In [None]:
# Reference spoilers: read the 'text' column directly. Iterating a DataFrame
# yields its column labels (here the string 'text'), not its rows, so a
# per-element lookup of ['text'] over the frame would fail.
val_answers_text = val_answers['text'].to_list()
# Predicted spoilers: take the 'answer' entry of each prediction.
# NOTE(review): assumes get_passage_spoiler returns a mapping with an
# 'answer' key — confirm against utils.
pred_answers_text = [prediction['answer'] for prediction in pred_answers]

In [None]:
# Score the predictions with the project's calculate_bleu helper; the cell's
# output is whatever that call returns.
# NOTE(review): argument order is (reference texts, predicted texts) as passed
# here — confirm it matches calculate_bleu's signature in utils.
calculate_bleu(val_answers_text, pred_answers_text)