In [None]:
pip install transformers

In [None]:
pip install datasets

In [None]:
pip install KeywordMasking

In [None]:
import torch
from transformers import AutoTokenizer

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer loaded from the pretrained "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [12]:
# Sample movie review used as the input text for the rest of the notebook.
# Adjacent string literals inside the parentheses are concatenated, so the
# source indentation never ends up inside the string. (A backslash continuation
# placed inside a quoted literal keeps the next line's leading spaces as part
# of the text.)
actual_text = (
    "Whoever wrote the screenplay for this movie obviously never consulted any books about Lucille Ball, "
    "especially her autobiography. I've never seen "
    "so many mistakes in a biopic, ranging from her early years in Celoron and Jamestown to her later years "
    "with Desi. I could write a whole list of "
    "factual errors, but it would go on for pages. In all, I believe that Lucille Ball is one of those "
    "inimitable people who simply cannot be portrayed "
    "by anyone other than themselves. If I were Lucie Arnaz and Desi, Jr., I would be irate at how many "
    "mistakes were made in this film. "
    "The filmmakers tried hard, but the movie seems awfully sloppy to me."
)

In [13]:
# Summarize the review text with the facebook/bart-large-cnn summarization pipeline.

from transformers import pipeline

# Select the first GPU when CUDA is available and the CPU (-1) otherwise;
# a hard-coded GPU index makes this cell fail on machines without CUDA.
pipeline_device = 0 if torch.cuda.is_available() else -1

# NOTE(review): the captured run warns that max_length (1024) is far larger than
# the input length; consider lowering it once the desired summary size is settled.
pipe = pipeline("summarization", model="facebook/bart-large-cnn", max_length=1024, device=pipeline_device)
pipe_out = pipe(actual_text)
# The summary string is stored under 'summary_text' in the first result.
summarized_text = pipe_out[0]['summary_text']
print()
print(f'Summarized text: {summarized_text}')

Your max_length is set to 1024, but you input_length is only 200. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=100)



Summarized text: I've never seen so many mistakes in a biopic, ranging from her early years in Celoron and Jamestown to her later years with Desi. Whoever wrote the screenplay for this movie obviously never consulted any books about Lucille Ball, especially her autobiography. The filmmakers tried hard, but the movie seems awfully sloppy to me.


In [14]:
from KeywordMasking import DataCollatorForKeywordMasking

# Keywords handed to the collator through `list_of_keywords`.
example_keywords = [
    'mistakes',
    'screenplay',
    'consulted',
    'Lucille Ball',
    'filmmakers',
    'sloppy',
]

# Collator built on the notebook's tokenizer, configured to return NumPy arrays.
# presumably mlm_probability is the chance that a keyword token gets masked — TODO confirm
data_collator = DataCollatorForKeywordMasking(
    tokenizer=tokenizer,
    list_of_keywords=example_keywords,
    mlm_probability=0.75,
    return_tensors='np',
)

In [17]:
import pandas as pd

# Tokenize the summary, then pass that single example through the collator.
inputs = tokenizer(summarized_text, return_tensors='np')
original_ids = inputs["input_ids"][0]
outputs = data_collator([{"input_ids": original_ids}])
masked_ids = outputs["input_ids"][0]

# Each dict entry becomes one row after the transpose; columns are token positions.
token_comparison = {
    "original tokens": tokenizer.convert_ids_to_tokens(original_ids),
    "masked tokens": tokenizer.convert_ids_to_tokens(masked_ids),
    "original input ids": original_ids,
    "masked input ids": masked_ids,
    "labels": outputs["labels"][0],
}
pd.DataFrame(token_comparison).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59,60,61,62,63,64,65,66,67,68
original tokens,[CLS],i,',ve,never,seen,so,many,mistakes,in,...,the,movie,seems,awful,##ly,sloppy,to,me,.,[SEP]
masked tokens,[CLS],i,',ve,never,seen,so,many,[MASK],in,...,the,movie,seems,awful,##ly,[MASK],to,me,.,[SEP]
original input ids,101,1045,1005,2310,2196,2464,2061,2116,12051,1999,...,1996,3185,3849,9643,2135,28810,2000,2033,1012,102
masked input ids,101,1045,1005,2310,2196,2464,2061,2116,103,1999,...,1996,3185,3849,9643,2135,103,2000,2033,1012,102
labels,-100,-100,-100,-100,-100,-100,-100,-100,12051,-100,...,-100,-100,-100,-100,-100,28810,-100,-100,-100,-100
