# Mask Filling

In [109]:
from transformers import BartForConditionalGeneration, BartTokenizer

In [110]:
BART_MODEL_NAME = "facebook/bart-large"

In [111]:
model = BartForConditionalGeneration.from_pretrained(BART_MODEL_NAME)

In [112]:
tokenizer = BartTokenizer.from_pretrained(BART_MODEL_NAME)

In [113]:
example_english_phrase = "UN Chief Says There Is No <mask> in Syria"

In [114]:
batch = tokenizer(example_english_phrase, return_tensors='pt')

In [115]:
batch

{'input_ids': tensor([[    0,  4154,  1231, 15674,   345,  1534,   440, 50264,    11,  1854,
             2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [116]:
generated_ids = model.generate(batch['input_ids'])

In [117]:
generated_ids

tensor([[    2,  4154,  2118, 14587, 14704,     2]])

In [118]:
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

['UNALSO SEE']

In [119]:
tokenizer.batch_decode(generated_ids)

['</s>UNALSO SEE</s>']

In [120]:
tokenizer.batch_decode(generated_ids)

['</s>UNALSO SEE</s>']

In [121]:
# ## Sample Code from https://huggingface.co/transformers/model_doc/bart.html
# from transformers import BartForConditionalGeneration, BartTokenizer
# model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", force_bos_token_to_be_generated=True)
# tok = BartTokenizer.from_pretrained("facebook/bart-large")
# example_english_phrase = "UN Chief Says There Is No <mask> in Syria"
# batch = tok(example_english_phrase, return_tensors='pt')
# generated_ids = model.generate(batch['input_ids'])
# assert tok.batch_decode(generated_ids, skip_special_tokens=True) == ['UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria']

# BART Conditional Generation - Text Summarization

In [122]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

In [123]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

In [124]:
ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."

In [125]:
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [126]:
MAX_LENTH = 5
summary_ids = model.generate(inputs['input_ids'],
                             num_beams=4,
                             max_length=MAX_LENTH,   # max number of words
                             early_stopping=True
                            )
print(len(summary_ids[0]))
print([
    tokenizer.decode(g,
                     skip_special_tokens=True,
                     clean_up_tokenization_spaces=False) for g in summary_ids
])

5
['My friends']


In [127]:
MAX_LENTH = 10
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=MAX_LENTH, early_stopping=True)
print(len(summary_ids[0]))
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

10
['My friends are cool but they eat']


In [128]:
MAX_LENTH = 20
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=MAX_LENTH, early_stopping=True)
print(len(summary_ids[0]))
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

20
["My friends are cool but they eat too many carbs. I'm not a"]


In [129]:
MAX_LENTH = 50
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=MAX_LENTH, early_stopping=True)
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

["My friends are cool but they eat too many carbs. I'm not a big fan of carbs, but they're good for me. I like to eat a lot of carbs. They're not good for my health. I"]


In [130]:
len([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0])

165

In [131]:
len(summary_ids[0])

50

In [132]:
MAX_LENTH = 50
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=MAX_LENTH, early_stopping=True)
print(len(summary_ids[0]))
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

50
["My friends are cool but they eat too many carbs. I'm not a big fan of carbs, but they're good for me. I like to eat a lot of carbs. They're not good for my health. I"]


In [133]:
ARTICLE_TO_SUMMARIZE = "public class int get_my_id:"
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
MAX_LENTH = 50
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=MAX_LENTH, early_stopping=True)
print(len(summary_ids[0]))
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

50
['public class int get_my_id: public class int gets_my-id: int my_id.public class get_ my-id : int myId: int get my- id. public class get my']


In [134]:
ARTICLE_TO_SUMMARIZE = "public class int get_my_id:"
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
MAX_LENTH = 50
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=MAX_LENTH, early_stopping=True)
print(len(summary_ids[0]))
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

50
['public class int get_my_id: public class int gets_my-id: int my_id.public class get_ my-id : int myId: int get my- id. public class get my']


In [144]:
ARTICLE_TO_SUMMARIZE = '''The fake news dataset is one of the classic text analytics datasets available on Kaggle. 
It consists of genuine and fake articles’ titles and text from different authors. In this article, 
I have walked through the entire text classification process using traditional machine learning approaches as well 
as deep learning.'''
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
print(len(inputs['input_ids'][0]))
MIN_LENTH = 20
MAX_LENTH = 50
summary_ids = model.generate(inputs['input_ids'], min_length=10, max_length=MAX_LENTH, early_stopping=True)
print(len(summary_ids[0]))
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

66
50
['The fake news dataset is one of the classic text analytics datasets available on Kaggle. It consists of genuine and fake articles’ titles and text from different authors. I have walked through the entire text classification process using traditional machine learning']


In [146]:
ARTICLE_TO_SUMMARIZE = '''The fake news dataset is one of the classic text analytics datasets available on Kaggle. 
It consists of genuine and fake articles’ titles and text from different authors. In this article, 
I have walked through the entire text classification process using traditional machine learning approaches as well 
as deep learning.'''
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
print(len(inputs['input_ids'][0]))
MIN_LENTH = 20
MAX_LENTH = 40
summary_ids = model.generate(inputs['input_ids'], min_length=10, max_length=MAX_LENTH, early_stopping=True)
print(len(summary_ids[0]))
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

66
40
['The fake news dataset is one of the classic text analytics datasets available on Kaggle. It consists of genuine and fake articles’ titles and text from different authors. In this article']


## Performance comparison of the NLP Summarization pipeline

In [147]:
from transformers import pipeline
summarizer = pipeline("summarization")
summarizer(ARTICLE_TO_SUMMARIZE, max_length=MAX_LENTH, min_length=MIN_LENTH, do_sample=False)



Downloading:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

[{'summary_text': ' Fake news dataset is one of the classic text analytics datasets available on Kaggle . It consists of genuine and fake articles’ titles and text from different authors . In this article,'}]

# Mask Filling

In [85]:
from transformers import BartTokenizer, BartForConditionalGeneration
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
TXT = "My friends are <mask> but they eat too many carbs."

In [86]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

In [87]:
input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']

In [88]:
input_ids

tensor([[    0,  2387,   964,    32, 50264,    53,    51,  3529,   350,   171,
         33237,     4,     2]])

In [89]:
len(input_ids[0])

13

In [90]:
TXT = "I am <mask> but I eat too many carbs."
input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
input_ids

tensor([[    0,   100,   524, 50264,    53,    38,  3529,   350,   171, 33237,
             4,     2]])

In [91]:
TXT = "I am <mask> but I am also a believer."
input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
input_ids

tensor([[    0,   100,   524, 50264,    53,    38,   524,    67,    10, 27644,
             4,     2]])

In [92]:
logits = model(input_ids).logits

In [93]:
logits

tensor([[[ 14.8692,  -0.7673,  12.1319,  ...,  -2.3899,  -2.6394,   6.6846],
         [ 14.8692,  -0.7673,  12.1319,  ...,  -2.3899,  -2.6394,   6.6846],
         [-26.5103,  -4.0979,  -0.5074,  ...,  -2.7472,  -3.3270,  -5.5256],
         ...,
         [-21.8407,  -3.3104,   7.6325,  ...,  -0.9094,  -0.9088,  -1.6164],
         [-29.7491,  -3.8424,  11.8108,  ...,  -1.3566,  -1.9058,   6.4403],
         [ -6.0542,  -4.1361,  19.9523,  ...,  -6.1178,  -5.3846,   0.9493]]],
       grad_fn=<AddBackward0>)

In [94]:
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()

In [95]:
masked_index

3

In [96]:
probs = logits[0, masked_index].softmax(dim=0)

In [97]:
values, predictions = probs.topk(5) # top 5 predictions

In [98]:
tokenizer.decode(predictions).split()

['a', 'an', 'not', 'Catholic', 'also']

# QA with BART

In [99]:
from transformers import BartTokenizer, BartForQuestionAnswering
import torch

In [100]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForQuestionAnswering.from_pretrained('facebook/bart-large')

Some weights of BartForQuestionAnswering were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [101]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

In [102]:
inputs = tokenizer(question, text, return_tensors='pt')

In [103]:
inputs

{'input_ids': tensor([[    0, 12375,    21,  2488,   289, 13919,   116,     2,     2, 24021,
           289, 13919,    21,    10,  2579, 29771,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:

start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
loss = outputs.loss
start_scores = outputs.start_logits
end_scores = outputs.end_logits

# Experimenting with code

In [158]:
java_code = '''public CombinedToken(Integer token, Integer extraToken) {
        this.token = token;
        this.extraToken = extraToken;
    }

    public CombinedToken(String data) {
        this.tokenString = data;
    }'''



In [159]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

In [161]:
inputs = tokenizer([java_code], return_tensors='pt')

In [162]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [163]:
inputs['input_ids']

tensor([[    0, 15110, 23355, 45643,  1640, 49740, 19233,     6, 47927,  1823,
         45643,    43, 25522, 50118,  1437,  1437,  1437,  1437,  1437,  1437,
          1437,    42,     4, 46657,  5457, 19233,   131, 50118,  1437,  1437,
          1437,  1437,  1437,  1437,  1437,    42,     4, 30842, 45643,  5457,
          1823, 45643,   131, 50118,  1437,  1437,  1437, 35524, 50140,  1437,
          1437,  1437,   285, 23355, 45643,  1640, 34222,   414,    43, 25522,
         50118,  1437,  1437,  1437,  1437,  1437,  1437,  1437,    42,     4,
         46657, 34222,  5457,   414,   131, 50118,  1437,  1437,  1437, 35524,
             2]])

In [164]:
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in inputs['input_ids']])

['public CombinedToken(Integer token, Integer extraToken) {\n        this.token = token;\n        this.extraToken = extraToken;\n    }\n\n    public CombinedToken(String data) {\n        this.tokenString = data;\n    }']
