In [6]:
import torch
from torch.nn import functional as F

from transformers import (BertTokenizer, BertForMaskedLM, BertForNextSentencePrediction,
                        AutoModelForQuestionAnswering,AutoTokenizer, AutoModelForSeq2SeqLM, 
                        LEDTokenizer, LEDForConditionalGeneration)

In [7]:
# Tokenizer for the bert-base-uncased checkpoint; the encoding, masked-LM and
# next-sentence cells below all reuse it.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

## Encoding

In [8]:
# Encode one sentence into PyTorch tensors. padding="max_length" fills the
# sequence out with pad ids, which is why the displayed input_ids are mostly 0.
text = "08:59: waiting for my team to join the call"
encoding = tokenizer.encode_plus(
    text,
    return_tensors="pt",
    return_attention_mask=True,
    padding="max_length",
    truncation=True,
    add_special_tokens=True,
)
encoding

{'input_ids': tensor([[ 101, 5511, 1024, 5354, 1024, 3403, 2005, 2026, 2136, 2000, 3693, 1996,
         2655,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,

## Masked language modeling

In [9]:
# Masked-language-model head on top of bert-base-uncased. return_dict=True is
# requested so the forward-pass result exposes named fields such as `.logits`,
# which the prediction cell below reads.
masked_model = BertForMaskedLM.from_pretrained(
    "bert-base-uncased",
    return_dict=True,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Prompt containing a single mask placeholder for the model to fill in.
text = "".join(("The Opera House in Australia is in , ", tokenizer.mask_token, " city"))

# NOTE(review): `input` shadows the Python builtin; the name is kept because
# the next cell unpacks it into the model call.
input = tokenizer.encode_plus(text, return_tensors="pt")

# Locate the mask token inside the first (only) encoded sequence.
# Single-argument torch.where returns a tuple of index tensors, so
# mask_index is a tuple, not a bare tensor.
sequence_ids = input["input_ids"][0]
mask_index = torch.where(sequence_ids == tokenizer.mask_token_id)

In [11]:
# Inference only: run the forward pass without autograd bookkeeping, the same
# way the question-answering cell does.
with torch.no_grad():
    output = masked_model(**input)

# Turn the per-position vocabulary logits into probabilities.
softmax = F.softmax(output.logits, dim=-1)

# Probability distribution over the vocabulary at the masked position.
mask_word = softmax[0, mask_index, :]

# Token ids of the 3 most probable fillers for the mask.
# NOTE: despite its name, `top_10` holds only 3 candidates; the name is kept
# because the next cell iterates over it.
top_10 = torch.topk(mask_word, 3, dim=1).indices[0]

In [12]:
# Substitute each candidate token back into the prompt and show the result.
for token in top_10:
    word = tokenizer.decode([token])
    print(text.replace(tokenizer.mask_token, word))

The Opera House in Australia is in , sydney city
The Opera House in Australia is in , melbourne city
The Opera House in Australia is in , brisbane city


## Next sentence prediction

In [13]:
# Next-sentence-prediction head on top of the same bert-base-uncased
# checkpoint; used by the two sentence-pair cells below.
nsp_model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")

In [14]:
# Sentence pair where the second sentence is a plausible continuation.
prompt = "Incredible journey, Isha! Your dedication and teamwork shine through this experience."
next_sentence = "It's inspiring to see how you're using technology for such a meaningful cause."

# Encode both sentences together as a single pair input.
encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors="pt")

# Inference only: skip autograd bookkeeping, as the question-answering cell does.
with torch.no_grad():
    outputs = nsp_model(**encoding)[0]  # element 0 of the model output: the NSP logits

# Probabilities over the two NSP classes. Per the Hugging Face
# BertForNextSentencePrediction docs, column 0 means "sentence B continues
# sentence A" and column 1 means "sentence B is a random sentence".
F.softmax(outputs, dim=1)

tensor([[9.9999e-01, 8.2553e-06]], grad_fn=<SoftmaxBackward0>)

In [15]:
# Same prompt as before, now paired with an unrelated second sentence.
prompt = "Incredible journey, Isha! Your dedication and teamwork shine through this experience."
next_sentence = "80% of chronic diseases are preventable. "

# Encode both sentences together as a single pair input.
encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors="pt")

# Inference only: skip autograd bookkeeping, as the question-answering cell does.
with torch.no_grad():
    outputs = nsp_model(**encoding)[0]  # element 0 of the model output: the NSP logits

# Probabilities over the two NSP classes. Per the Hugging Face
# BertForNextSentencePrediction docs, column 0 means "sentence B continues
# sentence A" and column 1 means "sentence B is a random sentence".
F.softmax(outputs, dim=1)

tensor([[6.3895e-05, 9.9994e-01]], grad_fn=<SoftmaxBackward0>)

## Question Answering
### Barely acceptable

In [16]:
# BERT checkpoint used for extractive question answering; the model and its
# tokenizer are both loaded from the same checkpoint name.
model_name = "deepset/bert-base-cased-squad2"
qa_model = AutoModelForQuestionAnswering.from_pretrained(
    model_name,
)
qa_tokeniser = AutoTokenizer.from_pretrained(
    model_name,
)

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [17]:
context = "My name is Clara and I live in Berkeley."
question = "Where do I live?"

# Passing two sequences to the tokenizer as two arguments encodes them
# together as a single (question, context) pair input.
tokenized_inputs = qa_tokeniser(question, context, return_tensors="pt")

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = qa_model(**tokenized_inputs)

# start_logits / end_logits are torch.FloatTensor of shape
# (batch_size, sequence_length) holding the span-start / span-end scores
# (before SoftMax). The argmax of each is taken as the answer boundary.
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

# Slice the answer's token ids out of the input (the end index is inclusive,
# hence the + 1) and decode them back to text.
predict_answer_tokens = tokenized_inputs.input_ids[0, answer_start_index : answer_end_index + 1]
qa_tokeniser.decode(predict_answer_tokens)
     
     

'Berkeley'

## BERT Text Generation (Dual-BERT Architecture)
### Don't use it


In [None]:
# Sequence-to-sequence checkpoint and its matching tokenizer; both come from
# the same checkpoint name.
b2b_tokenizer = AutoTokenizer.from_pretrained(
    "google/roberta2roberta_L-24_discofuse"
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/roberta2roberta_L-24_discofuse"
)

config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/846k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.82G [00:00<?, ?B/s]

In [None]:
# Two sentences that the seq2seq model is asked to rewrite.
discofuse = """As a run-blocker, Zeitler moves relatively well. Zeitler often struggles at the point of contact in space."""

# Encode with the seq2seq checkpoint's own tokenizer and generate; [0] selects
# the first (only) generated sequence.
input_ids = b2b_tokenizer(discofuse, return_tensors="pt").input_ids
output_ids = model.generate(input_ids)[0]

# Decode with the SAME tokenizer that produced input_ids. The previous code
# used `tokenizer` (bert-base-uncased), whose vocabulary does not correspond
# to the ids this model emits.
print(b2b_tokenizer.decode(output_ids, skip_special_tokens=True))