<a href="https://colab.research.google.com/github/sharma01ketan/Hello-World-/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install transformers




In [4]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, BertForNextSentencePrediction

# Load pre-trained BERT models and tokenizers
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
mlm_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
nsp_model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

# Example text for MLM
text = "[CLS] I went to the [MASK] yesterday. [SEP]"
masked_index = 5  # The index of the masked token in the input

# Tokenize input text
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0] * len(tokenized_text)  # All tokens belong to the same sentence

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Predict masked tokens using MLM model
with torch.no_grad():
    outputs = mlm_model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0][0, masked_index].topk(5)  # Get top 5 predictions
    predicted_tokens = tokenizer.convert_ids_to_tokens(predictions.indices.tolist())

print("Masked Language Modeling:")
print("Original Sentence:", text)
print("Predicted Tokens:", predicted_tokens)

# Example text for NSP
text_a = "The sun is shining today."
text_b = "It's a beautiful day."

# Tokenize input texts and pad to the same length
tokenized_text_a = tokenizer.tokenize(text_a)
tokenized_text_b = tokenizer.tokenize(text_b)

# Pad the shorter sequence to match the length of the longer one
max_length = max(len(tokenized_text_a), len(tokenized_text_b))
tokenized_text_a += ['[PAD]'] * (max_length - len(tokenized_text_a))
tokenized_text_b += ['[PAD]'] * (max_length - len(tokenized_text_b))

indexed_tokens_a = tokenizer.convert_tokens_to_ids(tokenized_text_a)
indexed_tokens_b = tokenizer.convert_tokens_to_ids(tokenized_text_b)

# Convert inputs to PyTorch tensors
tokens_tensor_a = torch.tensor([indexed_tokens_a])
tokens_tensor_b = torch.tensor([indexed_tokens_b])

# Predict next sentence using NSP model
with torch.no_grad():
    outputs = nsp_model(tokens_tensor_a, tokens_tensor_b)
    probabilities = torch.softmax(outputs.logits, dim=1)
    is_next = probabilities[0, 0].item() > probabilities[0, 1].item()

print("\nNext Sentence Prediction:")
print("Text A:", text_a)
print("Text B:", text_b)
print("Is Next Sentence:", is_next)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Masked Language Modeling:
Original Sentence: [CLS] I went to the [MASK] yesterday. [SEP]
Predicted Tokens: ['hospital', 'library', 'store', 'office', 'church']

Next Sentence Prediction:
Text A: The sun is shining today.
Text B: It's a beautiful day.
Is Next Sentence: False
