In [11]:
# Cell 1: Import libraries and create output directory
from transformers import BertTokenizer, BertModel
import torch
import os

# Make sure the folder that later cells write their results into exists.
# exist_ok=True keeps this cell safe to re-run when the folder is already there.
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

print("Libraries imported and outputs directory created.")

Libraries imported and outputs directory created.


In [12]:
# Cell 2: Load pre-trained BERT tokenizer and model
# Both objects are built from the same pretrained checkpoint name, so it is
# spelled once here and passed to each loader.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

print("BERT tokenizer and model loaded successfully.")

BERT tokenizer and model loaded successfully.


In [13]:
# Cell 3: Define input text and tokenize
text = "This is a sample text for tokenization and encoding."

# Tokenize the single sentence into PyTorch tensors. Padding and truncation are
# both enabled, with the sequence length capped at 128 tokens.
inputs = tokenizer(
    text,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Show each tensor the tokenizer produced, one labelled line per field.
for _caption, _field in (
    ("Tokenized Text (Input IDs):", 'input_ids'),
    ("Attention Mask:", 'attention_mask'),
    ("Token Type IDs:", 'token_type_ids'),
):
    print(_caption, inputs[_field])

Tokenized Text (Input IDs): tensor([[  101,  2023,  2003,  1037,  7099,  3793,  2005, 19204,  3989,  1998,
         17181,  1012,   102]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Token Type IDs: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [14]:
# Cell 4: Encode the text to get embeddings
# Switch the model to evaluation mode before running inference.
model.eval()

# Run the forward pass without tracking gradients; none are needed here.
with torch.no_grad():
    outputs = model(**inputs)

# Per-token hidden states from the final layer.
# Shape: [batch_size, seq_length, hidden_size]
embeddings = outputs.last_hidden_state

print("Embeddings Shape:", embeddings.shape)

Embeddings Shape: torch.Size([1, 13, 768])


In [15]:
# Cell 5: Save tokenized inputs and embeddings
# The tokenizer output is a transformers BatchEncoding. Saving that object
# directly pickles the BatchEncoding class into the file, which
# torch.load(..., weights_only=True) rejects unless the reader allowlists the
# class first. Converting to a plain dict keeps the same keys and tensors while
# producing a file that holds only built-in containers and tensors.
torch.save(dict(inputs), 'outputs/task1_inputs.pt')
torch.save(embeddings, 'outputs/task1_embeddings.pt')

print("Tokenized inputs and embeddings saved to outputs folder.")

Tokenized inputs and embeddings saved to outputs folder.


In [16]:
# Cell 6: Decode tokens and save summary
# Map the ids of the first (and only) sequence in the batch back to the
# tokenizer's vocabulary strings.
decoded_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print("Decoded Tokens:", decoded_tokens)

# Write a human-readable summary of this run. The encoding is pinned to UTF-8
# so the file does not depend on the platform's locale default; this matters
# as soon as the input text or decoded tokens contain non-ASCII characters.
with open('outputs/task1_summary.txt', 'w', encoding='utf-8') as f:
    f.write(f"Input Text: {text}\n")
    f.write(f"Tokenized Input IDs: {inputs['input_ids'].tolist()}\n")
    f.write(f"Decoded Tokens: {decoded_tokens}\n")
    f.write(f"Embeddings Shape: {embeddings.shape}\n")

print("Summary saved to task1_summary.txt.")

Decoded Tokens: ['[CLS]', 'this', 'is', 'a', 'sample', 'text', 'for', 'token', '##ization', 'and', 'encoding', '.', '[SEP]']
Summary saved to task1_summary.txt.


In [23]:

# Cell 7: Verify loaded files
import torch
from transformers.tokenization_utils_base import BatchEncoding
import torch.serialization

# Allowlist BatchEncoding so the restricted (weights_only) unpickler will
# accept a file that contains a pickled BatchEncoding object.
torch.serialization.add_safe_globals([BatchEncoding])

# Request the restricted unpickler explicitly instead of relying on
# torch.load's default, which differs between PyTorch releases. With the
# default left implicit, the allowlist above is only consulted on versions
# where weights_only defaults to True.
inputs = torch.load('outputs/task1_inputs.pt', weights_only=True)
embeddings = torch.load('outputs/task1_embeddings.pt', weights_only=True)

print("Loaded Inputs:", inputs)
print("Loaded Embeddings Shape:", embeddings.shape)

# Fallback if restricted loading fails: pass weights_only=False to both
# torch.load calls. That performs a full unpickle, which can execute arbitrary
# code, so only use it on files whose source you trust.

Loaded Inputs: {'input_ids': tensor([[  101,  2023,  2003,  1037,  7099,  3793,  2005, 19204,  3989,  1998,
         17181,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Loaded Embeddings Shape: torch.Size([1, 13, 768])
