In [1]:
# Import necessary libraries
from transformers import AutoTokenizer

## Basic Tokenization Using Python's Split

In [2]:
# Naive tokenization: break the sentence apart wherever whitespace occurs.
text = "Hello, world! This is a test."
tokens = text.split()  # no separator given -> split on runs of whitespace

# Show the input next to the result so the two are easy to compare.
print("Original text:", text)
print("Basic split tokens:", tokens)
# Punctuation stays attached to the neighbouring word:
#   ['Hello,', 'world!', 'This', 'is', 'a', 'test.']
print("\n")  # Extra blank lines to separate this cell's output from the next

Original text: Hello, world! This is a test.
Basic split tokens: ['Hello,', 'world!', 'This', 'is', 'a', 'test.']




## Tokenization Using a Pre-trained BERT Tokenizer

In [3]:
# Load the pre-trained tokenizer that belongs to BERT base (uncased).
# The vocabulary files are downloaded on first use if they are not already
# available locally.
print("Loading pre-trained BERT tokenizer (bert-base-uncased)...")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Fixed sequence length for the padding/truncation example below. Kept in one
# place so the printed message and the actual argument cannot drift apart.
MAX_LENGTH = 10

# Split a sentence into tokens. tokenize() returns only the tokens of the
# text itself (lower-cased by this uncased model); no special tokens are added.
text = "Hello, how are you?"
print(f"Original text: '{text}'")
tokens = tokenizer.tokenize(text)
print("Tokens from BERT tokenizer:", tokens)
# Output: ['hello', ',', 'how', 'are', 'you', '?']

# Convert the text to vocabulary IDs. encode() adds the model's special tokens
# by default, so the result is two IDs longer than `tokens`:
# 101 ([CLS]) at the start and 102 ([SEP]) at the end.
input_ids = tokenizer.encode(text)
print("Input IDs from BERT tokenizer:", input_ids)
# Output: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102]

# Encode to a fixed length. Calling the tokenizer directly is the recommended
# API (encode_plus is deprecated) and matches the batch example below.
# padding='max_length' right-pads with ID 0 up to MAX_LENGTH, truncation=True
# cuts longer inputs down to MAX_LENGTH, and the returned attention_mask marks
# real tokens with 1 and padding with 0.
print(f"\nEncoding with padding/truncation to max length of {MAX_LENGTH}...")
encoded = tokenizer(
    text,
    max_length=MAX_LENGTH,
    padding='max_length',
    truncation=True,
    return_tensors='pt'  # Return PyTorch tensors
)
print("Encoded input with padding and truncation:", encoded)

# Encode several texts at once. padding=True pads every sequence to the length
# of the longest one in the batch, which is what allows the result to be
# returned as rectangular tensors.
texts = ["Hello, how are you?", "I'm fine, thank you."]
print("\nTokenizing a batch of texts...")
encoded_batch = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors='pt'
)
print("Encoded batch:", encoded_batch)
print("\n")  # Extra blank lines to separate this cell's output from the next

Loading pre-trained BERT tokenizer (bert-base-uncased)...
Original text: 'Hello, how are you?'
Tokens from BERT tokenizer: ['hello', ',', 'how', 'are', 'you', '?']
Input IDs from BERT tokenizer: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102]

Encoding with padding/truncation to max length of 10...
Encoded input with padding and truncation: {'input_ids': tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}

Tokenizing a batch of texts...
Encoded batch: {'input_ids': tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102,    0,    0],
        [ 101, 1045, 1005, 1049, 2986, 1010, 4067, 2017, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


