In [None]:
from transformers import pipeline, set_seed

# Basic sentiment analysis.
# No model is named here, so the pipeline falls back to the library's
# default checkpoint for the "sentiment-analysis" task.
classifier = pipeline("sentiment-analysis")
result = classifier("I love working with transformers!")

# Show the prediction; previously `result` was computed but never displayed,
# so the cell produced no visible output.
print(result)



In [None]:
# Text generation with the pipeline's default model.
# Generation can draw tokens at random, so seed the RNGs first; otherwise
# the printed continuation changes on every re-run of the notebook.
set_seed(42)
generator = pipeline("text-generation")

# NOTE: despite its name, `text` holds the raw pipeline output rather than a
# plain string (per the pipeline docs: a list of dicts keyed by
# "generated_text").
text = generator("The future of AI is")
print(text)

In [None]:
# Text generation with explicit decoding parameters.
# Seed the RNGs so the sampled continuation is reproducible across runs.
set_seed(42)
generator = pipeline(
    "text-generation",
    model="gpt2",
    max_length=50,     # cap on total length in tokens (prompt + continuation)
    do_sample=True,    # temperature / top_k only take effect when sampling is on
    temperature=0.7,   # < 1.0 sharpens the distribution -> less random output
    top_k=50,          # sample only among the 50 most likely next tokens
)
text = generator("The future of AI is")
print(text)

In [None]:
from transformers import AutoTokenizer

# Load the BERT tokenizer, which uses WordPiece
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# A sentence with long, uncommon words so the subword splitting is visible
text = "The ultramarathoner prequalified for the immunohistochemistry conference in neuroscience."

# Split the text into subword tokens
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# Encode the text to token IDs; for BERT, encode() also wraps the sequence
# in the [CLS] ... [SEP] special tokens
token_ids = tokenizer.encode(text)
print("\nToken IDs:", token_ids)

# Decode the IDs back to text so the added special tokens become visible
decoded = tokenizer.decode(token_ids)
print("\nDecoded with special tokens:", decoded)

# Token -> ID mapping. The whole list is converted in one call, and the loop
# variable is not called `id`: that name would shadow the built-in id() for
# every later cell in the session.
print("\nToken to ID mapping:")
for token, token_id in zip(tokens, tokenizer.convert_tokens_to_ids(tokens)):
    print(f"{token:20} -> {token_id}")

In [None]:
# A second sample mixing plain words, words with digit / upper-case suffixes,
# and a hyphenated word.
# NOTE: this cell uses whichever `tokenizer` is currently bound in the
# session; it loads none of its own.
text2 = (
    "The preprocessing workflow using tokenizers and tokenizers123 "
    "also has preprocessingABC and post-processing!"
)

tokens2 = tokenizer.tokenize(text2)
print("Tokens:", tokens2)

In [None]:
from transformers import AutoTokenizer

# Load the GPT-2 tokenizer, which uses BPE
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Same sentence as in the BERT example, for a side-by-side comparison
text = "The ultramarathoner prequalified for the immunohistochemistry conference in neuroscience."

# Split the text into subword tokens
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# Encode the text to token IDs
token_ids = tokenizer.encode(text)
print("\nToken IDs:", token_ids)

# Decode the IDs back to text. Unlike BERT, GPT-2's tokenizer adds no special
# tokens by default, so this is expected to round-trip to the input text.
decoded = tokenizer.decode(token_ids)
print("\nDecoded with special tokens:", decoded)

# Token -> ID mapping. The whole list is converted in one call, and the loop
# variable is not called `id`: that name would shadow the built-in id() for
# every later cell in the session.
print("\nToken to ID mapping:")
for token, token_id in zip(tokens, tokenizer.convert_tokens_to_ids(tokens)):
    # !r applies repr() so the tokenizer's marker characters are shown quoted
    print(f"{token!r:20} -> {token_id}")

# A second sentence containing digits, a hyphen and currency punctuation
text2 = "In 2024, pre-processing costs $123.45!"
tokens2 = tokenizer.tokenize(text2)
print("\nExample 2 tokens:", tokens2)

In [None]:
from transformers import AutoTokenizer

# Load the T5 tokenizer, which uses SentencePiece
tokenizer = AutoTokenizer.from_pretrained('t5-base')

# Same sentence as in the earlier examples, for a side-by-side comparison
text = "The ultramarathoner prequalified for the immunohistochemistry conference in neuroscience."

# Split the text into subword tokens
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# Encode the text to token IDs; for T5, encode() also appends the
# end-of-sequence token
token_ids = tokenizer.encode(text)
print("\nToken IDs:", token_ids)

# Decode the IDs back to text so the added special token becomes visible
decoded = tokenizer.decode(token_ids)
print("\nDecoded with special tokens:", decoded)

# Token -> ID mapping. The whole list is converted in one call, and the loop
# variable is not called `id`: that name would shadow the built-in id() for
# every later cell in the session.
print("\nToken to ID mapping:")
for token, token_id in zip(tokens, tokenizer.convert_tokens_to_ids(tokens)):
    # !r applies repr() so the tokenizer's marker characters are shown quoted
    print(f"{token!r:20} -> {token_id}")

# A mixed-script example (Latin + Japanese) with digits and punctuation.
# NOTE(review): the t5-base vocabulary is presumably English-centric, so the
# Japanese characters may not be covered by it - check the printed pieces.
text2 = "Tokyo 東京 is beautiful! Pre-processing in 2024 costs $123.45"
tokens2 = tokenizer.tokenize(text2)
print("\nMultilingual example tokens:", tokens2)

# Leading, trailing and doubled spaces, to see how whitespace is treated
text3 = "  Hello,  world!  "  # Extra spaces
tokens3 = tokenizer.tokenize(text3)
print("\nWhitespace handling example:", tokens3)