# Multi-threaded Fast Tokenizer

In [None]:
# Load variables from a local .env file (if one exists) into the process
# environment so the os.getenv lookup later in the notebook can see them.
from dotenv import load_dotenv
load_dotenv()

In [None]:
import os

# Read the access token from the GEMMA_TOKEN environment variable (None if it is unset)
GEMA_TOKEN = os.getenv("GEMMA_TOKEN")

In [None]:
# pip install accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the Gemma 2 instruction-tuned tokenizer and model, forwarding the access
# token to both loaders. `token=` is the current keyword for this; the older
# `use_auth_token=` spelling is deprecated in transformers.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it", token=GEMA_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    device_map="auto",
    torch_dtype=torch.float32,
    token=GEMA_TOKEN,
)

input_text = "Write me a story about a dragon."
# Despite its name, `input_ids` holds the tokenizer's whole output, moved to the
# model's device; it is unpacked as keyword arguments into `generate` below.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

# Generate at most 300 new tokens and print the decoded first sequence.
outputs = model.generate(**input_ids, max_new_tokens=300)
print(tokenizer.decode(outputs[0]))

In [None]:
from concurrent.futures import ThreadPoolExecutor
from transformers import AutoTokenizer

# Load the BERT tokenizer into the module-level `tokenizer` that `tokenize`
# (defined below) calls from the worker threads.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Report whether the loaded tokenizer is a "fast" implementation (`is_fast`).
print(f"Tokenizer is fast: {tokenizer.is_fast}")

# Function to tokenize
def tokenize(text):
    """Tokenize a text, or a batch of texts, into PyTorch tensors.

    Args:
        text: A string or a list of strings. `process_in_parallel` passes one
            batch (a list of strings) per call.

    Returns:
        The tokenizer's encoding with `return_tensors="pt"` tensors.
    """
    # Padding and truncation are required to stack a batch of texts with
    # different token counts into one tensor; without them only batches whose
    # texts all tokenize to the same length can be converted. For such
    # equal-length batches the output is unchanged.
    return tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Synthetic workload for the multi-threading test: ten million copies of one sentence.
texts = 10_000_000 * ["This is a test sentence."]

from itertools import islice

def batchify(iterable, batch_size):
    """Lazily split `iterable` into consecutive lists of up to `batch_size` items.

    The final list may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    while True:
        chunk = list(islice(source, batch_size))
        if not chunk:
            # The source is exhausted (or batch_size took nothing): stop.
            return
        yield chunk

# Tokenize in batches
def process_in_parallel(texts, batch_size, max_workers=4):
    """Tokenize `texts` batch by batch on a thread pool.

    Args:
        texts: Iterable of texts to tokenize.
        batch_size: Number of texts handed to each `tokenize` call.
        max_workers: Size of the thread pool.

    Returns:
        A list with one `tokenize` result per batch, in input order
        (`executor.map` preserves the order of its inputs).
    """
    batches = batchify(texts, batch_size)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        encoded_batches = pool.map(tokenize, batches)
        # Drain the result iterator while the pool is still open.
        return list(encoded_batches)


# Run the threaded tokenization: 64 texts per batch across 14 worker threads.
process_in_parallel(texts, batch_size=64, max_workers=14)

# Multi-threading Fast Tokenization & LLM Inference

In [None]:
from concurrent.futures import ThreadPoolExecutor
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from itertools import islice

# Use the MPS backend when PyTorch reports it as available; otherwise fall back to the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Load the tokenizer and the sequence-classification model, move the model to
# the chosen device, and put it in evaluation mode.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
model = model.to(device)
model.eval()

# Function to tokenize a batch of texts and run inference
def tokenize_and_infer(batch):
    """Tokenize a batch of texts and return the model's predicted classes.

    Args:
        batch: List of texts to classify together.

    Returns:
        A Python list holding the argmax over dim 1 of the logits
        (presumably one class index per text; assumes logits are
        (batch, num_classes)).
    """
    # Pad/truncate so the whole batch becomes one tensor, then move it to the device.
    encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        output = model(**encoded)

    predicted = torch.argmax(output.logits, dim=1)
    return predicted.tolist()

# Batchify texts
def batchify(iterable, batch_size):
    """Yield successive lists of at most `batch_size` items taken from `iterable`.

    Iteration stops as soon as a slice comes back empty, so the last batch may
    be shorter than `batch_size`.
    """
    stream = iter(iterable)
    # iter(callable, sentinel) keeps calling until the callable returns [].
    for chunk in iter(lambda: list(islice(stream, batch_size)), []):
        yield chunk

# Process texts in parallel (tokenization + inference)
def process_in_parallel(texts, batch_size, max_workers=4):
    """Run tokenization + inference over `texts` in batches on a thread pool.

    Args:
        texts: Iterable of texts to classify.
        batch_size: Number of texts per `tokenize_and_infer` call.
        max_workers: Number of worker threads (adjust as needed).

    Returns:
        A list with one entry per batch, in input order; each entry is the
        list of predictions returned by `tokenize_and_infer` for that batch.
    """
    batches = batchify(texts, batch_size)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        per_batch_predictions = pool.map(tokenize_and_infer, batches)
        # Drain the result iterator while the pool is still open.
        return list(per_batch_predictions)

# Synthetic workload: 100,000 copies of one sentence.
texts = 100_000 * ["This is a test sentence."]

# Run settings: texts per batch and number of worker threads.
batch_size, max_workers = 64, 14
results = process_in_parallel(texts, batch_size, max_workers=max_workers)

# `results` holds one list of predictions per batch, so this prints the
# prediction lists of the first 10 batches.
print(results[:10])


## Processing Results As Completed

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from itertools import islice

# Pick the compute device: MPS if PyTorch reports it as available, CPU otherwise.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Tokenizer and sequence-classification model; the model is moved to the
# device and switched to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
model = model.to(device)
model.eval()

# Function to tokenize a batch of texts and run inference
def tokenize_and_infer(batch):
    """Classify one batch of texts.

    The batch is tokenized with padding and truncation, moved to `device`,
    and run through `model` without gradient tracking.

    Args:
        batch: List of texts.

    Returns:
        List with the argmax over dim 1 of the logits (presumably one class
        index per text; assumes logits are (batch, num_classes)).
    """
    model_inputs = tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)

    with torch.no_grad():
        scores = model(**model_inputs).logits
        class_ids = scores.argmax(dim=1)

    return class_ids.tolist()

# Batchify texts
def batchify(iterable, batch_size):
    """Generate consecutive batches (lists) of up to `batch_size` items.

    Batches are produced lazily; the generator ends at the first empty slice.
    """
    cursor = iter(iterable)

    def next_batch():
        # Pull the next slice from the shared iterator.
        return list(islice(cursor, batch_size))

    # Keep calling next_batch() until it returns an empty list.
    yield from iter(next_batch, [])

# Process texts in parallel (tokenization + inference)
def process_in_parallel(texts, batch_size, max_workers=4):
    """Classify `texts` in batches on a thread pool, collecting results as they finish.

    Args:
        texts: Iterable of texts to classify.
        batch_size: Number of texts per `tokenize_and_infer` call.
        max_workers: Number of worker threads (adjust as needed).

    Returns:
        A flat list of predictions. Batches are appended in completion order,
        which is not necessarily input order. A batch whose task raises is
        reported with a printed message and contributes nothing to the list.
    """
    predictions = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which batch each future belongs to, for error reporting.
        batch_by_future = {}
        for chunk in batchify(texts, batch_size):
            batch_by_future[pool.submit(tokenize_and_infer, chunk)] = chunk

        for finished in as_completed(batch_by_future):
            try:
                predictions += finished.result()
            except Exception as e:
                print(f"Error processing batch: {batch_by_future[finished]}, {e}")
    return predictions

# Run settings; `texts` is expected to already exist from an earlier cell.
batch_size, max_workers = 64, 14
results = process_in_parallel(texts, batch_size=batch_size, max_workers=max_workers)

# First 10 entries of the flat prediction list (batches arrive in completion order).
print(results[:10])

## Adding CPU Usage Monitoring

In [None]:
import psutil
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from itertools import islice
from concurrent.futures import ThreadPoolExecutor, as_completed

# Use the MPS backend when PyTorch reports it as available; otherwise the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Tokenizer and sequence-classification model, moved to the device and set to
# evaluation mode.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
model = model.to(device)
model.eval()

# Module-level accumulator: `tokenize_and_infer` appends one CPU reading per batch.
cpu_usages = list()

# Function to tokenize a batch of texts and monitor CPU usage
def tokenize_and_infer(batch):
    """Tokenize a batch, record CPU usage for the tokenization step, then classify.

    Side effects: appends one CPU reading to the module-level `cpu_usages`
    list and prints a progress line that overwrites itself (`\\r`).

    Args:
        batch: List of texts to classify together.

    Returns:
        List with the argmax over dim 1 of the logits (presumably one class
        index per text; assumes logits are (batch, num_classes)).
    """
    process = psutil.Process()  # Handle for the current process

    # Prime the CPU counter. With interval=None the call returns immediately
    # and the *next* call reports usage since this one; the value returned
    # here is meaningless and is discarded. A blocking interval (e.g. 0.1)
    # would instead sleep inside every batch and sample a window that does
    # not contain the tokenization work at all.
    process.cpu_percent(interval=None)

    # Start the clock only after priming so the elapsed time covers
    # tokenization (and the move to the device) and nothing else.
    start_time = time.perf_counter()

    # Tokenize the batch
    tokens = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)

    elapsed_time = time.perf_counter() - start_time

    # CPU usage of the whole process since the priming call. It is
    # process-wide, so work done concurrently by other worker threads is
    # included, and it can exceed 100% on multi-core machines.
    # NOTE(review): for very short windows psutil readings are coarse.
    avg_cpu_usage = process.cpu_percent(interval=None)
    cpu_usages.append(avg_cpu_usage)  # Collect CPU usage

    print(f"\rTokenization Avg CPU Usage: {avg_cpu_usage:.2f}%, Elapsed Time: {elapsed_time:.4f}s", end='', flush=True)

    # Run inference without gradient tracking
    with torch.no_grad():
        logits = model(**tokens).logits

    # Return predictions
    return logits.argmax(dim=1).tolist()  # Predicted classes

# Batchify texts
def batchify(iterable, batch_size):
    """Split `iterable` lazily into lists holding at most `batch_size` items each.

    The generator finishes when a slice comes back empty; the last batch may
    be shorter than `batch_size`.
    """
    remaining = iter(iterable)
    batch = list(islice(remaining, batch_size))
    while batch:
        yield batch
        # Fetch the next slice only after the consumer asks for more.
        batch = list(islice(remaining, batch_size))

# Process texts in parallel (tokenization + inference)
def process_in_parallel(texts, batch_size, max_workers=4):
    """Run `tokenize_and_infer` over `texts` in batches on a thread pool.

    Args:
        texts: Iterable of texts to classify.
        batch_size: Number of texts per submitted task.
        max_workers: Number of worker threads.

    Returns:
        A flat list of predictions, gathered in the order the batches finish
        (not necessarily input order). If a task raises, a message with the
        offending batch is printed and that batch adds nothing to the list.
    """
    collected = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Submit every batch up front, keeping future -> batch for error reporting.
        pending = {}
        for chunk in batchify(texts, batch_size):
            task = pool.submit(tokenize_and_infer, chunk)
            pending[task] = chunk

        # Harvest each task as soon as it completes.
        for task in as_completed(pending):
            try:
                batch_predictions = task.result()
                collected.extend(batch_predictions)
            except Exception as e:
                print(f"Error processing batch: {pending[task]}, {e}")
    return collected

# Synthetic workload: 100,000 copies of one sentence.
texts = 100_000 * ["This is a test sentence."]

# Run settings: texts per batch and number of worker threads.
batch_size, max_workers = 64, 4
results = process_in_parallel(texts, batch_size, max_workers=max_workers)

# Mean of the per-batch CPU readings; 0 when no reading was collected.
if cpu_usages:
    average_cpu_usage = sum(cpu_usages) / len(cpu_usages)
else:
    average_cpu_usage = 0
print(f"\nAverage CPU Usage for Tokenization: {average_cpu_usage:.2f}%")

# First 10 entries of the flat prediction list (batches arrive in completion order).
print(results[:10])
