In [None]:
!pip install -U pyarrow --quiet
!pip install datasets transformers torch seqeval evaluate  --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.
ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3

Measure inference speed improvement with quantization

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import time
import numpy as np

# Load a pre-trained model and tokenizer
model_name = 'bert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.quantization import quantize_dynamic

# Function to measure inference time
def measure_inference_time(model, inputs, iterations=100):
    model.eval()
    start_time = time.time()
    with torch.no_grad():
        for _ in range(iterations):
            _ = model(**inputs)
    end_time = time.time()
    avg_time = (end_time - start_time) / iterations
    return avg_time

# Prepare input data
sentences = ["This is a test sentence."] * 100
inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)

# Measure baseline inference time
baseline_time = measure_inference_time(model, inputs)
print(f"Baseline Model Inference Time: {baseline_time:.6f} seconds per inference")

# Quantize the model for faster inference
quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

# Measure inference time for quantized model
quantized_time = measure_inference_time(quantized_model, inputs)
print(f"Quantized Model Inference Time: {quantized_time:.6f} seconds per inference")

# Calculate performance trade-off
speedup = baseline_time / quantized_time
print(f"Speedup from Quantization: {speedup:.2f}x")


Baseline Model Inference Time: 1.712489 seconds per inference
Quantized Model Inference Time: 1.162237 seconds per inference
Speedup from Quantization: 1.47x


Prune the model and measure speed improvement

In [None]:
from torch.nn.utils import prune

# Function to prune a model
def prune_model(model, amount=0.5):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=amount)
            prune.remove(module, 'weight')
    return model

# Prune the model (remove 50% of the weights)
pruned_model = prune_model(model, amount=0.5)

# Measure inference time for pruned model
pruned_time = measure_inference_time(pruned_model, inputs)
print(f"Pruned Model Inference Time: {pruned_time:.6f} seconds per inference")

# Calculate trade-off
speedup_pruned = baseline_time / pruned_time
print(f"Speedup from Pruning: {speedup_pruned:.2f}x")


Pruned Model Inference Time: 1.422209 seconds per inference
Speedup from Pruning: 1.20x


Measure training time for deeper model vs base model

In [2]:
import torch
from torch import nn
import time
from transformers import BertModel, BertTokenizer, BertConfig
import copy

def move_to_device(model, inputs):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
    return model, inputs, device

def create_deeper_model(model, additional_layers=2):
    # Create a deep copy of the model
    deeper_model = copy.deepcopy(model)

    # Get the configuration from the entire model
    config = deeper_model.config

    # Add new layers
    last_layer = deeper_model.encoder.layer[-1]
    for _ in range(additional_layers):
        new_layer = type(last_layer)(config)
        deeper_model.encoder.layer.append(new_layer)

    # Update the config to reflect the new number of layers
    deeper_model.config.num_hidden_layers += additional_layers

    return deeper_model

# Measure training time for baseline and deeper model
def measure_training_time(model, inputs, device, epochs=10):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    # Warm-up run
    for _ in range(5):
        optimizer.zero_grad()
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state
        dummy_output = last_hidden_state.mean()
        loss = dummy_output.sum()
        loss.backward()
        optimizer.step()

    # Actual timed run
    start_time = time.time()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state
        dummy_output = last_hidden_state.mean()
        loss = dummy_output.sum()
        loss.backward()
        optimizer.step()
    end_time = time.time()
    total_time = end_time - start_time
    return total_time

# Main execution
if __name__ == "__main__":
    # Initialize the model and tokenizer
    model = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Prepare a larger input
    text = "Hello, how are you? " * 100  # Repeat the text to create a larger input
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

    # Move model and inputs to the appropriate device
    model, inputs, device = move_to_device(model, inputs)

    # Create a deeper model and move it to the device
    deeper_model = create_deeper_model(model, additional_layers=2)
    deeper_model = deeper_model.to(device)

    # Simulate small training loop
    training_time_baseline = measure_training_time(model, inputs, device)
    training_time_deeper = measure_training_time(deeper_model, inputs, device)

    print(f"Baseline Model Training Time: {training_time_baseline:.2f} seconds")
    print(f"Deeper Model Training Time: {training_time_deeper:.2f} seconds")
    print(f"Time Difference: {training_time_deeper - training_time_baseline:.2f} seconds")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Baseline Model Training Time: 1.20 seconds
Deeper Model Training Time: 1.40 seconds
Time Difference: 0.20 seconds


Evaluate memory usage per batch size


In [None]:
import torch

# Function to evaluate model memory usage
def measure_memory_usage(model, inputs, batch_size):
    torch.cuda.empty_cache()

    # Move the model to CUDA if it's not already there
    model = model.to('cuda')

    # Move input tensors to CUDA and adjust batch size
    inputs = {k: v[:batch_size].to('cuda') for k, v in inputs.items()}

    torch.cuda.reset_peak_memory_stats()
    with torch.no_grad():
        _ = model(**inputs)
    memory_used = torch.cuda.max_memory_allocated() / (1024 ** 2)  # Convert to MB
    return memory_used

# Compare memory usage for different batch sizes
batch_sizes = [8, 16, 32, 64]
memory_usage = {}

# Ensure the model is on CUDA before the loop
model = model.to('cuda')

for batch_size in batch_sizes:
    memory_used = measure_memory_usage(model, inputs, batch_size)
    memory_usage[batch_size] = memory_used
    print(f"Memory used for batch size {batch_size}: {memory_used:.2f} MB")

# Output results
print("Batch Size vs. Memory Usage:", memory_usage)

Memory used for batch size 8: 1805.32 MB
Memory used for batch size 16: 1806.94 MB
Memory used for batch size 32: 1811.57 MB
Memory used for batch size 64: 1819.83 MB
Batch Size vs. Memory Usage: {8: 1805.3173828125, 16: 1806.94384765625, 32: 1811.57177734375, 64: 1819.82763671875}
