In [None]:
# Install the third-party packages imported in the next cell:
# codecarbon (EmissionsTracker), py-cpuinfo (the `cpuinfo` module) and onnxruntime.
# -q keeps pip quiet; -U upgrades to the newest release, so versions are not pinned.
!pip install -qU codecarbon
!pip install -qU py-cpuinfo
!pip install -qU onnxruntime

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import time
import os
import onnxruntime as ort
import subprocess
from tqdm import tqdm
from transformers import (
    BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel,
    AutoTokenizer, AutoModel
)
import torch.nn as nn
from codecarbon import EmissionsTracker
import psutil
import cpuinfo
import platform

In [None]:
# Constants
# Token length handed to the tokenizer as max_length by the inference helpers
# below (together with padding='max_length' and truncation=True).
MAX_LEN = 150
# NOTE(review): the two lists look index-aligned (model identifier -> display
# name); confirm against the cell that iterates over them.
MODEL_TYPES = ["bert-base-uncased", "distilbert-base-uncased", "google/tinybert-6l-768d"]
MODEL_NAMES = ["BERT", "DistilBERT", "TinyBERT"]
# Seed NumPy's and PyTorch's global random number generators.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Path configurations
# benchmark_model() looks for "<model_name>[-Quantized]_model.onnx" in ONNX_DIR
# and "<model_name>[-Quantized]_model.pt" in PT_DIR.
ONNX_DIR = "onnx_checkpoints"
PT_DIR = "pt_checkpoints"
RESULTS_DIR = "benchmark_results"
# run_cpp_inference() invokes this name as "./<CPP_EXECUTABLE>", i.e. relative
# to the current working directory.
CPP_EXECUTABLE = "onnx_inference_cpp"  # Will be created

# Create results directory if it doesn't exist
os.makedirs(RESULTS_DIR, exist_ok=True)

In [None]:
# Load example data (we'll create synthetic test data for benchmarking)
def generate_test_data(n_samples=100, max_length=150, min_length=50):
    """Generate synthetic whitespace-separated texts for benchmarking.

    Every text has the form "word0 word1 ... word{k-1}". The word count k is
    drawn with ``np.random.randint(min_length, max_length)``, so the upper
    bound is exclusive. With the default arguments the draws are identical to
    the previous hard-coded ``randint(50, max_length)``, which keeps seeded
    runs reproducible.

    Args:
        n_samples: Number of texts to generate.
        max_length: Exclusive upper bound on the word count. Must be >= 1.
        min_length: Inclusive lower bound on the word count. It is clamped to
            the range [1, max_length].

    Returns:
        A list of ``n_samples`` strings.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    # Clamp the lower bound so that a small max_length (<= min_length) no
    # longer makes np.random.randint raise "low >= high".
    lower = max(1, min(min_length, max_length))

    texts = []
    for _ in range(n_samples):
        if lower < max_length:
            length = np.random.randint(lower, max_length)
        else:
            # Degenerate range: every text gets exactly max_length words.
            length = max_length
        # Generate placeholder "words" (real vocabulary is not needed for benchmarking)
        texts.append(" ".join(f"word{i}" for i in range(length)))

    return texts

# Model class for PyTorch inference
class BERTClassifier(nn.Module):
    """Transformer encoder followed by dropout and a linear classification head.

    The encoder class is chosen from ``model_name``:
    ``DistilBertModel`` if the name contains 'distilbert', ``AutoModel`` if it
    contains 'tinybert', otherwise ``BertModel``.

    The sub-module names (``bert``, ``dropout``, ``classifier``) are part of
    the checkpoint format read by ``load_state_dict`` and must not be renamed.
    """

    def __init__(self, model_name, num_labels):
        """Load the pretrained encoder and build the classification head.

        Args:
            model_name: Identifier passed to ``from_pretrained``.
            num_labels: Output dimension of the linear head.
        """
        super(BERTClassifier, self).__init__()
        if 'distilbert' in model_name:
            self.bert = DistilBertModel.from_pretrained(model_name)
        elif 'tinybert' in model_name:
            self.bert = AutoModel.from_pretrained(model_name)
        else:
            self.bert = BertModel.from_pretrained(model_name)

        # The head is identical for every encoder type.
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            The output of the linear head applied to the pooled representation.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # The previous check, hasattr(self.bert, 'distilbert'), was never true
        # for a DistilBertModel instance, so DistilBERT fell through to
        # `outputs.pooler_output`, which its output object does not provide.
        # Decide from the output itself instead: use the pooler output when
        # the encoder returns one, else the hidden state of the first token.
        pooled_output = getattr(outputs, 'pooler_output', None)
        if pooled_output is None:
            pooled_output = outputs.last_hidden_state[:, 0]

        x = self.dropout(pooled_output)
        logits = self.classifier(x)
        return logits

# Inference utilities
def get_tokenizer(model_type):
    """Return the pretrained tokenizer matching ``model_type``.

    Names containing 'distilbert' use ``DistilBertTokenizer``, names
    containing 'tinybert' use ``AutoTokenizer``; anything else uses
    ``BertTokenizer``. The substrings are tested in that order.
    """
    special_cases = (
        ('distilbert', DistilBertTokenizer),
        ('tinybert', AutoTokenizer),
    )
    for marker, tokenizer_cls in special_cases:
        if marker in model_type:
            return tokenizer_cls.from_pretrained(model_type)
    return BertTokenizer.from_pretrained(model_type)

def load_pytorch_model(model_path, model_type, num_labels=100):
    """Build a ``BERTClassifier`` and restore its weights from ``model_path``.

    The checkpoint is mapped onto the CPU and the returned model is in
    evaluation mode.
    """
    classifier = BERTClassifier(model_type, num_labels)
    cpu_device = torch.device('cpu')
    state_dict = torch.load(model_path, map_location=cpu_device)
    classifier.load_state_dict(state_dict)
    # nn.Module.eval() returns the module itself.
    return classifier.eval()

# Inference functions
def pytorch_inference(model, tokenizer, texts, batch_size=1):
    """Run ``model`` over ``texts`` in batches and return boolean predictions.

    Each batch is tokenized to MAX_LEN tokens (padded and truncated), passed
    through the model without gradient tracking, and thresholded at
    sigmoid(logit) > 0.5. The result holds one list of booleans per text.
    """
    all_predictions = []

    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]

        encoded = tokenizer(
            chunk,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        with torch.no_grad():
            logits = model(
                input_ids=encoded['input_ids'],
                attention_mask=encoded['attention_mask']
            )

        # Labels are irrelevant for benchmarking; only the thresholded outputs are kept
        probabilities = torch.sigmoid(logits).numpy()
        all_predictions += (probabilities > 0.5).tolist()

    return all_predictions

def onnx_inference(onnx_path, tokenizer, texts, batch_size=1):
    """Run inference using ONNX Runtime.

    A new ``ort.InferenceSession`` is created from ``onnx_path`` on every
    call. Texts are tokenized in batches to MAX_LEN tokens (padded and
    truncated), and the first model output is thresholded at
    sigmoid(logit) > 0.5.

    Graph inputs are bound by name when the tokenizer output has an entry of
    the same name. Otherwise the first graph input receives 'input_ids' and
    the second 'attention_mask', which was the previous purely positional
    behaviour.

    Args:
        onnx_path: Path of the ONNX model file.
        tokenizer: Callable tokenizer returning a mapping of PyTorch tensors.
        texts: List of input strings.
        batch_size: Number of texts per ``session.run`` call.

    Returns:
        A list with one list of booleans per input text.

    Raises:
        ValueError: If a graph input can be matched neither by name nor by
            position.
    """
    session = ort.InferenceSession(onnx_path)
    # The input names never change for a session, so query them once
    # instead of once per batch.
    input_names = [model_input.name for model_input in session.get_inputs()]
    positional_keys = ('input_ids', 'attention_mask')
    results = []

    # Process in batches
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]

        # Tokenize
        inputs = tokenizer(
            batch_texts,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Create the input feed
        ort_inputs = {}
        for position, name in enumerate(input_names):
            if name in inputs:
                key = name
            elif position < len(positional_keys):
                key = positional_keys[position]
            else:
                raise ValueError(
                    f"Cannot map ONNX input '{name}' to a tokenizer output"
                )
            ort_inputs[name] = inputs[key].numpy()

        # Run inference
        outputs = session.run(None, ort_inputs)

        # Process outputs
        logits = outputs[0]
        # exp() overflows to inf for large negative logits; the sigmoid then
        # evaluates to 0, which is the right answer, so only the warning is
        # silenced.
        with np.errstate(over='ignore'):
            predictions = (1 / (1 + np.exp(-logits))) > 0.5
        results.extend(predictions.tolist())

    return results

# Create C++ code for ONNX inference
def create_cpp_inference_code():
    """Write the C++ ONNX Runtime benchmark program and its CMake project.

    Two files are created (or overwritten) in the current working directory:

    * ``onnx_inference.cpp`` - reads a text file holding "batch_size seq_len"
      followed by the input ids and then the attention mask values, runs a
      single ``session.Run`` call, and writes the measured time in
      milliseconds plus one line of 0/1 predictions per sample.
    * ``CMakeLists.txt`` - builds the program as ``onnx_inference_cpp``.

    The generated program expects the model's inputs to be named
    "input_ids" and "attention_mask" and its output to be named "output".

    Fixes compared with the previous generated source:
    the argument check now requires all three arguments that main() reads
    (it used to test ``argc < 2`` and then access argv[2] and argv[3]),
    ``<cmath>`` is included for ``std::exp``, and the input header and stream
    state are validated so a malformed file no longer leads to indexing an
    empty batch.
    """
    cpp_code = """
#include <cmath>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <chrono>
#include <onnxruntime_cxx_api.h>

struct InferenceResult {
    std::vector<std::vector<bool>> predictions;
    double inference_time_ms;
};

InferenceResult run_inference(const std::string& model_path, 
                             const std::vector<std::vector<int64_t>>& input_ids,
                             const std::vector<std::vector<int64_t>>& attention_mask) {
    // Initialize ONNX Runtime
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ONNXInference");
    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(1);
    session_options.SetInterOpNumThreads(1);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);

    // Create session
    Ort::Session session(env, model_path.c_str(), session_options);

    // Get input and output names
    Ort::AllocatorWithDefaultOptions allocator;
    std::vector<const char*> input_names = {"input_ids", "attention_mask"};
    std::vector<const char*> output_names = {"output"};

    // Prepare input tensors (main() guarantees a non-empty batch)
    std::vector<int64_t> input_dims = {static_cast<int64_t>(input_ids.size()), static_cast<int64_t>(input_ids[0].size())};
    
    std::vector<int64_t> input_ids_flattened;
    std::vector<int64_t> attention_mask_flattened;
    
    for (const auto& ids : input_ids) {
        input_ids_flattened.insert(input_ids_flattened.end(), ids.begin(), ids.end());
    }
    
    for (const auto& mask : attention_mask) {
        attention_mask_flattened.insert(attention_mask_flattened.end(), mask.begin(), mask.end());
    }

    Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    
    std::vector<Ort::Value> input_tensors;
    input_tensors.push_back(Ort::Value::CreateTensor<int64_t>(
        memory_info, input_ids_flattened.data(), input_ids_flattened.size(), 
        input_dims.data(), input_dims.size()));
    
    input_tensors.push_back(Ort::Value::CreateTensor<int64_t>(
        memory_info, attention_mask_flattened.data(), attention_mask_flattened.size(), 
        input_dims.data(), input_dims.size()));

    // Run inference
    auto start = std::chrono::high_resolution_clock::now();
    std::vector<Ort::Value> output_tensors = session.Run(
        Ort::RunOptions{nullptr}, 
        input_names.data(), 
        input_tensors.data(), 
        input_tensors.size(), 
        output_names.data(), 
        output_names.size());
    auto end = std::chrono::high_resolution_clock::now();
    
    // Calculate inference time
    std::chrono::duration<double, std::milli> inference_time = end - start;

    // Process outputs
    float* output_data = output_tensors[0].GetTensorMutableData<float>();
    
    int64_t batch_size = input_dims[0];
    int64_t num_labels = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape()[1];
    
    // Convert logits to predictions
    std::vector<std::vector<bool>> predictions(batch_size, std::vector<bool>(num_labels));
    for (int64_t i = 0; i < batch_size; i++) {
        for (int64_t j = 0; j < num_labels; j++) {
            float logit = output_data[i * num_labels + j];
            float sigmoid = 1.0f / (1.0f + std::exp(-logit));
            predictions[i][j] = sigmoid > 0.5f;
        }
    }
    
    return {predictions, inference_time.count()};
}

int main(int argc, char* argv[]) {
    // All three positional arguments are read below, so require all of them.
    if (argc < 4) {
        std::cerr << "Usage: " << argv[0] << " <model_path> <input_file> <output_file>" << std::endl;
        return 1;
    }
    
    std::string model_path = argv[1];
    std::string input_file = argv[2];
    std::string output_file = argv[3];
    
    // Read input data from file
    std::ifstream input(input_file);
    if (!input.is_open()) {
        std::cerr << "Could not open input file: " << input_file << std::endl;
        return 1;
    }
    
    std::vector<std::vector<int64_t>> input_ids;
    std::vector<std::vector<int64_t>> attention_mask;
    
    size_t batch_size = 0, seq_len = 0;
    if (!(input >> batch_size >> seq_len) || batch_size == 0 || seq_len == 0) {
        std::cerr << "Invalid input header in file: " << input_file << std::endl;
        return 1;
    }
    
    for (size_t i = 0; i < batch_size; i++) {
        std::vector<int64_t> ids(seq_len);
        for (size_t j = 0; j < seq_len; j++) {
            input >> ids[j];
        }
        input_ids.push_back(ids);
    }
    
    for (size_t i = 0; i < batch_size; i++) {
        std::vector<int64_t> mask(seq_len);
        for (size_t j = 0; j < seq_len; j++) {
            input >> mask[j];
        }
        attention_mask.push_back(mask);
    }
    
    if (input.fail()) {
        std::cerr << "Input file is truncated or malformed: " << input_file << std::endl;
        return 1;
    }
    
    input.close();
    
    // Run inference
    InferenceResult result = run_inference(model_path, input_ids, attention_mask);
    
    // Write results to output file
    std::ofstream output(output_file);
    if (!output.is_open()) {
        std::cerr << "Could not open output file: " << output_file << std::endl;
        return 1;
    }
    
    output << result.inference_time_ms << std::endl;
    
    for (const auto& batch_preds : result.predictions) {
        for (const auto& pred : batch_preds) {
            output << (pred ? 1 : 0) << " ";
        }
        output << std::endl;
    }
    
    output.close();
    
    return 0;
}
    """
    
    # Write to file
    with open("onnx_inference.cpp", "w") as f:
        f.write(cpp_code)
    
    # Create CMakeLists.txt
    cmake_file = """
cmake_minimum_required(VERSION 3.10)
project(ONNXInference)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Find ONNX Runtime
find_package(onnxruntime REQUIRED)

add_executable(onnx_inference_cpp onnx_inference.cpp)
target_link_libraries(onnx_inference_cpp onnxruntime)
    """
    
    with open("CMakeLists.txt", "w") as f:
        f.write(cmake_file)
    
    print("C++ code and CMakeLists.txt created.")
    print("To compile, you'll need to install ONNX Runtime for C++ and run:")
    print("mkdir build && cd build && cmake .. && make")
    # The notebook runs the binary as ./onnx_inference_cpp from its working
    # directory, whereas the commands above leave it inside build/.
    print("Then copy build/onnx_inference_cpp into the notebook's working directory.")

def prepare_cpp_input_data(tokenizer, texts, filename):
    """Tokenize ``texts`` and dump the tensors to a text file for the C++ runner.

    File layout: a header line "<batch_size> <seq_len>", then one line per
    sample with its input ids, then one line per sample with its attention
    mask. Every value is followed by a single space.

    Returns:
        The tuple ``(batch_size, seq_len)`` taken from the input id array.
    """
    encoded = tokenizer(
        texts,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    ids_matrix = encoded['input_ids'].numpy()
    mask_matrix = encoded['attention_mask'].numpy()
    n_rows, n_cols = ids_matrix.shape

    # Header first, then all id rows, then all mask rows.
    lines = [f"{n_rows} {n_cols}\n"]
    for matrix in (ids_matrix, mask_matrix):
        for row in matrix[:n_rows]:
            lines.append("".join(f"{value} " for value in row[:n_cols]) + "\n")

    with open(filename, 'w') as f:
        f.writelines(lines)

    return n_rows, n_cols

def run_cpp_inference(model_path, input_file, output_file):
    """Run inference using the compiled C++ executable.

    The executable ``./<CPP_EXECUTABLE>`` is invoked with the model path, the
    input file and the output file. On success the output file is parsed: its
    first line is the inference time reported by the executable, and every
    following non-empty line is a row of 0/1 predictions.

    Every failure mode is reported the same way so that callers only need to
    test for ``None``: a non-zero exit status, an executable that cannot be
    started, and an output file that is missing or malformed.

    Returns:
        ``(predictions, inference_time)`` on success, where ``predictions`` is
        a list of lists of booleans, or ``(None, None)`` on any failure.
    """
    cmd = [f"./{CPP_EXECUTABLE}", model_path, input_file, output_file]

    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running C++ inference: {e}")
        print(f"stdout: {e.stdout}")
        print(f"stderr: {e.stderr}")
        return None, None
    except OSError as e:
        # Raised when the executable is missing or cannot be executed.
        print(f"Error running C++ inference: {e}")
        return None, None

    try:
        # Read results
        with open(output_file, 'r') as f:
            inference_time = float(f.readline().strip())
            predictions = [
                [bool(int(p)) for p in line.split()]
                for line in f
                if line.strip()
            ]
    except (OSError, ValueError) as e:
        print(f"Error reading C++ inference output {output_file}: {e}")
        return None, None

    return predictions, inference_time

# System info utilities
def get_system_info():
    """Collect a description of the host and library versions for the report.

    Returns a dict with the keys "os", "cpu", "python_version",
    "pytorch_version", "onnxruntime_version" and "ram" (total memory
    formatted in GB with two decimals).
    """
    total_ram_gb = psutil.virtual_memory().total / (1024**3)
    cpu_brand = cpuinfo.get_cpu_info().get('brand_raw', 'Unknown CPU')

    info = {}
    info["os"] = platform.platform()
    info["cpu"] = cpu_brand
    info["python_version"] = platform.python_version()
    info["pytorch_version"] = torch.__version__
    info["onnxruntime_version"] = ort.__version__
    info["ram"] = f"{total_ram_gb:.2f} GB"
    return info

# Benchmarking functions
def benchmark_model(model_name, model_type, texts, 
                   pt_quantized=False, onnx_quantized=False, 
                   batch_sizes=(1, 4, 16), cpp_enabled=True, n_runs=10):
    """Benchmark a model with PyTorch, ONNX Runtime (Python) and the C++ runner.

    For every batch size each backend is run ``n_runs`` times under a
    CodeCarbon ``EmissionsTracker``. A backend is skipped when its checkpoint
    file (or, for C++, the compiled executable) does not exist.

    Args:
        model_name: Display name; also the stem of the checkpoint file names.
        model_type: Identifier passed to ``get_tokenizer`` and
            ``load_pytorch_model``.
        texts: List of input strings.
        pt_quantized: Use the "-Quantized" PyTorch checkpoint.
        onnx_quantized: Use the "-Quantized" ONNX checkpoint.
        batch_sizes: Iterable of batch sizes to benchmark. It is copied, so
            the caller's object (and the default) is never shared with the
            returned result.
        cpp_enabled: Whether to attempt the C++ benchmark.
        n_runs: Number of repetitions per measurement (previously fixed at 10).

    Returns:
        A dict with "model_name", "quantized", "batch_sizes" (a list) and the
        lists "pytorch", "onnx" and "cpp". Each list holds one record per
        batch size with the keys "batch_size", "inference_time" (seconds per
        repetition), "emissions" (the value returned by ``tracker.stop()``)
        and "memory_usage_mb" (resident set size of this Python process when
        the measurement ended).

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    # Copy so that neither the default nor the caller's list ends up shared
    # with (and mutable through) the returned dict.
    batch_sizes = list(batch_sizes)

    results = {
        "model_name": model_name,
        "quantized": pt_quantized or onnx_quantized,
        "batch_sizes": batch_sizes,
        "pytorch": [],
        "onnx": [],
        "cpp": []
    }
    
    # Get the right paths
    pt_suffix = "-Quantized" if pt_quantized else ""
    onnx_suffix = "-Quantized" if onnx_quantized else ""
    
    pt_path = os.path.join(PT_DIR, f"{model_name}{pt_suffix}_model.pt")
    onnx_path = os.path.join(ONNX_DIR, f"{model_name}{onnx_suffix}_model.onnx")
    
    # Get tokenizer
    tokenizer = get_tokenizer(model_type)
    
    # PyTorch inference
    if os.path.exists(pt_path):
        model = load_pytorch_model(pt_path, model_type)
        
        for batch_size in batch_sizes:
            print(f"Benchmarking {model_name}{pt_suffix} (PyTorch) with batch size {batch_size}...")
            
            elapsed, emissions = _run_tracked(
                f"{model_name}-PT-bs{batch_size}",
                lambda: pytorch_inference(model, tokenizer, texts, batch_size=batch_size),
                n_runs
            )
            results["pytorch"].append(
                _benchmark_record(batch_size, elapsed / n_runs, emissions)
            )
    else:
        print(f"Warning: PyTorch model file {pt_path} not found.")
    
    # ONNX inference
    if os.path.exists(onnx_path):
        for batch_size in batch_sizes:
            print(f"Benchmarking {model_name}{onnx_suffix} (ONNX) with batch size {batch_size}...")
            
            # NOTE: onnx_inference() creates a new InferenceSession on every
            # call, so this timing includes loading the model each repetition.
            elapsed, emissions = _run_tracked(
                f"{model_name}-ONNX-bs{batch_size}",
                lambda: onnx_inference(onnx_path, tokenizer, texts, batch_size=batch_size),
                n_runs
            )
            results["onnx"].append(
                _benchmark_record(batch_size, elapsed / n_runs, emissions)
            )
    else:
        print(f"Warning: ONNX model file {onnx_path} not found.")
    
    # C++ inference (if enabled and compiled)
    if cpp_enabled and os.path.exists(CPP_EXECUTABLE) and os.path.exists(onnx_path):
        for batch_size in batch_sizes:
            print(f"Benchmarking {model_name}{onnx_suffix} (C++) with batch size {batch_size}...")
            
            # Prepare input data for C++
            cpp_input_file = os.path.join(RESULTS_DIR, f"{model_name}_cpp_input.txt")
            cpp_output_file = os.path.join(RESULTS_DIR, f"{model_name}_cpp_output.txt")
            
            # NOTE(review): only the first `batch_size` texts are sent to the
            # C++ runner, and the time used is the one the executable reports
            # for its own run. The PyTorch/ONNX figures above cover all of
            # `texts` including tokenization, so the numbers are not directly
            # comparable.
            subset_texts = texts[:batch_size]
            prepare_cpp_input_data(tokenizer, subset_texts, cpp_input_file)
            
            all_times = []

            def _run_cpp_once():
                _, inference_time = run_cpp_inference(onnx_path, cpp_input_file, cpp_output_file)
                if inference_time is not None:
                    all_times.append(inference_time)

            _, emissions = _run_tracked(
                f"{model_name}-CPP-bs{batch_size}", _run_cpp_once, n_runs
            )
            
            if all_times:
                avg_time_ms = sum(all_times) / len(all_times)
                # Memory usage here is for the Python process, not the C++ executable
                results["cpp"].append(
                    _benchmark_record(batch_size, avg_time_ms / 1000, emissions)  # ms -> seconds
                )
    
    return results


def _run_tracked(project_name, run_once, n_runs):
    """Call ``run_once`` ``n_runs`` times under an EmissionsTracker.

    The clock is started after the tracker is running and read before the
    tracker is stopped, so tracker set-up and tear-down are not counted as
    inference time. The tracker is stopped even when ``run_once`` raises.

    Returns:
        ``(elapsed_seconds, emissions)``, where ``emissions`` is whatever
        ``tracker.stop()`` returned.
    """
    tracker = EmissionsTracker(
        project_name=project_name,
        output_dir=RESULTS_DIR,
        measure_power_secs=1,
        save_to_file=False
    )
    tracker.start()
    start_time = time.perf_counter()
    try:
        # Run inference multiple times for better measurement
        for _ in tqdm(range(n_runs)):
            run_once()
    finally:
        elapsed = time.perf_counter() - start_time
        emissions = tracker.stop()
    return elapsed, emissions


def _benchmark_record(batch_size, inference_time, emissions):
    """Build one result record, sampling the current RSS of this process in MB."""
    memory_usage = psutil.Process().memory_info().rss / (1024 * 1024)  # in MB
    return {
        "batch_size": batch_size,
        "inference_time": inference_time,
        "emissions": emissions,
        "memory_usage_mb": memory_usage
    }

# Visualization functions
def plot_inference_time_comparison(all_results, save_path=None):
    """Plot inference time comparison across models and frameworks.

    Uses the "inference_time" of the batch-size-1 record of every framework.

    Args:
        all_results: List of result dicts as returned by ``benchmark_model``.
        save_path: Optional path; when given the figure is saved there.
    """
    _plot_framework_bars(
        all_results,
        lambda record: record["inference_time"],
        'Inference Time (seconds)',
        'Inference Time Comparison (Batch Size = 1)',
        save_path
    )


def plot_emissions_comparison(all_results, save_path=None):
    """Plot carbon emissions comparison across models and frameworks.

    Uses the "emissions" of the batch-size-1 record of every framework;
    records whose emissions value is None are left out.

    Args:
        all_results: List of result dicts as returned by ``benchmark_model``.
        save_path: Optional path; when given the figure is saved there.
    """
    _plot_framework_bars(
        all_results,
        lambda record: record["emissions"],
        'Carbon Emissions (kg CO2eq)',
        'Carbon Emissions Comparison (Batch Size = 1)',
        save_path
    )


def plot_efficiency_comparison(all_results, save_path=None):
    """Plot efficiency (benchmark repetitions per kg CO2eq) comparison.

    The value plotted is ``10 / emissions`` for the batch-size-1 record, the
    10 being the number of repetitions the benchmark is assumed to have run.
    Records whose emissions are missing (None) or not positive are left out;
    a None value used to raise a TypeError in the ``> 0`` comparison.

    Args:
        all_results: List of result dicts as returned by ``benchmark_model``.
        save_path: Optional path; when given the figure is saved there.
    """
    def _efficiency(record):
        emissions = record["emissions"]
        if emissions is None or not emissions > 0:
            return None
        # Inferences per kg CO2eq (higher is better),
        # assuming 10 inferences in our benchmark
        return 10 / emissions

    _plot_framework_bars(
        all_results,
        _efficiency,
        'Inferences per kg CO2eq',
        'Energy Efficiency Comparison (Batch Size = 1)',
        save_path
    )


# (result key, legend label) for each benchmarked framework, in bar order.
_FRAMEWORKS = (("pytorch", "PyTorch"), ("onnx", "ONNX"), ("cpp", "C++"))


def _plot_framework_bars(all_results, extract_value, ylabel, title,
                         save_path=None, batch_size_to_use=1):
    """Draw one grouped bar chart with a bar per (model, framework) pair.

    ``extract_value`` maps a benchmark record to the number to plot, or to
    None to omit the bar. Frameworks without a record for
    ``batch_size_to_use`` are omitted as well.
    """
    plt.figure(figsize=(14, 8))

    # Prepare data
    model_names = []
    values_by_framework = {fw_name: [] for _, fw_name in _FRAMEWORKS}

    for result in all_results:
        model_name = result["model_name"]
        if result["quantized"]:
            model_name += " (Q)"
        model_names.append(model_name)

        for fw_key, fw_name in _FRAMEWORKS:
            # Find the record for the batch size we want (None if absent)
            record = next(
                (r for r in result[fw_key] if r["batch_size"] == batch_size_to_use),
                None
            )
            values_by_framework[fw_name].append(
                extract_value(record) if record else None
            )

    # Plot
    x = np.arange(len(model_names))
    width = 0.25

    for i, (fw_name, values) in enumerate(values_by_framework.items()):
        # Filter out None values
        valid_indices = [j for j, value in enumerate(values) if value is not None]
        if valid_indices:
            plt.bar(
                x[valid_indices] + (i - 1) * width,
                [values[j] for j in valid_indices],
                width,
                label=fw_name
            )

    plt.xlabel('Model')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.xticks(x, model_names, rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path)

    plt.show()

def plot_batch_size_impact(all_results, metric='inference_time', save_path=None):
    """Plot the impact of batch size on inference time or emissions."""
    plt.figure(figsize=(14, 8))
    
    # Set up the plot
    if metric == 'inference_time':
        plt.ylabel('