In [None]:
import os

# Hugging Face Hub token, needed only for gated checkpoints (the commented-out
# Gemma entry further down passes it as `token=`). It is read from the
# HF_TOKEN environment variable so a real token never has to be saved in the
# notebook; the placeholder is kept as the fallback value.
access_token = os.environ.get("HF_TOKEN", "YOUR ACCESS TOKEN HERE")

In [None]:
# Requirement
!pip install git+https://github.com/huggingface/transformers@main
!pip install causal-conv1d>=1.2.0
!pip install mamba-ssm

Collecting git+https://github.com/huggingface/transformers@main
  Cloning https://github.com/huggingface/transformers (to revision main) to /tmp/pip-req-build-b5b4to2o
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-b5b4to2o
  Resolved https://github.com/huggingface/transformers to commit b32bf85b58260f05da7f3623dca722f9780d2cbc
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


Import the necessary libraries

In [None]:
import torch
import torch.nn.functional as F
from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import time
import sys
import gc
import pandas as pd

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

Parameter-scaling benchmark


In [None]:
# def calculate_perplexity(model, tokenizer, dataloader):
#   model.eval()
#   total_loss = 0
#   with torch.no_grad():
#     for batch in dataloader:
#       input_ids, labels = batch["input_ids"].to(device), batch["labels"]
#       outputs = model(input_ids, labels=labels)
#       total_loss += outputs.loss.item()
#   average_loss = total_loss / len(dataloader)
#   perplexity = torch.exp(torch.tensor(average_loss))
#   return perplexity.item()


In [None]:
def measure_time_and_memory(model, tokenizer, prompt, max_new_tokens, target_device=None):
    """Time a single ``model.generate`` call and measure the CUDA memory it uses.

    Args:
        model: Object exposing ``.to(device)`` and
            ``.generate(input_ids, max_new_tokens=...)``.
        tokenizer: Callable such that
            ``tokenizer(prompt, return_tensors="pt").input_ids`` is a tensor
            that can be moved with ``.to(device)``.
        prompt: Text to generate from.
        max_new_tokens: Forwarded unchanged to ``model.generate``.
        target_device: Device to run on. ``None`` (the default) uses the
            notebook-level ``device``.

    Returns:
        Tuple ``(time_taken, memory_used, peak_memory_used, output_length)``:

        * ``time_taken`` - wall-clock seconds spent in ``generate``, measured
          after all queued CUDA work has finished.
        * ``memory_used`` - bytes allocated after the call minus the baseline
          taken just before it.
        * ``peak_memory_used`` - peak bytes allocated during the call minus
          that same baseline.
        * ``output_length`` - ``outputs.shape[1]``, or 0 if ``generate``
          returned ``None``.

        Both memory figures are 0 when the run is not on a CUDA device.
    """
    run_device = device if target_device is None else target_device
    # Only touch torch.cuda.* when actually running on CUDA; those calls
    # raise on a machine without a usable GPU.
    on_cuda = torch.device(run_device).type == "cuda"

    # Put on GPU
    model.to(run_device)

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(run_device)

    # Collect dead Python objects first so that empty_cache() can actually
    # hand their blocks back.
    gc.collect()
    start_mem = 0
    if on_cuda:
        torch.cuda.empty_cache()
        torch.cuda.synchronize(run_device)  # Wait for all kernels to complete
        # max_memory_allocated() is a running high-water mark; without a reset
        # a later, smaller run would report a peak delta of zero.
        torch.cuda.reset_peak_memory_stats(run_device)
        start_mem = torch.cuda.memory_allocated(run_device)  # Baseline usage

    start_time = time.perf_counter()

    with torch.no_grad():  # Temporarily disable gradient to save memory
        outputs = model.generate(input_ids, max_new_tokens=max_new_tokens)

    # CUDA kernels run asynchronously, so wait for them *before* stopping the
    # clock; otherwise pending GPU work is not counted.
    if on_cuda:
        torch.cuda.synchronize(run_device)
    time_taken = time.perf_counter() - start_time

    if on_cuda:
        end_mem = torch.cuda.memory_allocated(run_device)  # Usage after the call
        end_peak_mem = torch.cuda.max_memory_allocated(run_device)  # Peak since reset
    else:
        end_mem = 0
        end_peak_mem = 0

    memory_used = end_mem - start_mem
    peak_memory_used = end_peak_mem - start_mem
    output_length = outputs.shape[1] if outputs is not None else 0

    return time_taken, memory_used, peak_memory_used, output_length


In [None]:
def run_benchmarks(prompts, max_new_tokens_list, model_info,
                   gpu_name="V100 High RAM",
                   output_path="benchmark_results.csv",
                   warmup_tokens=8):
    """Benchmark every (prompt, max_new_tokens, model) combination.

    Args:
        prompts: Iterable of prompt strings.
        max_new_tokens_list: Iterable of generation budgets to try per prompt.
        model_info: Mapping of display name to a dict holding a
            ``"tokenizer"`` and a ``"model"`` entry.
        gpu_name: Label written to the ``"GPU"`` column of every row.
        output_path: Where the results CSV is written.
        warmup_tokens: Size of one discarded generation run per model before
            anything is timed, so the first measured row does not absorb
            one-time start-up cost. ``0`` disables the warm-up.

    Returns:
        ``pandas.DataFrame`` with one row per combination; the same table is
        written to ``output_path``.
    """
    prompts = list(prompts)
    results = []

    if warmup_tokens and prompts:
        for details in model_info.values():
            measure_time_and_memory(
                details["model"], details["tokenizer"], prompts[0], warmup_tokens
            )

    for prompt in prompts:
        # Length in characters, not tokens ("Output Length" below comes from
        # the generated tensor's shape instead).
        input_length = len(prompt)
        for max_new_tokens in max_new_tokens_list:
            for model_name, details in model_info.items():
                # measure_time_and_memory moves the model to the device itself.
                time_taken, memory_used, peak_memory_used, output_length = measure_time_and_memory(
                    details["model"], details["tokenizer"], prompt, max_new_tokens
                )

                # The model stays referenced by model_info, so it cannot be
                # freed here; only release the temporaries left by this run.
                gc.collect()
                torch.cuda.empty_cache()

                results.append({
                    "GPU": gpu_name,
                    "Model": model_name,
                    "Prompt Length": input_length,
                    "Max New Tokens": max_new_tokens,
                    "Time Taken (s)": time_taken,
                    "Memory Used (MiB)": memory_used / (1024**2),
                    "Peak Memory Used (MiB)": peak_memory_used / (1024**2),
                    "Output Length": output_length,
                })

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False)
    print(f"Benchmark results saved to {output_path}")
    return df

In [None]:
# Models under test, keyed by the label used in the results table.
# Each row is (label, Hub repo id, extra from_pretrained keyword arguments);
# uncomment a row to add that checkpoint to the benchmark.
model_info = {
    label: {
        "tokenizer": AutoTokenizer.from_pretrained(repo_id, **hub_kwargs),
        "model": AutoModelForCausalLM.from_pretrained(repo_id, **hub_kwargs),
    }
    for label, repo_id, hub_kwargs in [
        # ("Mamba 130M", "state-spaces/mamba-130m-hf", {}),
        # ("Mamba 370M", "state-spaces/mamba-370m-hf", {}),
        # ("Mamba 1.4B", "state-spaces/mamba-1.4b-hf", {}),
        ("Mamba 2.8B", "state-spaces/mamba-2.8b-hf", {}),
        # ("Gemma 2B", "google/gemma-2b", {"token": access_token}),
        # ("Phi 2.7B", "microsoft/phi-2", {"trust_remote_code": True}),
    ]
}

# Move every loaded model onto the benchmark device.
for info in model_info.values():
    info["model"] = info["model"].to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Prompts fed to every model under test.
prompts = [
    "Explain the concept of blockchain to a 10-year-old.",
    "Describe a day in the life of an astronaut on Mars in 2040.",
    "Imagine a society where human emotions can be transferred as data. How would this technological advancement affect interpersonal relationships, art, and mental health?",
    "Explain the potential impacts of quantum computing on global cybersecurity infrastructure. Consider both the advancements in cryptography and the potential vulnerabilities introduced",
]

# Generation budgets to sweep: 50, 100, 150 and 200 new tokens.
max_new_tokens_list = list(range(50, 201, 50))

# Run the full sweep; run_benchmarks also saves the table as a CSV.
df = run_benchmarks(
    prompts=prompts,
    max_new_tokens_list=max_new_tokens_list,
    model_info=model_info,
)

print(df)

Benchmark results saved to benchmark_results.csv
              GPU       Model  Prompt Length  Max New Tokens  Time Taken (s)  \
0   V100 High RAM  Mamba 2.8B             51              50        6.721991   
1   V100 High RAM  Mamba 2.8B             51             100        5.748982   
2   V100 High RAM  Mamba 2.8B             51             150        8.839046   
3   V100 High RAM  Mamba 2.8B             51             200       11.703989   
4   V100 High RAM  Mamba 2.8B             59              50        2.836045   
5   V100 High RAM  Mamba 2.8B             59             100        5.807149   
6   V100 High RAM  Mamba 2.8B             59             150        8.686378   
7   V100 High RAM  Mamba 2.8B             59             200       12.032186   
8   V100 High RAM  Mamba 2.8B            167              50        2.783913   
9   V100 High RAM  Mamba 2.8B            167             100        5.947925   
10  V100 High RAM  Mamba 2.8B            167             150        8.7