# HPML Final Project
# Authors: Rafka Daou, Maria Garmonina, Sarah Korb
# Project 4: Exploring Chunked States in Mamba Style Models


The following code blocks handle importing the necessary modules required for the program to run.





In [None]:
# Install the relevant packages for the project.
# Editable install of whatever package lives in the current working directory.
! pip install -e .

# Additional third-party packages (pynvml is imported in a later cell).
! pip install pynvml rouge_score

# Presumably the distribution that provides the `fms` imports used below -- TODO confirm.
! pip install ibm-fms

In [None]:
import torch
import time
import torch.nn.functional as F
import psutil
import pynvml
import pandas as pd
from torch.profiler import profile, ProfilerActivity

from fms.utils.tokenizers import get_tokenizer
from fms.utils.generation import generate

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


The following code is responsible for loading the IBM Bamba module and configuring key parameters, such as the number of layers the model will use. For the purpose of evaluating throughput and latency, we prioritized a configuration with 4 layers while varying prompt lengths.
To explore different chunked state strategies, we implemented four distinct SSM module variants: default optimized, optimized, optimized with independent chunking, and optimized diagonal-only.
To specify which SSM (Structured State Space Model) module is used during model execution, we manually set the desired module in the statements below.

In [None]:

# Select the SSM implementation, then build a 4-layer Bamba model on the GPU.
import fms.modules.ssm
from fms.modules.default_optimized import SSM as DefaultOptimizedSSM

# Rebind the `SSM` name on the fms.modules.ssm module. This runs before
# get_model() is imported and called so that the model is built afterwards.
# NOTE(review): code that already did `from fms.modules.ssm import SSM` keeps
# its own reference and would not see this rebinding -- confirm with the check
# cell further down.
fms.modules.ssm.SSM = DefaultOptimizedSSM #overwrite the SSM
from fms.models import get_model

model = get_model(
    "hf_configured",
    "ibm-ai-platform/Bamba-9B",
    device_type="cuda",
    data_type=torch.bfloat16,
    nlayers=4, #specify number of layers
)
# NOTE(review): this edits the config object after the model has been built;
# whether it changes the already-constructed layers is not visible here --
# TODO confirm.
model.config.attn_layer_indices = []


# Sanity-check what was actually constructed.
print("Number of layers:", len(model.base_model.layers))
print("Config nlayers:", model.config.nlayers)
print("Attention layers indices:", model.config.attn_layer_indices)



In [None]:
# confirm model parameters
print(model.config)

BambaConfig(src_vocab_size=128256, emb_dim=4096, nheads=32, kvheads=8, head_dim=64, norm_eps=1e-05, nlayers=4, activation_fn='silu', attn_layer_indices=[], max_expected_seq_len=262144, ntk_scaling=False, tie_heads=False, rope_theta=10000.0, p_dropout=0.0, conv_kernel=4, state_size=128, hidden_grow_factor=3.5, mamba_expand=2, mamba_n_heads=128, multiple_of=256, use_bias=False, use_conv_bias=True, n_groups=1, chunk_size=256, linear_config=None, fused_weights=True)


In [None]:
# Confirm which SSM module is running.
# The first block exposing an `ssm` attribute is inspected. `None` is passed
# as the fallback to next() so that a model without any such block produces a
# descriptive error instead of a bare StopIteration.
layer = next(
    (block for block in model.base_model.layers if hasattr(block, "ssm")),
    None,
)
if layer is None:
    raise RuntimeError(
        "No block with an `ssm` attribute found in model.base_model.layers"
    )

# Compare against the class currently bound to fms.modules.ssm.SSM (the name
# rebound in the model-loading cell). The previous check referenced
# `ChunkedSSM`, which is not defined anywhere in this notebook and raises
# NameError on a fresh run.
print(isinstance(layer.ssm, fms.modules.ssm.SSM),
      layer.ssm.__class__.__module__ + "." + layer.ssm.__class__.__name__)

False fms.modules.ssm.SSM


## Loading Data


In [None]:
# Re-install ibm-fms (same command as in the setup cell at the top of the notebook).
! pip install ibm-fms

In [None]:
import json
from collections import Counter
from fms.utils.tokenizers import get_tokenizer

tokenizer = get_tokenizer("ibm-ai-platform/Bamba-9B")

# Explicit UTF-8 so decoding does not depend on the runtime's locale.
with open('/content/drive/My Drive/qa_pairs.json', 'r', encoding="utf-8") as f:
    data = json.load(f)

# Token count of every prompt (the prompt is the first element of each item).
token_lengths = [len(tokenizer.tokenize(item[0])) for item in data]

length_counter = Counter(token_lengths)

sorted_lengths = dict(sorted(length_counter.items()))

# Lengths of interest: every multiple of CHUNK_SIZE up to MAX_BOUNDARY, plus
# any length that falls at most MAX_SHORTFALL tokens short of such a boundary.
CHUNK_SIZE = 256
MAX_BOUNDARY = 8192
MAX_SHORTFALL = 5
chunk_boundaries = set(range(CHUNK_SIZE, MAX_BOUNDARY + 1, CHUNK_SIZE))

print("Prompt Length Statistics:")
for length, count in sorted_lengths.items():
    # True when length, length + 1, ..., length + MAX_SHORTFALL hits a boundary.
    near_boundary = any(
        (length + shortfall) in chunk_boundaries
        for shortfall in range(MAX_SHORTFALL + 1)
    )
    if near_boundary:
        print(f"Length {length}: {count} prompts")

In [None]:
import json
from collections import Counter
from fms.utils.tokenizers import get_tokenizer

# Tokenizer used below to measure prompt and answer lengths in tokens.
tokenizer = get_tokenizer("ibm-ai-platform/Bamba-9B")

In [None]:
# Load the raw QA items from Drive. The encoding is given explicitly so that
# decoding does not depend on the runtime's locale.
with open('/content/drive/My Drive/qa_pairs.json', 'r', encoding="utf-8") as f:
    data = json.load(f)

In [None]:
from collections import defaultdict
import math

# Group the QA items by the token length of their prompt (first element).
# A defaultdict is used so that looking up an absent length yields an empty list.
length_to_items = defaultdict(list)
prompt_token_counts = (len(tokenizer.tokenize(pair[0])) for pair in data)
for n_tokens, pair in zip(prompt_token_counts, data):
    length_to_items[n_tokens].append(pair)

In [None]:
# Sampling plan for the latency/throughput benchmark set.
#   outer key  : target prompt length (a multiple of 256)
#   inner dict : exact source token length -> number of prompts to take
# The counts in every inner dict add up to 10, which the selection cell
# below asserts for each target length.
target_config = {
    256: {256: 10},
    512: {511: 3, 512: 7},
    768: {766: 1, 767: 3, 768: 6},
    1024: {1023: 4, 1024: 6},
    1280: {1275: 2, 1276: 1, 1277: 1, 1278: 2, 1279: 2, 1280: 2},
    1536: {1532: 1, 1533: 1, 1534: 2, 1535: 3, 1536: 3},
    1792: {1785: 1, 1786: 1, 1788: 3, 1789: 1, 1791: 3, 1792: 1},
    2048: {2044: 2, 2045: 5, 2047: 1, 2048: 2},
    2304: {2291: 1, 2292: 2, 2294: 2, 2296: 1, 2299: 1, 2301: 1, 2302: 1, 2303: 1},
    2560: {2549: 2, 2551: 1, 2553: 1, 2554: 1, 2555: 1, 2556: 2, 2557: 1, 2559: 1},
    2816: {2800: 1, 2801: 2, 2802: 1, 2805: 1, 2806: 1, 2810: 2, 2811: 1, 2814: 1},
    3072: {3053: 1, 3055: 1, 3057: 1, 3060: 1, 3061: 1, 3063: 1, 3065: 1, 3067: 1, 3070: 1, 3071: 1},
    3328: {3302: 1, 3307: 1, 3308: 1, 3310: 2, 3316: 1, 3318: 1, 3323: 2, 3327: 1},
    3584: {3534: 1, 3540: 2, 3541: 1, 3543: 1, 3558: 1, 3560: 1, 3563: 1, 3565: 1, 3571: 1}, ##
    3840: {3750: 1, 3752: 1, 3773: 1, 3786: 1, 3787: 1, 3789: 1, 3790: 1, 3792: 1, 3813: 1, 3824: 1},
    4096: {3969: 1, 3971: 1,3984: 1, 3998: 1, 4014: 1, 4016: 1, 4017: 1, 4040: 1, 4051: 1, 4061: 1},
}

In [None]:
# Build the benchmark set: for every target length, take the requested number
# of not-yet-used prompts from each exact source length in target_config.
final_data = []
seen_prompts = set()

for target_len, sub_lengths in target_config.items():
    picked_for_target = 0
    for src_len, take_n in sub_lengths.items():
        candidates = length_to_items[src_len]
        available = len(candidates)
        if available < take_n:
            raise ValueError(f"Not enough prompts of length {src_len} (needed {take_n}, got {available})")

        picked = 0
        for candidate in candidates:
            prompt_text = candidate[0]
            # A prompt is used at most once across the whole selection.
            if prompt_text not in seen_prompts:
                seen_prompts.add(prompt_text)
                record = {
                    "prompt": prompt_text,
                    "answer": candidate[1],
                    "token_len": src_len,
                    "num_chunks": math.ceil(src_len / 256)
                }
                final_data.append(record)
                picked += 1
                picked_for_target += 1
            if picked == take_n:
                break

        # Enough items existed, but too many of them were already used.
        if picked < take_n:
            raise ValueError(f"Could not find enough unique prompts for length {src_len} (needed {take_n}, got {picked})")
    assert picked_for_target == 10, f"Expected 10 prompts for length {target_len}, got {picked_for_target}"

In [None]:
# Persist the selected benchmark prompts next to the notebook.
with open("longer_qa_for_benchmarking.json", "w") as out_file:
    json.dump(final_data, out_file, indent=2)

n_saved = len(final_data)
print(f"Saved {n_saved} prompts to longer_qa_for_benchmarking.json")

In [None]:
# prompts for accuracy/quality benchmarking
# Same layout as the benchmark plan: target length -> {exact source token
# length: number of prompts to take}. The counts add up to 100, which the
# selection cell below asserts.

target_config = {
    512: {498: 4, 499: 3, 500: 11, 501: 4, 502: 3, 503: 9, 504: 6, 505: 10, 506: 12, 507: 12, 508: 7, 509: 5, 510: 7, 511: 7}
}


In [None]:
# Build the accuracy set: take the requested number of not-yet-used prompts
# from each exact source length listed in target_config.
final_data = []
seen_prompts = set()

for target_len, sub_lengths in target_config.items():
    picked_for_target = 0
    for src_len, take_n in sub_lengths.items():
        candidates = length_to_items[src_len]
        available = len(candidates)
        if available < take_n:
            raise ValueError(f"Not enough prompts of length {src_len} (needed {take_n}, got {available})")

        picked = 0
        for candidate in candidates:
            prompt_text = candidate[0]
            # A prompt is used at most once across the whole selection.
            if prompt_text not in seen_prompts:
                seen_prompts.add(prompt_text)
                record = {
                    "prompt": prompt_text,
                    "answer": candidate[1],
                    "token_len": src_len,
                    "num_chunks": math.ceil(src_len / 256)
                }
                final_data.append(record)
                picked += 1
                picked_for_target += 1
            if picked == take_n:
                break

        # Enough items existed, but too many of them were already used.
        if picked < take_n:
            raise ValueError(f"Could not find enough unique prompts for length {src_len} (needed {take_n}, got {picked})")
    assert picked_for_target == 100, f"Expected 100 prompts for length {target_len}, got {picked_for_target}"

In [None]:
# Persist the selected accuracy prompts next to the notebook.
with open("qa_for_accuracy512.json", "w") as out_file:
    json.dump(final_data, out_file, indent=2)

n_saved = len(final_data)
print(f"Saved {n_saved} prompts to qa_for_accuracy512.json")

In [None]:
# Longest reference answer in the selection, measured in tokens
# (0 when final_data is empty).
answer_token_counts = [len(tokenizer.tokenize(entry['answer'])) for entry in final_data]
max_len_tok = max(answer_token_counts, default=0)

print(max_len_tok)

## Benchmarking

This code benchmarks the performance of different chunked state strategies for the Bamba model. It processes a list of prompts by tokenizing the input, running the model to generate output, and collecting detailed performance metrics. These include total latency, first-token and inter-token generation times, throughput (tokens per second), peak memory usage, memory bandwidth, CPU/GPU utilization, and total FLOPs. The profiler also logs the most time-consuming CUDA operations. All results are recorded for analysis, allowing comparison across different SSM module variants and prompt configurations.

In [None]:
import json
# updated file containing 16 prompts of lengths [256, 512, 1024, 2048]
# NOTE(review): the benchmarking loop below reads "prompt", "num_chunks" and
# "token_len" from each entry and prints progress as "/160" -- confirm that
# this path and the "16 prompts" description above are still current.
with open("/content/drive/MyDrive/HPML/HPML Project/qa_pairs_for_benchmarking.json", "r",encoding="utf-8") as f:
  qa_pairs = json.load(f)

In [None]:

def ids_for_prompt(prompt, tokenizer, device):
    """Tokenize `prompt` and return its token ids as a 1-D int64 tensor.

    The ids are exactly what `tokenizer.convert_tokens_to_ids` returns for the
    tokenized prompt; nothing is prepended or appended here. The tensor is
    created on `device`.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(prompt))
    return torch.tensor(token_ids, dtype=torch.long, device=device)

def decode_ids(ids):
    """Turn token ids back into text using the module-level `tokenizer`."""
    return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))

device = torch.device("cuda")
tokenizer = get_tokenizer("ibm-ai-platform/Bamba-9B")

# NVML handle for GPU 0, used to sample GPU utilisation before and after each run.
pynvml.nvmlInit()
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)

model.compile()

records = []
MAX_NEW_TOKENS = 100
log_path = "/content/drive/MyDrive/DefaultOptimizedSSM.txt"

# Specify the number of prompts you want to execute on by changing this slice.
# The progress line printed per prompt reports against the size of this
# selection rather than a hard-coded total.
selected_pairs = qa_pairs[:1]
num_selected = len(selected_pairs)

with open(log_path, "a", encoding="utf-8") as log_file:
    for idx, item in enumerate(selected_pairs, start=1):
        inputs = ids_for_prompt(item["prompt"], tokenizer, device)

        # system stats before
        cpu0 = psutil.cpu_percent(None)
        io0  = psutil.cpu_times_percent(None).iowait
        gpu0 = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle).gpu
        torch.cuda.reset_peak_memory_stats()

        # profile the generate step
        # NOTE: the wall-clock window below sits inside the profiler context,
        # so total_time is measured with the profiler active.
        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            record_shapes=True,
            profile_memory=True,
            with_flops=True,
        ) as prof:
            # Synchronize on both sides so queued GPU work is not left out of
            # (or carried into) the timed window.
            torch.cuda.synchronize()
            t_start = time.time()

            out_ids, times = generate(
                model,
                inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                use_cache=False,
                timing="per-token",
            )

            torch.cuda.synchronize()
            t_end = time.time()

        # system stats after
        cpu1 = psutil.cpu_percent(None)
        io1  = psutil.cpu_times_percent(None).iowait
        gpu1 = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle).gpu
        peak_mem = torch.cuda.max_memory_allocated() / 1024**2

        # derive metrics
        t_first     = times[0]
        inter_times = times[1:]
        # With a single generated token there are no inter-token gaps to
        # average; report NaN instead of dividing by zero.
        if len(inter_times) > 0:
            t_mean  = sum(inter_times) / len(inter_times)
        else:
            t_mean  = float("nan")
        total_time  = t_end - t_start
        throughput  = MAX_NEW_TOKENS / total_time
        # Peak allocated MB divided by wall time (a derived figure, not a
        # hardware bandwidth measurement).
        mem_bw      = peak_mem / total_time
        # Aggregate the profiler events once and reuse them for both metrics.
        op_averages = prof.key_averages()
        total_flops = sum(evt.flops for evt in op_averages if hasattr(evt, "flops"))
        top_ops     = op_averages.table(sort_by="self_cuda_time_total", row_limit=5)

        rec = {
            "id": idx,
            "total_latency_s": total_time,
            "first_token_s": t_first,
            "mean_inter_token_s": t_mean,
            "throughput_tok_s": throughput,
            "peak_mem_MB": peak_mem,
            "mem_bw_MBps": mem_bw,
            "cpu_start_%": cpu0, "cpu_end_%": cpu1,
            "gpu_start_%": gpu0, "gpu_end_%": gpu1,
            "io_wait_diff_%": io1 - io0,
            "total_flops": total_flops,
            "profiler_top_ops": top_ops,
            "output": decode_ids(out_ids),
            "num_chunks": item["num_chunks"],
            "token_len": item["token_len"]
        }

        # Append the record to the text log immediately; the CSV is only
        # written once the whole loop has finished.
        log_file.write(f"{rec}\n")
        records.append(rec)

        print(
            f"{idx}/{num_selected} | tot={total_time:.3f}s "
            f"| first={t_first:.3f}s | inter={t_mean:.4f}s | thr={throughput:.1f} tok/s"
        )

df = pd.DataFrame(records)
df.to_csv("/content/drive/MyDrive/DefaultOptimizedSSM.csv", index=False)

# Accuracy Benchmarking

This code performs benchmarking for evaluating the accuracy of different chunked state strategies in the Bamba model. It processes a list of QA pairs by tokenizing the prompts, generating only the predicted answer portion (based on reference answer length), and decoding the generated output. The results—including prompt, reference answer, and model prediction—are logged to a TSV file for later analysis. This setup isolates the model’s generative accuracy, allowing precise comparisons across chunking strategies while controlling for output length.


In [None]:
import csv
import os

# Device, tokenizer and compiled model used by the accuracy run below.
device = torch.device("cuda")
tokenizer = get_tokenizer("ibm-ai-platform/Bamba-9B")
# NOTE(review): model.compile() is also called in the benchmarking cell
# earlier in the notebook; confirm a second call is intended when both cells
# run in the same session.
model.compile()

def ids_for_prompt(prompt: str, tokenizer, device):
    """Tokenize `prompt` into a 1-D int64 tensor of token ids on `device`.

    The tokenizer's BOS id is placed in front of the prompt ids, but only when
    it differs from the tokenizer's EOS id.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(prompt))
    bos_id = tokenizer.bos_token_id
    if bos_id != tokenizer.eos_token_id:
        token_ids = [bos_id] + token_ids
    return torch.tensor(token_ids, dtype=torch.long, device=device)

def decode_ids(ids: torch.Tensor):
    """Decode a 1-D tensor of token ids to text with the module-level `tokenizer`."""
    id_list = ids.tolist()
    return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(id_list))


# CHANGE LOG PATH NAME
log_path = "/content/drive/MyDrive/predictions_chunkingtype.tsv"
# Only a file that does not exist yet gets the header row; later runs append.
needs_header = not os.path.exists(log_path)
with open(log_path, "a", newline="", encoding="utf-8") as tsv_file:
    writer = csv.writer(tsv_file, delimiter="\t")
    if needs_header:
        writer.writerow(["id", "prompt", "reference", "prediction"])

    for idx, item in enumerate(qa_pairs, start=1):
        prompt = item["prompt"]
        reference = item["answer"]

        # tokenize + tensorize
        inputs = ids_for_prompt(prompt, tokenizer, device)
        prompt_len = inputs.size(0)

        # Generate only as many tokens as the reference answer has
        # (or use 904 on 256 token prompts and 918 on 512 token prompts).
        answer_budget = len(tokenizer.tokenize(reference))
        out_ids = generate(
            model,
            inputs,
            max_new_tokens=answer_budget,
            use_cache=False,
            timing="",
            eos_token_id=tokenizer.eos_token_id,
        )

        # Drop the first prompt_len ids and decode what follows them.
        output_text = decode_ids(out_ids[prompt_len:])

        writer.writerow([idx, prompt, reference, output_text])

        print(f"{idx:3d}: {output_text[:60]}…")

## Accuracy Benchmarking

This section prepares the prediction results for accuracy evaluation using GPTScore, a reference-based automatic metric that leverages GPT models to assess the quality of generated text. GPTScore evaluates how well the system’s predicted answers (summaries) align with human-written references. The code loads the predictions TSV, formats each record into the expected JSON schema, and saves it to the directory. Each entry includes the prompt, reference, prediction, and metadata such as system name and evaluation aspect.

In [None]:
# using gpt scorer [after benchmarking]
# Clone the GPTScore repository and make it the working directory for the
# cells below.
# NOTE(review): the recorded output shows the working directory ending in
# GPTScore/GPTScore, i.e. this cell was run from inside an earlier clone --
# confirm the intended location before relying on relative paths.

!git clone https://github.com/jinlanfu/GPTScore.git
%cd GPTScore

Cloning into 'GPTScore'...
remote: Enumerating objects: 50, done.
remote: Counting objects: 100% (50/50), done.
remote: Compressing objects: 100% (35/35), done.
remote: Total 50 (delta 13), reused 44 (delta 10), pack-reused 0 (from 0)
Receiving objects: 100% (50/50), 851.74 KiB | 10.26 MiB/s, done.
Resolving deltas: 100% (13/13), done.
/content/foundation-model-stack/GPTScore/GPTScore


In [None]:
# then put the predictions tsv into:

import os, json, pandas as pd

# file path needs to be updated
df = pd.read_csv("/content/drive/MyDrive/predictions_chunkingtype.tsv", sep="\t")

# build the JSON structure: one entry per prediction row, all tagged with the
# same system name and the "quality" aspect.
records = [
    {
        "src": row["prompt"],
        "ref_summ": row["reference"],
        "sys_summ": row["prediction"],
        "sys_name": "chunkingtype",
        "aspect": "quality",
        "polarity": "positive"
    }
    for _, row in df.iterrows()
]

quality_definition = "Convert the following text into another expression that is fluent and grammatically correct:"
demo = {
    "demo": {"quality": records},
    "asp_definition": {"quality": quality_definition},
}

# write it where the CLI expects it
# (out_dir is relative to the current working directory)
out_dir = "GPTScore/datas/chunkingtype"
os.makedirs(out_dir, exist_ok=True)
demo_path = os.path.join(out_dir, "demo.json")
with open(demo_path, "w") as f:
    json.dump(demo, f, indent=2)

In [None]:
# Run score_d2t.py from the current working directory on the "chunkingtype"
# dataset for the "quality" aspect, with demos and instructions switched off.
! python score_d2t.py \
  --dataname chunkingtype \
  --use_demo False \
  --use_ist False \
  --gpt3_score True \
  --gpt3model curie \
  --out_dir_name results_chunkingtype \
  --aspect quality

## Plotting Benchmarking Results