In [None]:
from datasets import load_dataset

# QA benchmark subsets of LongBench that this notebook works with
qa_datasets = [
    "narrativeqa",
    "qasper",
    "multifieldqa_en",
    "multifieldqa_zh",
    "hotpotqa",
    "2wikimqa",
    "musique",
    "dureader",
    "triviaqa",
]

# Map every subset name to its 'test' split (the only split requested here)
qa_splits = {
    subset: load_dataset("THUDM/LongBench", subset, split="test")
    for subset in qa_datasets
}


In [None]:
# For each loaded subset, report its size and the field names of its first record
for subset, split in qa_splits.items():
    first_record_keys = list(split[0].keys())
    print(f"{subset} → {len(split)} examples, first record keys: {first_record_keys}")

In [None]:
qa_splits["narrativeqa"][0]  # Example to show the structure of the first record

In [None]:
from collections import defaultdict
import csv
import os

# 3) Token-length buckets as (label, lower bound, upper bound); both bounds are inclusive.
#    3072 appears in both the "3k" and the "4k" range; the first matching entry
#    wins, so a length of exactly 3072 is labelled "3k".
bucket_specs = [
    ("3k", 0, 3072),
    ("4k", 3072, 4096),
    ("8k", 4097, 8192),
    ("16k", 8193, 16384),
    ("32k", 16385, 32768),
]


def assign_bucket(length: int) -> str:
    """Return the label of the first bucket whose inclusive range contains `length`.

    Buckets are tried in the order they appear in `bucket_specs`. An empty
    string is returned when `length` falls outside every range.
    """
    matching_labels = (
        label for label, lower, upper in bucket_specs if lower <= length <= upper
    )
    return next(matching_labels, "")

# 4) Collect up to MAX_PER_BUCKET English examples per bucket.
#    Buckets are filled in the iteration order of `qa_splits`, so subsets that
#    come first get the first chance to contribute examples.
buckets = defaultdict(list)
MAX_PER_BUCKET = 100


def _first_answer(raw_answers):
    """Return the first reference answer from `raw_answers`.

    A non-empty list yields its first element. An empty list yields "" so that
    no list object reaches the CSV `answer` column (the csv module would write
    it as the text "[]"). Any non-list value is returned unchanged.
    """
    if isinstance(raw_answers, list):
        return raw_answers[0] if raw_answers else ""
    return raw_answers


def _all_buckets_full() -> bool:
    """Return True once every bucket in `bucket_specs` holds MAX_PER_BUCKET examples."""
    return all(len(buckets[label]) >= MAX_PER_BUCKET for label, _, _ in bucket_specs)


for ds_name, ds in qa_splits.items():
    # Nothing left to collect: skip the remaining subsets entirely.
    if _all_buckets_full():
        break

    for example in ds:
        # Only keep English examples
        if example.get("language") != "en":
            continue

        # Examples without a recorded length cannot be bucketed
        length = example.get("length", None)
        if length is None:
            continue

        # An empty label means the length is outside every bucket range
        bucket_label = assign_bucket(length)
        if not bucket_label:
            continue

        # This bucket already reached its quota; other buckets may still need examples
        if len(buckets[bucket_label]) >= MAX_PER_BUCKET:
            continue

        buckets[bucket_label].append({
            "context":  example["context"],
            "question": example["input"],
            "answer":   _first_answer(example.get("answers", [])),
            "length":   length,
            "dataset":  ds_name
        })

        # Stop scanning this subset as soon as every bucket is full
        if _all_buckets_full():
            break

# 5) Verify counts
for label, _, _ in bucket_specs:
    print(f"Bucket '{label}' has {len(buckets[label])} English examples (target: {MAX_PER_BUCKET})")


# 6) Merge every bucket into one CSV file; each row is tagged with its bucket
#    label in a 'context_range' column.

# Create the output directory when it is missing
os.makedirs("bucketed_qas_csv", exist_ok=True)

# Column order of the CSV; 'context_range' follows the per-example fields
fieldnames = ["context", "question", "answer", "length", "dataset", "context_range"]

# Flatten the buckets in bucket_specs order into one list of row dicts
all_rows = [
    {
        "context": record["context"],
        "question": record["question"],
        "answer": record["answer"],
        "length": record["length"],
        "dataset": record["dataset"],
        "context_range": range_label,
    }
    for range_label, _, _ in bucket_specs
    for record in buckets[range_label]
]

# Header first, then every row, into a single file
out_path = "bucketed_qas_csv/longbench_all_buckets_100.csv"
with open(out_path, "w", encoding="utf-8", newline="") as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    csv_writer.writeheader()
    csv_writer.writerows(all_rows)

print(f"Done. Saved merged CSV with 'context_range' column to '{out_path}'.")


In [None]:
import pandas as pd

# Load the merged bucket CSV into a DataFrame.
# NOTE(review): this absolute path differs from the relative path the CSV is
# written to earlier ("bucketed_qas_csv/longbench_all_buckets_100.csv") —
# presumably the file was moved by hand; confirm before a Restart & Run All.
data = pd.read_csv("/home/ubuntu/fast_llm_inference/long_context_data/longbench_all_buckets_100.csv")

In [None]:
# Question text of the row labelled 499, fetched with a single .loc lookup
data.loc[499, "question"]

In [None]:
# Reference answer of the row labelled 499, fetched with a single .loc lookup
data.loc[499, "answer"]

In [None]:
from huggingface_hub import Repository
import shutil
import os

# 1) Define local paths and HF repo
LOCAL_CSV_PATH = "/home/ubuntu/fast_llm_inference/long_context_data/longbench_all_buckets_100.csv"
HF_REPO_ID     = "slinusc/qa_increasing_context_length"

# 2) Clone the existing dataset repo into a scratch folder.
#    Any previous clone is removed first so the working tree starts clean.
# NOTE(review): `Repository` is deprecated in recent huggingface_hub releases in
# favour of the HTTP-based `HfApi` upload methods — consider migrating.
LOCAL_REPO_DIR = "./hf_qa_dataset_repo"
if os.path.isdir(LOCAL_REPO_DIR):
    shutil.rmtree(LOCAL_REPO_DIR)

repo = Repository(
    local_dir=LOCAL_REPO_DIR,
    clone_from=HF_REPO_ID,
    repo_type="dataset"
)

# 3) Copy the CSV into the cloned repo directory
dest_path = os.path.join(LOCAL_REPO_DIR, "longbench_all_buckets_100.csv")
shutil.copyfile(LOCAL_CSV_PATH, dest_path)

# 4) Overwrite README.md with a minimal description of the CSV.
#    The bucket labels listed here must match the `context_range` values that
#    are actually written to the CSV (3k, 4k, 8k, 16k, 32k).
readme_path = os.path.join(LOCAL_REPO_DIR, "README.md")
with open(readme_path, "w", encoding="utf-8") as f:
    f.write(
        "# QA Increasing Context Length\n\n"
        "This dataset contains a single CSV (`longbench_all_buckets_100.csv`) with QA examples\n"
        "bucketed by context length (3k, 4k, 8k, 16k, 32k). Each row has:\n"
        "- `context`\n"
        "- `question`\n"
        "- `answer`\n"
        "- `length` (token count)\n"
        "- `dataset` (LongBench subset)\n"
        "- `context_range` (3k/4k/8k/16k/32k)\n"
    )

# 5) Add, commit, and push changes to the HF Hub
repo.git_add(auto_lfs_track=True)
repo.git_commit("Update README to use 'context_range' and add longbench_all_buckets_100.csv")
repo.git_push()

print("Successfully pushed updated CSV and README to:", HF_REPO_ID)

In [None]:
from datasets import load_dataset

# Load the published dataset from the Hugging Face Hub by its dataset ID.
# NOTE(review): this ID differs from the HF_REPO_ID used for the upload
# ("slinusc/qa_increasing_context_length") — presumably the repo was renamed; confirm.
ds = load_dataset("slinusc/ContextStretchQA")

In [None]:
# Inspect the `features` attribute of the 'train' split
ds['train'].features

In [None]:
# Convert the 'train' split to a DataFrame through the datasets API rather than
# having pandas iterate the split and build one Python dict per row.
ds["train"].to_pandas()

In [None]:
from benchmark.tasks.qa import QATask

# Instantiate QATask with its default arguments (no dataset is passed in here)
qa_task = QATask()

# Call generate_prompts(10) and display the first element of what it returns
# (10 is presumably the number of samples — confirm against QATask)
qa_task.generate_prompts(10)[0]

In [None]:
from benchmark.tasks.long_context import LongContextQATask

# Instantiate the long-context QA task with its default arguments
long_qa_task = LongContextQATask()

In [None]:
# generate_prompts returns three values, unpacked here. Going by the names they
# are presumably prompts, reference answers and context-range labels — TODO confirm.
prompts, refs, crs  = long_qa_task.generate_prompts(num_samples_per_level=10)

In [None]:
# Display the third value returned by generate_prompts
crs

In [1]:
from benchmark.benchmark import ModelBenchmark

# Configure a benchmark for Mistral-7B-Instruct-v0.3 on the "mii" backend with
# verbose output turned off
bm = ModelBenchmark(
    backend="mii",
    model_path="mistralai/Mistral-7B-Instruct-v0.3",
    model_name="Mistral-7B-Instruct-v0.3",
    verbose=False
)

In [2]:
# Alternative invocation kept for reference (SQL task, server scenario):
# run_report, detailed_report = bm.run(task="sql", scenario="server", run_time=60, concurrent_users=10, requests_per_user_per_min=10)

# Run the long-context QA task; bm.run returns two values, kept as the run
# report and the detailed report
run_report, detailed_report = bm.run(task="long_context_qa", scenario="long_context", samples=100)

In [3]:
import pandas as pd

# Load a previously saved run report (the file name indicates the sglang backend)
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
pd.read_csv("/home/ubuntu/fast_llm_inference/results_benchmark/run_report/sglang_long_context_qa_run_report.csv")

Unnamed: 0,model_name,model_size_mb,task,scenario,backend,startup,ttft_sec,coldstart,num_queries_per_context,total_generation_time_s,...,avg_tokens_generated,avg_sentences_generated,avg_ATL,avg_GL,avg_TPS,avg_SPS,avg_energy_per_token,avg_energy_per_sentence,avg_exact_match,avg_F1_score
0,Mistral-7B-Instruct-v0.3,0,long_context_qa,long_context,sglang,34.0776,0.1801,34.2578,100,1614.200698,...,25.348,1.086,0.623155,3.228401,114.82412,2.01868,44.560479,216.584001,0.128,0.293496


In [4]:
# Display the run report returned by bm.run
run_report

Unnamed: 0,model_name,model_size_mb,task,scenario,backend,startup,ttft_sec,coldstart,num_queries_per_context,total_generation_time_s,...,avg_tokens_generated,avg_sentences_generated,avg_ATL,avg_GL,avg_TPS,avg_SPS,avg_energy_per_token,avg_energy_per_sentence,avg_exact_match,avg_F1_score
0,Mistral-7B-Instruct-v0.3,0,long_context_qa,long_context,mii,226.5272,0.1376,226.6647,100,5704.402642,...,19.714,1.208,1.985272,11.408805,2.67248,0.3456,138.493149,773.620664,0.066,0.187467


In [5]:
import pandas as pd

from pathlib import Path

# Root directory of the benchmark result files (machine-specific absolute path)
RESULTS_DIR = Path("/home/ubuntu/fast_llm_inference/results_benchmark")
run_report_path = RESULTS_DIR / "run_report" / "mii2_long_context_qa_run_report.csv"
detailed_report_path = RESULTS_DIR / "details" / "mii2_long_context_qa_detailed_report.csv"

# Save the run report to a CSV file, creating the target directory if it is missing
run_report_df = pd.DataFrame(run_report)
run_report_path.parent.mkdir(parents=True, exist_ok=True)
run_report_df.to_csv(run_report_path, index=False)

# Save the detailed report to a CSV file, creating the target directory if it is missing
detailed_report_df = pd.DataFrame(detailed_report)
detailed_report_path.parent.mkdir(parents=True, exist_ok=True)
detailed_report_df.to_csv(detailed_report_path, index=False)

In [6]:
# Display the detailed report returned by bm.run
detailed_report

Unnamed: 0,context_range,length,successful,prompt,generated_answer,reference_answer,generation_time,tokens_generated,sentences_generated,ATL,GL,TPS,SPS,energy_per_token,energy_per_sentence,exact_match,F1_score
0,3k,3005,True,### SYSTEM\nYou are a question-answering assis...,unanswerable,3606,1.833948,3,1,0.611316,1.833948,1.64,0.55,42.645586,127.936757,0,0.000000
1,3k,2385,True,### SYSTEM\nYou are a question-answering assis...,"BIBREF3, BIBREF4",varied from Maximum Entropy Classifiers (BIBRE...,1.844809,9,1,0.204979,1.844809,4.88,0.54,14.299378,128.694402,0,0.057143
2,3k,2413,True,### SYSTEM\nYou are a question-answering assis...,English,English,1.284135,1,1,1.284135,1.284135,0.78,0.78,89.581618,89.581618,1,1.000000
3,3k,2285,True,### SYSTEM\nYou are a question-answering assis...,C2H,The focus of the study was on the reactive rad...,1.784524,3,1,0.594841,1.784524,1.68,0.56,41.496310,124.488929,0,0.200000
4,3k,1634,True,### SYSTEM\nYou are a question-answering assis...,Yes,Yes,0.949298,1,1,0.949298,0.949298,1.05,1.05,66.223308,66.223308,1,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,32k,22936,False,### SYSTEM\nYou are a question-answering assis...,Internal Server Error,Ruth Honeywill,1.962817,3,1,0.654272,1.962817,1.53,0.51,45.642245,136.926735,0,0.000000
496,32k,17670,False,### SYSTEM\nYou are a question-answering assis...,Internal Server Error,She made a copy of the tape and gives it to he...,1.797285,3,1,0.599095,1.797285,1.67,0.56,41.793038,125.379115,0,0.000000
497,32k,27912,False,### SYSTEM\nYou are a question-answering assis...,Internal Server Error,The nephew of Baron Frederick storms the castl...,1.863499,3,1,0.621166,1.863499,1.61,0.54,43.332743,129.998229,0,0.000000
498,32k,25259,False,### SYSTEM\nYou are a question-answering assis...,Internal Server Error,key to the city,2.005213,3,1,0.668404,2.005213,1.50,0.50,46.628094,139.884281,0,0.000000
