In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

def load_bnb_model(model_path: str, quant_bits: int = 4, compute_dtype: str = "bfloat16"):
    """
    Load a pre-quantized BitsAndBytes model and wrap it in a text-generation pipeline.

    Parameters:
        model_path (str): Path to the quantized model directory.
        quant_bits (int): 4 or 8 depending on the quantization type used during save.
        compute_dtype (str): Compute type for 4-bit ("bfloat16" or "float16").
            Only used when ``quant_bits`` is 4.

    Returns:
        pipeline: Hugging Face text-generation pipeline.

    Raises:
        ValueError: If ``quant_bits`` is neither 4 nor 8.
    """
    # Raise explicitly instead of using `assert`: assertions are removed when
    # Python runs with -O, which would let an unsupported value fall through
    # to the 8-bit branch below.
    if quant_bits not in (4, 8):
        raise ValueError("Only 4-bit and 8-bit quantization supported.")

    if quant_bits == 4:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=compute_dtype,
        )
    else:
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        quantization_config=bnb_config,
        trust_remote_code=True,
    )

    # Use the same trust setting as the model load so that a checkpoint which
    # ships custom tokenizer code is handled consistently with its model code.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # No `device_map` here: the model object is already instantiated and placed
    # by `from_pretrained(device_map="auto")` above, and the pipeline-level
    # `device_map` is only forwarded as a model-loading keyword argument.
    return pipeline("text-generation", model=model, tokenizer=tokenizer)


2025-04-22 09:39:36.436412: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745314776.454859  135795 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745314776.460469  135795 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745314776.474838  135795 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745314776.474856  135795 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745314776.474857  135795 computation_placer.cc:177] computation placer alr

[2025-04-22 09:39:40,849] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Directory holding the 8-bit quantized checkpoint to load.
quantized_model_dir = "/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct-quantizised/llama-3.1-8B-8bit"

# Build the text-generation pipeline; quant_bits selects the 8-bit config branch.
pipe = load_bnb_model(quantized_model_dir, quant_bits=8)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [None]:
# Smoke-test the pipeline with one prompt, capped at 100 newly generated tokens.
generations = pipe("What is quantum computing?", max_new_tokens=100)

# Take the text of the first returned generation and show it.
output = generations[0]["generated_text"]
print(output)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What is quantum computing? - Part 2: Quantum algorithms
In the previous post, we discussed the basics of quantum computing and the principles that underlie it. In this post, we'll explore the concept of quantum algorithms and their potential to solve complex problems that are currently unsolvable or inefficiently solvable using classical computers.
Quantum algorithms are designed to take advantage of the unique properties of quantum mechanics, such as superposition, entanglement, and interference. These algorithms are typically designed to solve specific problems, such


: 

In [1]:
from datasets import load_dataset
# Load test data: only the first N_EVAL_SAMPLES examples of the "test" split,
# so that this preview cell stays fast.
N_EVAL_SAMPLES = 3  # raise (e.g. to 100) for a larger evaluation slice

dataset = load_dataset("gigaword", split=f"test[:{N_EVAL_SAMPLES}]")

In [2]:
def sum_prompt(document):
    """
    Build the few-shot headline-generation prompt for `document`.

    The prompt consists of a role instruction, two worked News/Headline
    examples, an instruction to answer without repeating the question, and
    finally the given `document` followed by an open "Headline:" slot.

    Parameters:
        document: The news text to insert into the prompt; it is interpolated
            with standard f-string formatting.

    Returns:
        str: The assembled prompt text.
    """
    role_section = (
        "You are a headline generation assistant. Given a news article, produce a concise and informative headline.\n\n"
    )

    # Two worked examples, each terminated by a blank line.
    examples_section = "".join(
        [
            "Here are some examples:\n",
            "News: Scientists have discovered a new exoplanet that appears to have water on its surface, raising hopes it may be habitable.\n",
            "Headline: New exoplanet may support life\n\n",
            "News: The stock market experienced a significant downturn today, with major indices falling sharply amid economic uncertainty.\n",
            "Headline: Stock market plunges amid economic fears\n\n",
        ]
    )

    task_section = "Generate just the answer, without repeating the question. Now it's your turn:\n\n"

    # The prompt ends on "Headline:" with no trailing newline.
    query_section = f"News: {document}\n" + "Headline:"

    return role_section + examples_section + task_section + query_section


# Preview the fully rendered prompt for the first loaded example.
first_document = dataset[0]["document"]
print(sum_prompt(first_document))

You are a headline generation assistant. Given a news article, produce a concise and informative headline.

Here are some examples:
News: Scientists have discovered a new exoplanet that appears to have water on its surface, raising hopes it may be habitable.
Headline: New exoplanet may support life

News: The stock market experienced a significant downturn today, with major indices falling sharply amid economic uncertainty.
Headline: Stock market plunges amid economic fears

Generate just the answer, without repeating the question. Now it's your turn:

News: japan 's nec corp. and UNK computer corp. of the united states said wednesday they had agreed to join forces in supercomputer sales .
Headline:


In [None]:
from benchmark.benchmark import ModelBenchmark

# Checkpoint under test; its directory name is also the model's label.
model_name = "Teuken-7B-instruct-research-v0.4"
model_path = f"/home/ubuntu/fast_llm_inference/{model_name}"

# Configure the benchmark for the summarization task on the vLLM backend.
bm = ModelBenchmark(backend="vllm", model_path=model_path, task="summarization")

# Run the benchmark on 100 samples and keep the returned results.
results = bm.run(samples=100)

In [2]:
from pathlib import Path

import pandas as pd

# Convert results to DataFrame
results = pd.DataFrame(results)

# Save to CSV. Create the output directory first: `to_csv` does not create
# missing parent directories and would otherwise fail on a fresh checkout.
csv_path = Path("vLLM_results") / f"{model_name}_summarization.csv"
csv_path.parent.mkdir(parents=True, exist_ok=True)
results.to_csv(csv_path, index=False)

# Compute statistics over the numeric columns only
numeric_results = results.select_dtypes(include='number')
averages = numeric_results.mean()
stds = numeric_results.std()

# Combine mean ± std into a formatted string, one entry per numeric column
summary = averages.combine(stds, lambda mean, std: f"{mean:.6f} ± {std:.6f}")

# Print formatted summary
print(f"Statistics (mean ± std) for {model_name}:")
print(summary)

Statistics (mean ± std) for llama-3.1-8B-Instruct:
prompt_length                       4143.850000 ± 2039.573492
TTFT                                      0.069163 ± 0.001972
ATL                                       0.104923 ± 0.031886
GL                                        1.144052 ± 0.138419
TPS                                      11.054800 ± 2.240038
SPS                                       1.015800 ± 0.445172
Memory Usage (MB)                     21341.880000 ± 0.000000
Model Size (MB)                       15327.360256 ± 0.000000
Overhead (MB)                          6014.519744 ± 0.000000
GPU_Utilization (%)                      95.810000 ± 7.636561
Total Energy (Wh)                         0.021285 ± 0.004518
Energy per Token (J/token)                6.274305 ± 1.621546
Energy per Sentence (J/sentence)        72.789601 ± 20.600268
Energy per Second (W)                    66.472600 ± 9.277863
ROUGE-1                                   0.250233 ± 0.133198
ROUGE-L            