In [41]:
# 🧪 Launch vLLM server from within a Jupyter notebook

import subprocess
import sys
import time
import urllib.request

# Tunables for the server launch / readiness wait.
VLLM_PORT = 8000
VLLM_STARTUP_TIMEOUT_S = 900  # generous: a first run may have to download the weights
VLLM_POLL_INTERVAL_S = 2

# Re-running this cell would otherwise orphan the previous server, which keeps
# holding the port and GPU memory. Stop it before starting a new one.
_previous_server = globals().get("vllm_process")
if isinstance(_previous_server, subprocess.Popen) and _previous_server.poll() is None:
    _previous_server.terminate()
    _previous_server.wait()

# define your vllm launch command
# sys.executable (rather than whatever "python" is first on PATH) guarantees the
# server runs in the same environment as this kernel.
vllm_command = [
    sys.executable,
    "-m", "vllm.entrypoints.openai.api_server",
    "--model", "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
    "--port", str(VLLM_PORT),
    "--max-model-len", "5000",
    "--dtype", "float16",
]

# run it in the background so the notebook stays responsive
vllm_process = subprocess.Popen(vllm_command)

# Popen returns immediately, long before the model is loaded. Poll the server's
# health endpoint so that later cells only run once requests can be served.
health_url = f"http://localhost:{VLLM_PORT}/health"
startup_deadline = time.monotonic() + VLLM_STARTUP_TIMEOUT_S
while True:
    exit_code = vllm_process.poll()
    if exit_code is not None:
        raise RuntimeError(
            f"vLLM server exited during startup with code {exit_code}"
        )
    try:
        with urllib.request.urlopen(health_url, timeout=VLLM_POLL_INTERVAL_S) as response:
            if response.status == 200:
                break
    except OSError:
        # Connection refused / reset / HTTP error: the server is still starting.
        pass
    if time.monotonic() >= startup_deadline:
        vllm_process.terminate()
        raise TimeoutError(
            f"vLLM server was not healthy after {VLLM_STARTUP_TIMEOUT_S} seconds"
        )
    time.sleep(VLLM_POLL_INTERVAL_S)

print(f"✅ vLLM server started on http://localhost:{VLLM_PORT}")

# optional: keep track of the process
vllm_process


✅ vLLM server started on http://localhost:8000


<Popen: returncode: None args: ['python', '-m', 'vllm.entrypoints.openai.api...>

INFO 07-03 19:35:55 [__init__.py:244] Automatically detected platform cuda.
INFO 07-03 19:35:59 [api_server.py:1287] vLLM API server version 0.9.1
INFO 07-03 19:35:59 [cli_args.py:309] non-default args: {'model': 'RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', 'dtype': 'float16', 'max_model_len': 5000}
INFO 07-03 19:36:08 [config.py:823] This model supports multiple tasks: {'classify', 'embed', 'reward', 'score', 'generate'}. Defaulting to 'generate'.
INFO 07-03 19:36:08 [gptq_marlin.py:145] The model is convertible to gptq_marlin during runtime. Using gptq_marlin kernel.
INFO 07-03 19:36:08 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 07-03 19:36:13 [__init__.py:244] Automatically detected platform cuda.
INFO 07-03 19:36:16 [core.py:455] Waiting for init message from front-end.
INFO 07-03 19:36:16 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', speculative_config=N

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.43it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.43it/s]



INFO 07-03 19:36:18 [default_loader.py:272] Loading weights took 0.86 seconds
INFO 07-03 19:36:19 [gpu_model_runner.py:1624] Model loading took 5.3542 GiB and 1.419753 seconds
INFO 07-03 19:36:27 [backends.py:462] Using cache directory: /home/ec2-user/.cache/vllm/torch_compile_cache/54d077ce5e/rank_0_0 for vLLM's torch.compile
INFO 07-03 19:36:27 [backends.py:472] Dynamo bytecode transform time: 8.06 s
INFO 07-03 19:36:33 [backends.py:135] Directly load the compiled graph(s) for shape None from the cache, took 5.164 s
INFO 07-03 19:36:34 [monitor.py:34] torch.compile takes 8.06 s in total
INFO 07-03 19:36:35 [gpu_worker.py:227] Available KV cache memory: 33.34 GiB
INFO 07-03 19:36:35 [kv_cache_utils.py:715] GPU KV cache size: 273,104 tokens
INFO 07-03 19:36:35 [kv_cache_utils.py:719] Maximum concurrency for 5,000 tokens per request: 54.53x
INFO 07-03 19:36:59 [gpu_model_runner.py:2048] Graph capturing finished in 24 secs, took 0.60 GiB
INFO 07-03 19:36:59 [core.py:171] init engine (pro

INFO:     Started server process [173023]
INFO:     Waiting for application startup.
INFO:     Application startup complete.


In [6]:
# Clone only when the checkout is missing, so re-running this cell does not fail
# with "destination path 'human-eval' already exists".
![ -d human-eval ] || git clone https://github.com/openai/human-eval.git

Cloning into 'human-eval'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 34 (delta 11), reused 4 (delta 4), pack-reused 9 (from 2)[K
Receiving objects: 100% (34/34), 55.87 KiB | 3.99 MiB/s, done.
Resolving deltas: 100% (12/12), done.


In [10]:
# %pip installs into the environment of the running kernel; `!pip3` runs whichever
# pip3 happens to be first on PATH, which may be a different interpreter.
%pip install -e human-eval

Obtaining file:///home/ec2-user/my_notebooks/human-eval
  Preparing metadata (setup.py) ... [?25ldone
Installing collected packages: human-eval
  Attempting uninstall: human-eval
    Found existing installation: human-eval 1.0
    Uninstalling human-eval-1.0:
      Successfully uninstalled human-eval-1.0
  Running setup.py develop for human-eval
Successfully installed human-eval-1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [18]:
from openai import OpenAI

# Point the OpenAI SDK at the local vLLM server's OpenAI-compatible endpoint.
# "EMPTY" is a placeholder key: the server above was launched without --api-key.
client = OpenAI(
    api_key="EMPTY",
    base_url="http://localhost:8000/v1",
)


In [44]:
import json
from human_eval.data import read_problems
from human_eval.execution import check_correctness

# Evaluation settings (greedy decoding, one sample per task => pass@1).
EVAL_MODEL = "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
EVAL_MAX_TOKENS = 512
EVAL_STOP_SEQUENCES = ["\nclass", "\ndef", "\n#"]  # cut generation at the next top-level construct
EVAL_TIMEOUT_S = 3  # per-task execution timeout passed to check_correctness

# NOTE(review): the upstream human-eval README states that the exec(...) call in
# human_eval/execution.py is commented out by default as a safety measure.
# Confirm it has been enabled in this checkout; otherwise the pass rate below
# does not reflect the generated code.

# Load HumanEval prompts
problems = read_problems()

# track results
results = []
# Initialised before the loop so the final summary also works when `problems`
# is empty (previously a NameError followed by a division by zero).
pass_count = 0

for idx, (task_id, task) in enumerate(problems.items(), start=1):
    # Only the text sent to the model is stripped; check_correctness receives
    # the unmodified task.
    prompt = task["prompt"].lstrip()

    response = client.completions.create(
        model=EVAL_MODEL,
        prompt=prompt,
        max_tokens=EVAL_MAX_TOKENS,
        temperature=0.0,
        top_p=1.0,
        stop=EVAL_STOP_SEQUENCES,
    )

    completion = response.choices[0].text

    # validate correctness
    result = check_correctness(problem=task, completion=completion, timeout=EVAL_TIMEOUT_S)

    results.append({
        "task_id": task_id,
        "passed": result["passed"],
        "result": result["result"]
    })

    # Running counter instead of re-summing `results` on every iteration.
    pass_count += bool(result["passed"])
    running_pass_rate = pass_count / idx

    print(
        f"{task_id}: {'✅' if result['passed'] else '❌'} | "
        f"running pass@1: {running_pass_rate:.3f} ({pass_count}/{idx})"
    )

# final summary
total = len(results)
final_pass_rate = pass_count / total if total else 0.0
print(f"\nFinal Pass@1: {final_pass_rate:.3f} ({pass_count}/{total})")


INFO 07-03 19:37:55 [logger.py:43] Received request cmpl-23f63f2360974a688113bbe980e95f67-0: prompt: 'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=['\nclass', '\ndef', '\n#'], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=512, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None), prompt_token_ids: [128000, 1527, 20061, 1179, 1796, 1432, 755, 