<a href="https://colab.research.google.com/github/paulkroe/minireason/blob/main/colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Running Small Experiments Using Colab

In [2]:
!git clone https://github.com/paulkroe/minireason.git
%cd minireason
!ls

Cloning into 'minireason'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 14 (delta 1), reused 14 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (14/14), 4.33 KiB | 4.33 MiB/s, done.
Resolving deltas: 100% (1/1), done.
/content/minireason
colab.ipynb  datasets  README.md  requirements.txt  sampling  utils


In [3]:
# Install uv first; every later install in this cell goes through `uv pip`.
# `--system` targets the runtime's own Python rather than a virtualenv
# (the captured output reports the environment at /usr).
!pip install uv
# Reinstall sgl-kernel by itself, leaving its dependencies alone (--no-deps).
!uv pip install sgl-kernel --force-reinstall --no-deps --system
# Project requirements from the cloned repository (run from the repo root).
!uv pip install -r requirements.txt --system
# sglang with all extras; --find-links adds the flashinfer wheel index whose
# URL path names CUDA 12.4 / torch 2.4.
# NOTE(review): nothing here is version-pinned, so re-runs may resolve newer releases.
!uv pip install "sglang[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer --system

Collecting uv
  Downloading uv-0.6.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.6.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.6.0
[2mUsing Python 3.11.11 environment at: /usr[0m
[2K[2mResolved [1m1 package[0m [2min 265ms[0m[0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙

In [4]:
# Download GSM8K and save its train and test splits.
# A later cell reads the test split from datasets/gsm8k_test.json
# (presumably written by this script — confirm in download_gsm8k.py).
!python3 datasets/download_gsm8k.py

README.md: 100% 7.94k/7.94k [00:00<00:00, 37.0MB/s]
train-00000-of-00001.parquet: 100% 2.31M/2.31M [00:00<00:00, 55.2MB/s]
test-00000-of-00001.parquet: 100% 419k/419k [00:00<00:00, 146MB/s]
Generating train split: 100% 7473/7473 [00:00<00:00, 192727.44 examples/s]
Generating test split: 100% 1319/1319 [00:00<00:00, 308842.01 examples/s]
Saved train split with 7473 samples.
Saved test split with 1319 samples.


In [5]:
import sglang as sgl
from google.colab import userdata
from huggingface_hub import login
# Log in to the Hugging Face Hub with the value stored under the Colab
# secret named HF_TOKEN, so the token itself never appears in the notebook.
login(userdata.get('HF_TOKEN'))

In [6]:
# Create the sglang engine that every generation call below goes through.
# The captured output shows the model's config and tokenizer files being
# downloaded when this cell first runs.
llm = sgl.Engine(model_path="NousResearch/Meta-Llama-3.1-8B-Instruct")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

INFO 02-16 05:47:57 __init__.py:190] Automatically detected platform cuda.


2025-02-16 05:48:01,819 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [32]:
import asyncio
import sglang as sgl
import json

async def async_evaluate(questions, answers):
    """Placeholder evaluator that labels each answer by its 1-based position.

    Args:
        questions: Accepted for interface compatibility; not used here.
        answers: Iterable of answers to label.

    Returns:
        A dict mapping ``"answer_1"``, ``"answer_2"``, ... to the items of
        ``answers`` in iteration order.
    """
    # Short pause standing in for real (asynchronous) evaluation work.
    await asyncio.sleep(0.1)
    labelled = {}
    for position, answer in enumerate(answers, start=1):
        labelled[f"answer_{position}"] = answer
    return labelled


async def run_batch(llm, questions, sampling_params, semaphore, results_queue):
    """Generate completions for one batch of prompts and queue them for evaluation.

    Args:
        llm: Object exposing ``async_generate(prompts, sampling_params)``.
        questions: List of prompt strings that make up this batch.
        sampling_params: Sampling options forwarded unchanged to ``llm``.
        semaphore: ``asyncio.Semaphore`` held while the batch is generated and
            queued, bounding how many batches are in flight at once.
        results_queue: ``asyncio.Queue`` that receives one list per batch, with
            one ``{"question": ..., "answers": [...]}`` dict per prompt.

    Raises:
        ValueError: If the number of completions returned is not a positive
            whole multiple of the number of prompts.
    """
    async with semaphore:
        answers = await llm.async_generate(questions, sampling_params)
        for answer in answers:
            print("====================")
            print(answer)
            print("====================")

        # NOTE(review): assumes the engine returns a flat list in which each
        # prompt's samples are adjacent (prompt-major order), as the captured
        # notebook output suggests for n=2 — confirm against the sglang version.
        # Indexing answers[i] directly would pair prompt i with the i-th
        # completion overall, which belongs to an earlier prompt when n > 1.
        if questions:
            group_size, leftover = divmod(len(answers), len(questions))
            if leftover or group_size == 0:
                raise ValueError(
                    f"expected a multiple of {len(questions)} completions, got {len(answers)}"
                )
        else:
            group_size = 0

        structured_output = [
            {
                "question": question,
                "answers": answers[index * group_size : (index + 1) * group_size],
            }
            for index, question in enumerate(questions)
        ]
        await results_queue.put(structured_output)

async def generate_and_evaluate(llm, questions, answers, sampling_params, async_evaluate, batch_size=16):
    """Generate completions in batches and evaluate them as they arrive.

    A producer coroutine submits the prompts in slices of ``batch_size`` via
    ``run_batch``; a consumer coroutine pulls finished batches off a queue,
    evaluates every entry, prints it, and finally writes all results to
    ``evaluation_results.json`` in the current working directory.

    Args:
        llm: Engine handed to ``run_batch``.
        questions: List of prompt strings.
        answers: Currently unused by this function; kept for interface
            compatibility.
        sampling_params: Sampling options forwarded to ``run_batch``.
        async_evaluate: Coroutine function called as
            ``async_evaluate(question, answers)`` for every queue entry.
        batch_size: Number of prompts per batch. The same value also sizes the
            semaphore, i.e. the maximum number of batches in flight at once.

    Raises:
        BaseException: The first failure from generation or evaluation, raised
            only after both coroutines have finished, so results from the
            batches that did succeed are still evaluated and written.
    """
    semaphore = asyncio.Semaphore(batch_size)  # Bounds the number of concurrent batches
    results_queue = asyncio.Queue()

    async def generate():
        """Producer: run every batch, then signal the consumer that no more will come."""
        batches = [
            run_batch(llm, questions[start : start + batch_size], sampling_params, semaphore, results_queue)
            for start in range(0, len(questions), batch_size)
        ]
        try:
            # Collect failures instead of aborting on the first one so the
            # remaining batches can still finish and be queued.
            outcomes = await asyncio.gather(*batches, return_exceptions=True)
        finally:
            # Always send the end-of-stream sentinel; without it the consumer
            # would wait on the queue forever after a failed batch.
            await results_queue.put(None)
        for outcome in outcomes:
            if isinstance(outcome, BaseException):
                raise outcome

    async def evaluate():
        """Consumer: evaluate queued batches until the sentinel, then persist everything."""
        all_results = []
        while True:
            batch_output = await results_queue.get()
            if batch_output is None:
                break
            for entry in batch_output:
                eval_result = await async_evaluate(entry['question'], entry['answers'])
                structured_result = {
                    "question": entry['question'],
                    "answers": list(entry['answers']),
                    "evaluation": eval_result
                }
                all_results.append(structured_result)
                print(json.dumps(structured_result, indent=4))

        # Write results to JSON file
        with open('evaluation_results.json', 'w') as f:
            json.dump(all_results, f, indent=4)

    # Wait for BOTH sides before surfacing an error, so a generation failure
    # does not abandon the consumer before it has written the partial results.
    outcomes = await asyncio.gather(generate(), evaluate(), return_exceptions=True)
    for outcome in outcomes:
        if isinstance(outcome, BaseException):
            raise outcome

In [33]:
def gen_question(question):
    """Wrap a raw math problem in the few-shot prompt used for generation.

    The prompt holds the instructions, two worked examples (each closed by a
    literal ``<|endoftext|>`` line), the given problem, and a trailing
    ``Answer:`` cue.

    Args:
        question: Problem text inserted verbatim after ``Question:``.

    Returns:
        The complete prompt string; it begins and ends with a newline.
    """
    return f"""
You are a helpful assistant solving math problems. Solve problems step by step using the following format:

1. Put your step-by-step solution inside <think> tags, explaining each step clearly.
2. Verify your final answer whenever possible.
3. Provide the final answer in a \\boxed{{}} tag in a simplified and clear format.

Example 1:
Lucy has 18 apples. She gives 4 apples to her friend. She then doubles the number of apples she has. How many apples does Lucy have left?
<think>
1. Subtract the apples Lucy gave away: 18 - 4 = 14
2. Double the remaining apples: 14 * 2 = 28
</think>
\\boxed{{28}}
<|endoftext|>

Example 2:
What is the value of (3 + 5) * 2?
<think>
1. Add 3 and 5 to get 8.
2. Multiply the result by 2: 8 * 2 = 16
</think>
\\boxed{{16}}
<|endoftext|>

Question:
{question}

Answer:
"""


In [35]:
import json

# Load questions from JSON
def load_questions(json_path):
    """Read a JSON file and return its parsed content.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        Whatever the file's top-level JSON value decodes to.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere.
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)
# Keep only the first two test problems so this demo run stays small.
test_data = load_questions('datasets/gsm8k_test.json')[:2]
questions = [gen_question(sample['question']) for sample in test_data]
answers = [sample['answer'] for sample in test_data]

# Sampling options, passed through to the engine unchanged.
sampling_params = {
    "temperature": 0.7,
    "top_p": 0.95,
    "n": 2,
    "max_new_tokens": 600,
    # The only stop string: the marker that closes each few-shot example.
    "stop": ["<|endoftext|>"],
}
asyncio.run(generate_and_evaluate(llm, questions, answers, sampling_params, async_evaluate, batch_size=16))

{'text': '<think>\n1. Calculate the total number of eggs laid per day: 16 eggs.\n2. Calculate the total number of eggs eaten and used in muffins per day: 3 + 4 = 7 eggs.\n3. Calculate the number of eggs left to sell: 16 - 7 = 9 eggs.\n4. Calculate the amount of money made by selling 9 eggs at $2 each: 9 * 2 = $18.\n</think>\n\\boxed{18} ', 'meta_info': {'id': '45bbe04233ea447484b7953ac1ee9610', 'finish_reason': {'type': 'stop', 'matched': '<|endoftext|>'}, 'prompt_tokens': 294, 'completion_tokens': 109, 'cached_tokens': 293}}
{'text': '<think>\n1. Calculate the total number of eggs laid per day: 16 eggs/day\n2. Calculate the number of eggs eaten and used: 3 eggs (for breakfast) + 4 eggs (for baking) = 7 eggs/day\n3. Calculate the remaining eggs for sale: 16 eggs/day - 7 eggs/day = 9 eggs/day\n4. Calculate the total daily income from selling eggs: 9 eggs/day * $2/egg = $18/day\n</think>\n\\boxed{$18} ', 'meta_info': {'id': '14e086155c02459389f4de3a8919a18d', 'finish_reason': {'type': 's