# Running Small Experiments Using Colab

In [1]:
# Fetch the project repository and switch into it, so the following cells can
# refer to its files (requirements.txt, datasets/, utils/) by relative path.
!git clone https://github.com/paulkroe/minireason.git
%cd minireason
!ls

Cloning into 'minireason'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 25 (delta 6), reused 20 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 39.72 KiB | 3.31 MiB/s, done.
Resolving deltas: 100% (6/6), done.
/content/minireason
colab.ipynb  datasets  README.md  requirements.txt  sampling  utils


In [2]:
# Install the uv package manager; every later install goes through `uv pip`
# with `--system`, i.e. into the system interpreter rather than a virtualenv.
!pip install uv
# Reinstall sgl-kernel by itself, leaving its dependencies untouched (--no-deps).
!uv pip install sgl-kernel --force-reinstall --no-deps --system
# Requirements listed by the cloned repository.
!uv pip install -r requirements.txt --system
# sglang with all extras; --find-links adds the flashinfer wheel index
# (the URL path names the cu124 / torch2.4 builds).
!uv pip install "sglang[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer --system

Collecting uv
  Downloading uv-0.6.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.6.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.6.0
[2mUsing Python 3.11.11 environment at: /usr[0m
[2K[2mResolved [1m1 package[0m [2min 50ms[0m[0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[

In [3]:
# Download GSM8K with the repository's helper script. Per the output below it
# saves a train split (7473 samples) and a test split (1319 samples).
!python3 datasets/download_gsm8k.py

README.md: 100% 7.94k/7.94k [00:00<00:00, 38.3MB/s]
train-00000-of-00001.parquet: 100% 2.31M/2.31M [00:00<00:00, 28.8MB/s]
test-00000-of-00001.parquet: 100% 419k/419k [00:00<00:00, 81.3MB/s]
Generating train split: 100% 7473/7473 [00:00<00:00, 166678.37 examples/s]
Generating test split: 100% 1319/1319 [00:00<00:00, 259939.25 examples/s]
Saved train split with 7473 samples.
Saved test split with 1319 samples.


In [4]:
import sglang as sgl
from google.colab import userdata
from huggingface_hub import login
# Log in to the Hugging Face Hub with the token stored under the Colab secret
# named HF_TOKEN, so the token itself never appears in the notebook.
login(userdata.get('HF_TOKEN'))

In [5]:
# Create the SGLang engine that later cells use for generation; the model is
# referenced by its Hugging Face repo id.
llm = sgl.Engine(model_path="NousResearch/Meta-Llama-3.1-8B-Instruct")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

INFO 02-16 18:35:54 __init__.py:190] Automatically detected platform cuda.


2025-02-16 18:35:57,685 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [9]:
import asyncio
import sglang as sgl
import json
from utils.gsm8k_answer_checker import gsm8k_answer_checker as answer_checker
from tqdm.notebook import tqdm

async def async_evaluate(questions, answers):
    """Run the answer checker and label its results by 1-based position.

    Args:
        questions: Questions handed to ``answer_checker.check_answers``.
        answers: Candidate answers handed to ``answer_checker.check_answers``.

    Returns:
        dict: Maps ``"answer_1"``, ``"answer_2"``, ... to the items returned
        by the checker, in the order the checker returned them.
    """
    checked = await answer_checker.check_answers(questions, answers)
    # Short fixed pause carried over unchanged (its purpose is not evident
    # from this notebook).
    await asyncio.sleep(0.1)
    labelled = {}
    for position, verdict in enumerate(checked, start=1):
        labelled[f"answer_{position}"] = verdict
    return labelled

async def run_batch(llm, questions, ground_truths, sampling_params, semaphore, results_queue):
    """Generate answers for one batch of prompts and enqueue them grouped per question.

    Args:
        llm: Object exposing ``async_generate(prompts, sampling_params)``.
        questions: Prompts of this batch.
        ground_truths: Reference answers, aligned index-by-index with ``questions``.
        sampling_params: Dict of sampling options. ``"n"`` is the number of samples
            per prompt; it defaults to 1 when the key is absent.
        semaphore: Async context manager limiting how many batches run at once.
        results_queue: Queue receiving one list per batch. Each element is a dict
            with the keys ``"question"``, ``"ground_truth"`` and ``"answers"``.
    """
    async with semaphore:
        # Fall back to a single sample per prompt instead of raising KeyError
        # when the caller did not set "n".
        n = sampling_params.get('n', 1)
        answers = await llm.async_generate(questions, sampling_params)
        # NOTE(review): assumes `answers` is a flat sequence in which the n samples
        # of prompt i occupy positions [i*n, (i+1)*n) -- confirm against the
        # installed sglang version.
        structured_output = [
            {
                "question": q,
                "ground_truth": ground_truths[i],
                "answers": answers[i * n:(i + 1) * n],
            }
            for i, q in enumerate(questions)
        ]
        await results_queue.put(structured_output)

async def generate_and_evaluate(llm, questions, ground_truths, sampling_params, async_evaluate,
                                batch_size=16, output_path='evaluation_results.json'):
    """Generate answers in batches, score them as they arrive, and dump the results to JSON.

    Generation and evaluation overlap: every finished batch is pushed onto a queue
    by ``run_batch`` and a consumer passes it to ``answer_checker.eval`` while later
    batches are still being generated. ``None`` on the queue marks the end of the stream.

    Args:
        llm: Engine forwarded to ``run_batch``.
        questions: Prompts, one per problem.
        ground_truths: Reference answers, aligned index-by-index with ``questions``.
        sampling_params: Sampling options forwarded to ``run_batch``.
        async_evaluate: Currently unused -- scoring always goes through
            ``answer_checker.eval``. Kept so existing callers keep working.
        batch_size: Number of questions per batch. The same value also sizes the
            semaphore, i.e. it caps how many batches may be in flight at once.
        output_path: File the scored results are written to as JSON.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    If generation fails, the batches that were already queued are still scored and
    written to ``output_path`` before the error is re-raised. If scoring fails,
    outstanding generation is cancelled and nothing is written.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    semaphore = asyncio.Semaphore(batch_size)  # Caps the number of batches in flight
    results_queue = asyncio.Queue()
    batch_starts = range(0, len(questions), batch_size)

    async def generate():
        with tqdm(total=len(batch_starts)) as progress:

            async def tracked_batch(start):
                stop = start + batch_size
                await run_batch(llm, questions[start:stop], ground_truths[start:stop],
                                sampling_params, semaphore, results_queue)
                # Tick only when the batch has really finished, so the bar shows
                # completed work rather than how many coroutines were created.
                progress.update(1)

            tasks = [asyncio.create_task(tracked_batch(start)) for start in batch_starts]
            try:
                await asyncio.gather(*tasks)
            finally:
                # On failure or cancellation stop the remaining batches
                # (cancel() is a no-op for tasks that already finished) ...
                for task in tasks:
                    task.cancel()
                # ... and always signal end of generation so evaluate() cannot
                # wait on the queue forever.
                await results_queue.put(None)

    async def evaluate():
        all_results = []
        while True:
            batch_output = await results_queue.get()
            if batch_output is None:
                break
            all_results.extend(await answer_checker.eval(batch_output))

        # Write results to JSON file
        with open(output_path, 'w') as f:
            json.dump(all_results, f, indent=4)

    producer = asyncio.create_task(generate())
    consumer = asyncio.create_task(evaluate())
    try:
        await asyncio.gather(producer, consumer)
    except BaseException:
        # Whichever side failed, stop generating; then wait for both sides to
        # settle so no task is left running behind the caller's back. When the
        # producer was the one that failed, the consumer still drains the queue
        # up to the sentinel and saves the partial results.
        producer.cancel()
        await asyncio.gather(producer, consumer, return_exceptions=True)
        raise

In [7]:
def gen_question(question):
    """Wrap a math problem in the few-shot prompt used for generation.

    The prompt holds the formatting instructions (reasoning inside <think>
    tags, final answer in a boxed tag), two worked examples each terminated
    by the <|endoftext|> marker, and finally the given question followed by
    an empty "Answer:" slot for the model to complete.

    Args:
        question: Problem statement, inserted verbatim.

    Returns:
        str: The complete prompt text.
    """
    return f"""
You are a helpful assistant solving math problems. Solve problems step by step using the following format:

1. Put your step-by-step solution inside <think> tags, explaining each step clearly.
2. Verify your final answer whenever possible.
3. Provide the final answer in a \\boxed{{}} tag in a simplified and clear format.

Example 1:
Lucy has 18 apples. She gives 4 apples to her friend. She then doubles the number of apples she has. How many apples does Lucy have left?
<think>
1. Subtract the apples Lucy gave away: 18 - 4 = 14
2. Double the remaining apples: 14 * 2 = 28
</think>
\\boxed{{28}}
<|endoftext|>

Example 2:
What is the value of (3 + 5) * 2?
<think>
1. Add 3 and 5 to get 8.
2. Multiply the result by 2: 8 * 2 = 16
</think>
\\boxed{{16}}
<|endoftext|>

Question:
{question}

Answer:
"""


In [10]:
import json

def load_questions(json_path):
    """Load questions from a JSON file.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the locale's default encoding.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The parsed JSON document.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)
# NOTE: despite the name `test_data`, this reads the GSM8K *train* split file.
test_data = load_questions('datasets/gsm8k_train.json')

# Wrap every problem in the few-shot prompt and keep its reference answer alongside.
questions = [gen_question(record['question']) for record in test_data]
ground_truths = [record['answer'] for record in test_data]

# Sampling configuration used for every batch: 4 samples per prompt, generation
# stops at the <|endoftext|> marker that also terminates the few-shot examples.
sampling_params = dict(
    temperature=0.7,
    top_p=0.95,
    n=4,
    max_new_tokens=600,
    stop=["<|endoftext|>"],
)

# Generate and score all prompts in batches of 16.
asyncio.run(
    generate_and_evaluate(llm, questions, ground_truths, sampling_params, async_evaluate, batch_size=16)
)

  0%|          | 0/468 [00:00<?, ?it/s]