**Cell 1 Description:** Initializes the OpenAI client against the local endpoint and asks for a short story about a king and queen to confirm the chat completions path is wired correctly.


In [None]:
from openai import OpenAI

# Smoke test: point the SDK at the server on localhost and request a single completion.
client = OpenAI(base_url="http://localhost:8000/v1")

completion = client.chat.completions.create(
    messages=[
        dict(role="user", content="Tell me a short story about king and queen"),
    ],
    model="/models/Meta-Llama-3-8B-Instruct",
)

# Print the whole message object of the first choice, not just its text.
first_choice = completion.choices[0]
print(first_choice.message)

**Cell 2 Description:** Creates the OpenAI client again and requests only the name of India's capital city, then prints the returned message as formatted JSON to confirm short-form responses come back as expected.


In [None]:
# `json` is imported here so the cell runs on its own; the original relied on
# an import that only happens in a later cell.
import json

from openai import OpenAI

# Client for the OpenAI-compatible server on localhost.
client = OpenAI(
    base_url="http://localhost:8000/v1"
)

completion = client.chat.completions.create(
    model="qwen3vl",
    messages=[
        {"role": "user", "content": "Answer with just city name. What is the capital of India?"}
    ]
)

# Dump the complete message as indented JSON; ensure_ascii=False keeps any
# non-ASCII characters readable instead of \u-escaped.
message = completion.choices[0].message
message_json = json.dumps(message.model_dump(), indent=2, ensure_ascii=False)
print(message_json)

**Cell 3 Description:** Loads a ShareGPT dataset, samples 25 conversations, and sends the first user prompt from each sample to `my-mistral-model` for quick qualitative evaluation.


In [None]:
import json
import random
from openai import OpenAI

# Load ShareGPT data
# Decode explicitly as UTF-8 rather than the platform's default encoding, so
# the load does not depend on the machine's locale settings.
with open("//home/skamalj/dev/vllm/ShareGPT_V3_unfiltered_cleaned_split.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Collect the text of every human-authored turn in one conversation record
def extract_user_prompts(convo):
    """Return the ``value`` of each entry in ``convo["conversations"]`` whose ``from`` is ``"human"``.

    The original order of the turns is preserved.
    """
    human_turns = []
    for turn in convo["conversations"]:
        if turn["from"] == "human":
            human_turns.append(turn["value"])
    return human_turns

# Sample up to 25 random conversations. The size is clamped because
# random.sample raises ValueError when asked for more items than exist.
random_samples = random.sample(data, min(25, len(data)))

# Prepare OpenAI client (local server)
client = OpenAI(base_url="http://localhost:8000/v1")

# Iterate over samples
for idx, sample in enumerate(random_samples):
    prompts = extract_user_prompts(sample)
    if not prompts:
        # Conversation has no human turn; nothing to send.
        continue

    # Take only the **first** user prompt per sample
    first_prompt = prompts[0].strip()
    print(f"Processing sample #{idx + 1} with prompt: {first_prompt}")

    try:
        completion = client.chat.completions.create(
            model="my-mistral-model",
            messages=[{"role": "user", "content": first_prompt}]
        )
        # The SDK types `content` as optional; treat a missing body as empty
        # text instead of failing on None.strip().
        reply = (completion.choices[0].message.content or "").strip()
    except Exception as e:
        # Best-effort evaluation: report the failure and move on to the next sample.
        print(f"Error on sample #{idx + 1}: {e}")
    else:
        print(f"Sample #{idx + 1}")
        print("Prompt:", first_prompt)
        print("Response:", reply)
        print("-" * 50)


**Cell 4 Description:** Provides a reminder about where to download the ShareGPT dataset so the preceding sampling scripts have data available locally.


> Download the data file from [here](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered)

**Cell 5 Description:** Placeholder cell intended for future experiments, currently left empty for ad-hoc commands.


**Cell 6 Description:** Loads the ShareGPT dataset from a second local path and fans out multiple prompts in parallel to a local `llama` deployment to observe batching behavior via the OpenAI-compatible API.


In [None]:
import json
import random
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load ShareGPT data
# Decode explicitly as UTF-8 rather than the platform's default encoding, so
# the load does not depend on the machine's locale settings.
with open("/home/skamalj/dev/hfdata/ShareGPT_V3_unfiltered_cleaned_split.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Pull out only the human-authored message texts from one conversation record
def extract_user_prompts(convo):
    """Return, in order, the ``value`` of every ``"human"`` entry in ``convo["conversations"]``."""
    from_human = filter(lambda turn: turn["from"] == "human", convo["conversations"])
    return [turn["value"] for turn in from_human]

# Sample up to 10 random conversations. The size is clamped because
# random.sample raises ValueError when asked for more items than exist.
random_samples = random.sample(data, min(10, len(data)))
client = OpenAI(base_url="http://localhost:8000/v1")

# Prepare request payloads: the first human prompt of each sampled
# conversation, skipping conversations that have no human turn.
requests = []
for sample in random_samples:
    prompts = extract_user_prompts(sample)
    if prompts:
        requests.append(prompts[0].strip())

# Function to run a single request
def run_request(idx, prompt):
    """Send one prompt to the ``llama`` model and return ``(idx, prompt, text)``.

    Uses the module-level ``client``. This function does not raise for
    ordinary failures: any ``Exception`` is turned into an ``"ERROR: ..."``
    string in the third slot, so callers can handle failures as regular results.

    Args:
        idx: Position of the prompt; returned unchanged so results can be
            matched to their request.
        prompt: User text sent as the single chat message.

    Returns:
        A tuple ``(idx, prompt, response_text_or_error_string)``.
    """
    try:
        completion = client.chat.completions.create(
            model="llama",
            messages=[{"role": "user", "content": prompt}]
        )
        # The SDK types `content` as optional; treat a missing body as empty
        # text instead of reporting a confusing 'NoneType' strip error.
        text = completion.choices[0].message.content or ""
        return idx, prompt, text.strip()
    except Exception as e:
        return idx, prompt, f"ERROR: {e}"

# Submit every prompt to a five-worker thread pool so several requests are in
# flight at once (intended to trigger batching in vLLM)
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = []
    for position, text in enumerate(requests):
        futures.append(executor.submit(run_request, position, text))

    # Report each result as soon as its request finishes, which is not
    # necessarily the order of submission.
    for finished in as_completed(futures):
        idx, prompt, response = finished.result()
        print(f"Sample #{idx + 1}")
        print("Prompt:", prompt)
        print("Response:", response)
        print("-" * 50)


In [None]:
import json
from pathlib import Path

import yaml  # pip install pyyaml if missing

# Parse the YAML file and echo the result as indented JSON for inspection.
# `content` stays in the notebook namespace; later cells read content["messages"].
yaml_path = Path("litecone.yaml")

with open(yaml_path, mode="r", encoding="utf-8") as handle:
    content = yaml.safe_load(handle)

rendered = json.dumps(content, indent=2)
print(rendered)



In [None]:
# `json` and `re` are imported here so the cell does not depend on imports
# made by other cells (`re` was not imported anywhere before this point).
import json
import re

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1"
)

# `content` is the parsed YAML from the earlier cell; its "messages" entry is
# sent to the model unchanged.
completion = client.chat.completions.create(
    model="qwen3vl",
    messages=content["messages"]
)

message = completion.choices[0].message
message_dict = message.model_dump()          # or json.loads(message_json)
# Extract the body of the first ```json fenced block. A missing (None) content
# is treated as empty text so it raises the ValueError below, not a TypeError.
match = re.search(r"```json\s*(.*?)\s*```", message_dict["content"] or "", re.DOTALL)
if not match:
    raise ValueError("No JSON block found in content")

payload = match.group(1)
parsed = json.loads(payload)
print(json.dumps(parsed, indent=2, ensure_ascii=False))

In [None]:
# Imported locally so this cell does not depend on imports made by other cells.
import json
import re

# Re-parse `message` (left in the notebook namespace by the previous cell)
# without issuing a new request.
message_dict = message.model_dump()          # or json.loads(message_json)
# Extract the body of the first ```json fenced block. A missing (None) content
# is treated as empty text so it raises the ValueError below, not a TypeError.
match = re.search(r"```json\s*(.*?)\s*```", message_dict["content"] or "", re.DOTALL)
if not match:
    raise ValueError("No JSON block found in content")

payload = match.group(1)
parsed = json.loads(payload)
print(json.dumps(parsed, indent=2, ensure_ascii=False))

In [24]:
import asyncio
import nest_asyncio
import json
import re
import os
from openai import OpenAI

# CONFIG
PER_CALL_TIMEOUT = 240   # seconds

# Patch asyncio via nest_asyncio before any coroutine in this cell is awaited.
nest_asyncio.apply()

# Parsed results are written to ./output, so make sure the directory exists.
os.makedirs("output", exist_ok=True)

client = OpenAI(base_url="http://localhost:8000/v1")


async def run_single_call(i, messages, timeout=PER_CALL_TIMEOUT):
    """Request one completion, extract its fenced JSON block, and save it to disk.

    The blocking SDK call runs in a worker thread via ``asyncio.to_thread``.

    Args:
        i: Task index; used in log lines and in the output file name
            ``output/output_{i}.json``.
        messages: Chat messages passed unchanged to the chat completions call.
        timeout: Limit in seconds, applied both to the HTTP request and to the
            asyncio wait around it.

    Returns:
        The parsed JSON payload, or ``None`` on timeout, when no JSON block
        is found, or on any other error (the error is printed, not raised).
    """

    def sync_call():
        # The SDK accepts a per-request `timeout`. Passing it matters because
        # asyncio.wait_for only stops *waiting*: it cannot interrupt the worker
        # thread, which would otherwise keep the request (and a pool slot)
        # alive until the SDK's own default timeout.
        # NOTE(review): the SDK may retry after a timeout, so the thread can
        # outlive `timeout`; set max_retries on the client for a hard bound.
        completion = client.chat.completions.create(
            model="qwen3vl",
            messages=messages,
            timeout=timeout,
        )
        return completion.choices[0].message.model_dump()["content"]

    try:
        # Outer guard: stop waiting after `timeout` seconds regardless of what
        # the worker thread is doing.
        reply_text = await asyncio.wait_for(
            asyncio.to_thread(sync_call),
            timeout=timeout
        )

        # Try extracting the first ```json fenced block
        match = re.search(r"```json\s*(.*?)\s*```", reply_text, re.DOTALL)
        if not match:
            print(f"✗ No JSON block found for {i}")
            return None

        parsed = json.loads(match.group(1))

        # Save to file
        with open(f"output/output_{i}.json", "w", encoding="utf-8") as f:
            json.dump(parsed, f, indent=2, ensure_ascii=False)

        print(f"✓ Completed {i}")
        return parsed

    except asyncio.TimeoutError:
        print(f"⏳ Timeout ({timeout}s) on task {i}")
        return None

    except Exception as e:
        # Covers SDK errors, a None reply (TypeError in re.search), invalid
        # JSON, and file errors; one failed task must not abort the batch.
        print(f"✗ Error in task {i}: {e}")
        return None



async def main(num_calls=50):
    """Run ``num_calls`` identical completions concurrently and collect the results.

    Every task sends the same ``content["messages"]`` (read from the notebook
    namespace) through ``run_single_call`` with task indices ``0..num_calls-1``.

    Args:
        num_calls: Number of concurrent calls to launch. Defaults to 50, the
            value that was previously hard-coded.

    Returns:
        A list with one entry per task, in task-index order; each entry is
        whatever ``run_single_call`` returned for that task.
    """
    tasks = [
        run_single_call(i, content["messages"])
        for i in range(num_calls)
    ]
    # gather preserves argument order, so results[i] belongs to task i even
    # though the tasks finish in arbitrary order.
    results = await asyncio.gather(*tasks)
    print("All tasks finished.")
    return results


# Top-level `await`: valid in a notebook/IPython cell; a plain Python script
# would need asyncio.run(main()) instead.
await main()


✓ Completed 6
✓ Completed 15
✓ Completed 3
✓ Completed 9
✓ Completed 17
✓ Completed 5
✓ Completed 4
✓ Completed 10
✓ Completed 16
✓ Completed 7
✓ Completed 8
✓ Completed 1
✓ Completed 0
✓ Completed 11
✓ Completed 12
✓ Completed 2
✓ Completed 13
✓ Completed 20
✓ Completed 23
✓ Completed 25
✓ Completed 21
✓ Completed 18
✓ Completed 27
✓ Completed 22
✓ Completed 19
✓ Completed 26
✓ Completed 30
✓ Completed 28
✓ Completed 32
✓ Completed 31
✓ Completed 29
✓ Completed 24
✓ Completed 34
✓ Completed 36
✓ Completed 33
✓ Completed 40
✓ Completed 38
✓ Completed 35
✓ Completed 37
✓ Completed 43
✓ Completed 42
✓ Completed 41
✓ Completed 44
✓ Completed 47
✓ Completed 39
✓ Completed 48
✓ Completed 46
✓ Completed 45
✓ Completed 49
⏳ Timeout (240s) on task 14
All tasks finished.


[{'GeneralEvidenceRequirement': {'required_evidences': [{'document_name': 'Identity Proof',
     'document_category': 'Core',
     'required_from': 'Life Assured',
     'reason_required': 'Identity proof is mandatory as per AML/KYC guidelines. The applicant has declared Aadhaar as their identity proof, which is a valid document type. Aadhaar is accepted as a standard identity proof under Section 6.2 of the guidelines.',
     'is_pending': True,
     'valid_doc_classes': ['aadhar',
      'pan',
      'voter_id',
      'driving_licence',
      'passport',
      'government_id_card',
      'defense_id_card',
      'passbook',
      'birth_certificate',
      'recent_photograph'],
     'doc_path': None},
    {'document_name': 'Age Proof',
     'document_category': 'Core',
     'required_from': 'Life Assured',
     'reason_required': 'Age proof is mandatory as per Section 6.2 of the guidelines. The applicant has declared Aadhaar as their age proof, which is a valid document type. Aadhaar is