In [1]:
from dotenv import load_dotenv
from pathlib import Path
from datetime import datetime, timezone
import json
import os
import time

from litellm import completion
from litellm.exceptions import RateLimitError, APIConnectionError, ServiceUnavailableError

PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
load_dotenv(PROJECT_ROOT / ".env")

MODEL_NAME = "gemini/gemini-2.5-flash-lite"
KEY_NAMES = [f"GEMINI_API_KEY{i}" for i in range(1, 6)]
API_KEYS = [os.getenv(k) for k in KEY_NAMES if os.getenv(k)]

if not API_KEYS:
    raise ValueError("No GEMINI_API_KEY1..5 keys found in .env")

CHUNKS_PATH = PROJECT_ROOT / "chunks.json"
OUT_DIR = PROJECT_ROOT / "data" / "dataset" / "_processed"
OUT_DIR.mkdir(parents=True, exist_ok=True)

CHECKPOINT_JSONL = OUT_DIR / "qa_pairs_checkpoint.jsonl"
PROGRESS_JSON = OUT_DIR / "qa_pairs_progress.json"
ERRORS_JSONL = OUT_DIR / "qa_pairs_errors.jsonl"

MAX_PAIRS = 50        # set to None to run full chunks file
MIN_TEXT_CHARS = 120  # skip tiny chunks
MAX_KNOWLEDGE_CHARS = 3500
MIN_SECONDS_BETWEEN_CALLS_PER_KEY = 3.5
COOLDOWN_ON_RATE_LIMIT_SEC = 35
MAX_ATTEMPTS_PER_CALL = 20

QUESTION_PROMPT = (
    "Generate one conversation initiating statement in English/Hinglish based on this knowledge. "
    "Use varied starts like why/when/where/how, and keep it natural and specific.\n\n"
    "KNOWLEDGE:\n{knowledge}"
)

ANSWER_PROMPT = (
    "Using the knowledge, answer in 2 concise informative lines in English. "
    "Then ask one thoughtful follow-up question in English/Hinglish. "
    "Do not generate extra turns.\n\n"
    "KNOWLEDGE:\n{knowledge}\n\n"
    "USER QUESTION:\n{question}"
)

print("MODEL:", MODEL_NAME)
print("KEYS FOUND:", len(API_KEYS))
print("CHUNKS PATH:", CHUNKS_PATH)
print("CHECKPOINT:", CHECKPOINT_JSONL)



MODEL: gemini/gemini-2.5-flash-lite
KEYS FOUND: 5
CHUNKS PATH: c:\Users\Pranav\Desktop\proj\github_projects\knowsLM_implementation\chunks.json
CHECKPOINT: c:\Users\Pranav\Desktop\proj\github_projects\knowsLM_implementation\data\dataset\_processed\qa_pairs_checkpoint.jsonl


In [2]:
class KeyRotator:
    def __init__(self, keys: list[str], min_interval_sec: float = 3.5):
        now = time.time()
        self.state = [
            {
                "key": k,
                "next_ready": now,
                "fail_count": 0,
            }
            for k in keys
        ]
        self.idx = 0
        self.min_interval_sec = min_interval_sec

    def acquire(self) -> dict:
        while True:
            now = time.time()
            candidates = [s for s in self.state if s["next_ready"] <= now]
            if candidates:
                pick = self.state[self.idx % len(self.state)]
                self.idx += 1
                if pick["next_ready"] <= now:
                    return pick
            next_ready = min(s["next_ready"] for s in self.state)
            sleep_for = max(0.05, next_ready - now)
            time.sleep(sleep_for)

    def mark_success(self, slot: dict):
        slot["fail_count"] = 0
        slot["next_ready"] = time.time() + self.min_interval_sec

    def mark_rate_limited(self, slot: dict, cooldown_sec: int = 35):
        slot["fail_count"] += 1
        slot["next_ready"] = time.time() + cooldown_sec

    def mark_transient_error(self, slot: dict):
        slot["fail_count"] += 1
        # Short retry penalty for network/transient issues
        slot["next_ready"] = time.time() + 4


def load_chunks(chunks_path: Path) -> list[dict]:
    with chunks_path.open(encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError("chunks.json must be a list of chunk objects")
    return data


def load_done_ids(path: Path) -> set[int]:
    done = set()
    if not path.exists():
        return done
    with path.open(encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                row = json.loads(line)
                done.add(int(row["chunk_index"]))
            except Exception:
                continue
    return done


def append_jsonl(path: Path, row: dict):
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")


def write_progress(path: Path, payload: dict):
    path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")



In [3]:
def call_model_with_fallback(prompt: str, rotator: KeyRotator, temperature: float = 0.5) -> tuple[str, str]:
    last_error = None

    for _ in range(MAX_ATTEMPTS_PER_CALL):
        slot = rotator.acquire()
        key = slot["key"]

        try:
            resp = completion(
                model=MODEL_NAME,
                api_key=key,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                timeout=90,
            )
            text = resp.choices[0].message.content.strip()
            rotator.mark_success(slot)
            return text, key

        except RateLimitError as e:
            last_error = e
            rotator.mark_rate_limited(slot, cooldown_sec=COOLDOWN_ON_RATE_LIMIT_SEC)
            continue

        except (APIConnectionError, ServiceUnavailableError) as e:
            last_error = e
            rotator.mark_transient_error(slot)
            continue

        except Exception as e:
            last_error = e
            rotator.mark_transient_error(slot)
            continue

    raise RuntimeError(f"All keys failed after retries. Last error: {last_error}")



In [4]:
chunks = load_chunks(CHUNKS_PATH)
done_ids = load_done_ids(CHECKPOINT_JSONL)
rotator = KeyRotator(API_KEYS, min_interval_sec=MIN_SECONDS_BETWEEN_CALLS_PER_KEY)

print(f"Loaded chunks: {len(chunks)}")
print(f"Already completed: {len(done_ids)}")

created = 0
started_at = datetime.now(timezone.utc).isoformat()

for idx, row in enumerate(chunks):
    if idx in done_ids:
        continue

    if MAX_PAIRS is not None and created >= MAX_PAIRS:
        break

    knowledge = (row.get("content") or "").strip()
    if len(knowledge) < MIN_TEXT_CHARS:
        continue

    knowledge = knowledge[:MAX_KNOWLEDGE_CHARS]

    try:
        q_prompt = QUESTION_PROMPT.format(knowledge=knowledge)
        question, q_key = call_model_with_fallback(q_prompt, rotator, temperature=0.7)

        a_prompt = ANSWER_PROMPT.format(knowledge=knowledge, question=question)
        answer, a_key = call_model_with_fallback(a_prompt, rotator, temperature=0.4)

        out = {
            "chunk_index": idx,
            "source_metadata": row.get("metadata", {}),
            "question": question,
            "answer": answer,
            "question_key": q_key[-6:],
            "answer_key": a_key[-6:],
            "created_at": datetime.now(timezone.utc).isoformat(),
        }
        append_jsonl(CHECKPOINT_JSONL, out)
        created += 1

        if created % 5 == 0:
            write_progress(
                PROGRESS_JSON,
                {
                    "started_at": started_at,
                    "last_update": datetime.now(timezone.utc).isoformat(),
                    "created_this_run": created,
                    "total_done": len(load_done_ids(CHECKPOINT_JSONL)),
                    "checkpoint": str(CHECKPOINT_JSONL),
                },
            )
            print(f"Created {created} pairs this run")

    except Exception as e:
        append_jsonl(
            ERRORS_JSONL,
            {
                "chunk_index": idx,
                "error": str(e),
                "at": datetime.now(timezone.utc).isoformat(),
            },
        )
        continue

final_done = len(load_done_ids(CHECKPOINT_JSONL))
write_progress(
    PROGRESS_JSON,
    {
        "started_at": started_at,
        "last_update": datetime.now(timezone.utc).isoformat(),
        "created_this_run": created,
        "total_done": final_done,
        "checkpoint": str(CHECKPOINT_JSONL),
        "errors_file": str(ERRORS_JSONL),
    },
)

print("Run complete")
print("Created this run:", created)
print("Total checkpointed:", final_done)
print("Checkpoint file:", CHECKPOINT_JSONL)



Loaded chunks: 203
Already completed: 35

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.

Created 5 pairs this run

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to d

KeyboardInterrupt: 

In [None]:
from huggingface_hub import login, upload_folder


login()


upload_folder(folder_path=".", repo_id="pranavpande01/delhi-food", repo_type="dataset")


In [3]:
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # Create an instance of AsyncWebCrawler
    async with AsyncWebCrawler() as crawler:
        # Run the crawler on a URL
        result = await crawler.arun(url="https://crawl4ai.com")

        # Print the extracted content
        print(result.markdown)

# Run the async main function
asyncio.run(main())




RuntimeError: asyncio.run() cannot be called from a running event loop