In [None]:
def factual_answers(prompt, collection, n_results=5, max_new_tokens=128, max_seq_length=2048):
    """Answer *prompt* with the chat model, grounded in documents from *collection*.

    The query is embedded with the module-level ``embedding_model``, the
    closest documents are fetched from ``collection`` and placed in the system
    message, and the module-level ``model`` / ``tokenizer`` generate the answer
    (streamed to stdout while it is produced).

    Args:
        prompt: The user's question.
        collection: Vector-store collection exposing
            ``query(query_embeddings=..., n_results=...)`` and returning a
            mapping whose ``"documents"`` entry holds one list of strings per
            query.
        n_results: Number of documents to retrieve.
        max_new_tokens: Maximum number of tokens to generate.
        max_seq_length: Context window of ``model`` in tokens. The default is
            the limit reported in the Unsloth overflow message seen with this
            notebook; adjust it to the loaded model. Pass ``None`` to disable
            context trimming.

    Returns:
        ``(answer, documents)`` where ``documents`` is the list of retrieved
        texts that were actually placed in the prompt, or ``(None, None)`` when
        retrieval returned nothing.
    """
    query_embedding = embedding_model.encode(prompt)
    doc_results = collection.query(query_embeddings=query_embedding, n_results=n_results)

    if not doc_results or not doc_results.get("documents") or not doc_results["documents"][0]:
        print("No documents found.")
        return None, None

    def build_input_ids(documents):
        """Tokenize the chat prompt built from *documents* and the question."""
        messages = [
            {
                "role": "system",
                "content": "You are a helpful AI assistant specialized in answering questions about harmful side-effects of ingredients in cosmetics. Context: "
                + "\n".join(documents),
            },
            {"role": "user", "content": prompt},
        ]
        return tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        )

    documents = list(doc_results["documents"][0])
    input_ids = build_input_ids(documents)

    if max_seq_length is not None:
        # Leave room for the answer, then drop trailing documents until the
        # prompt fits, instead of letting the backend cut the prompt blindly.
        # NOTE(review): assumes results are ordered most-relevant first, so the
        # last document is the cheapest one to lose - confirm for this store.
        token_budget = max_seq_length - max_new_tokens
        while input_ids.shape[-1] > token_budget and len(documents) > 1:
            documents.pop()
            input_ids = build_input_ids(documents)
        if input_ids.shape[-1] > token_budget:
            print(
                f"Warning: prompt is {input_ids.shape[-1]} tokens, "
                f"over the budget of {token_budget}; it may be truncated."
            )

    # Follow the model's device rather than assuming CUDA is present.
    input_ids = input_ids.to(model.device)
    prompt_length = input_ids.shape[-1]

    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
    print("Generating output:..")

    output = model.generate(
        input_ids,
        streamer=text_streamer,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        use_cache=True,
        # temperature / min_p are sampling parameters; enable sampling
        # explicitly so they are guaranteed to take effect.
        do_sample=True,
        temperature=0.1,
        min_p=0.1,
    )

    # The returned sequence starts with the prompt tokens; decode only the
    # newly generated part instead of splitting the text on a template marker.
    generated_ids = output[0][prompt_length:]
    decoded_output = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    return decoded_output, documents  # return both response and context


  # The answers are pretty accurate.
  # However, I got this error many times:
  #   Unsloth: Input IDs of length 3023 > the model's max sequence length of 2048.
  #   We shall truncate it ourselves. It's imperative if you correct this issue first.
  # This happens because the retrieved contexts are long, so the full prompt
  # exceeds the maximum number of tokens allowed in the model's context window.

In [None]:
def _review_response(query, response, context, review_model, review_tokenizer):
    """Ask the review model to critique *response* and return only its reply text."""
    # Retrieved context arrives as a list of documents; join it so the prompt
    # shows plain text rather than a Python list repr.
    if isinstance(context, (list, tuple)):
        context = "\n".join(str(document) for document in context)

    critique_prompt = f"""You are a quality reviewer.

Original Question: {query}

Retrieved Context:
{context}

Generated Response:
{response}

Evaluate the answer based on:
- Whether it correctly and completely answers the query
- Whether it is factually grounded in the provided context
- Clarity, specificity, and accuracy

Respond with:
Decision: APPROVE or REJECT
Reason: (Explain your reasoning briefly)
"""

    # NOTE(review): truncation=True cuts from the tokenizer's truncation side;
    # with a very long context the instructions at the end of the prompt may be
    # the part that is lost - confirm against the review tokenizer's limits.
    inputs = review_tokenizer(critique_prompt, return_tensors="pt", truncation=True).to(
        review_model.device
    )
    output = review_model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.1,
    )

    sequence = output[0]
    config = getattr(review_model, "config", None)
    if not getattr(config, "is_encoder_decoder", False):
        # Decoder-only models echo the prompt before the completion. The prompt
        # itself contains "Decision: APPROVE or REJECT", so it must be removed
        # before the verdict is parsed.
        sequence = sequence[inputs["input_ids"].shape[-1]:]
    return review_tokenizer.decode(sequence, skip_special_tokens=True).strip()


def _is_approved(review_output):
    """Return True only if the review contains an unambiguous APPROVE decision."""
    import re

    for line in review_output.splitlines():
        match = re.search(r"\bdecision\b\W*(.*)", line, flags=re.IGNORECASE)
        if match is None:
            continue
        verdict = match.group(1).upper()
        approves = "APPROVE" in verdict
        rejects = "REJECT" in verdict
        # A line naming both (e.g. an echoed "APPROVE or REJECT") or neither
        # carries no verdict; keep looking for a decisive one.
        if approves != rejects:
            return approves
    return False


def generate_review_loop(prompt, collection, review_model, review_tokenizer, max_attempts=3):
    """Generate an answer and regenerate it until a reviewer model approves it.

    Each attempt calls the module-level ``factual_answers(prompt, collection)``
    to produce a response with its retrieved context, then asks
    ``review_model`` for an APPROVE / REJECT decision. Progress is printed to
    stdout.

    Args:
        prompt: The user's question.
        collection: Vector-store collection passed through to
            ``factual_answers``.
        review_model: Model used to critique the response; must provide
            ``generate`` and ``device``.
        review_tokenizer: Tokenizer matching ``review_model``.
        max_attempts: Maximum number of generate-and-review rounds.

    Returns:
        The approved response; otherwise the most recent non-empty response
        once the attempts are used up; ``None`` if no response was produced.
    """
    print(f"\nUser Query: {prompt}")
    print("=" * 60)

    # Tracked separately so the fallback is defined even when the loop never
    # runs and is not overwritten by a later empty attempt.
    last_response = None

    for attempt in range(1, max_attempts + 1):
        print(f"\nAttempt {attempt}")
        print("-" * 40)

        response, context = factual_answers(prompt, collection)
        if context is None:
            # No documents were retrieved; repeating the same query is not
            # expected to change that, so stop instead of retrying.
            print("✗ No response generated.")
            return last_response
        if not response:
            print("✗ No response generated.")
            continue

        last_response = response
        print(f"Generated Response:\n{response}\n")

        review_output = _review_response(prompt, response, context, review_model, review_tokenizer)
        print(f"Reviewer Output:\n{review_output}\n")

        if _is_approved(review_output):
            print("✓ APPROVED by Reviewer")
            return response

        # The next attempt regenerates from the same prompt; the reviewer's
        # feedback is not fed back into generation.
        print("✗ REJECTED - Improving...")

    print("Max attempts reached. Returning last response.")
    return last_response