<a href="https://colab.research.google.com/github/ritwikraha/nanoRL/blob/experiments/notebooks/verbal_reasoning_synthetic_pair_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-generativeai

In [36]:
import io
import os
import csv
import json
import time
import random
import pandas as pd
import google.generativeai as genai
from google.colab import userdata
from google.colab import files

In [37]:
GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')

In [38]:
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel('gemini-2.5-flash')

In [39]:
def generate_preference_data(num_samples=2000):
    """
    Generates a dataset of verbal reasoning prompts, then creates "chosen" (correct)
    and "rejected" (incorrect) responses for each, with both answers and reasoning.

    Args:
        num_samples (int): The total number of samples to generate.

    Returns:
        str: A CSV formatted string containing the generated data.
             Returns an empty string if no data is generated.
    """
    print(f"INFO: Starting data generation for {num_samples} samples...")

    # Topics designed to create verbal reasoning questions
    topics = [
        "a logical analogy problem (e.g., 'Leaf is to Tree as Page is to X')",
        "a short scenario requiring a logical deduction to find the outcome",
        "identifying the unstated, underlying assumption in a given argument",
        "determining the most logical cause for a specifically described effect",
        "finding the word that doesn't belong in a group based on a shared category",
        "evaluating a simple argument to pinpoint its primary logical flaw",
        "drawing a single, valid conclusion from a short passage of text",
        "completing a logical sequence of related words or concepts",
        "identifying the specific relationship between two words (like cause/effect)",
        "solving a simple syllogism to determine if the conclusion is valid"
    ]

    generated_data = []
    while len(generated_data) < num_samples:
        try:
            topic = pd.Series(topics).sample(1).iloc[0]

            # refined prompt to get ONLY the question text from the model
            prompt_instruction = f"Generate a single, clear question that is an example of the following verbal reasoning task: {topic}. Output nothing but the question itself, without any labels, formatting, or introductory text."

            question_response = model.generate_content(prompt_instruction)
            prompt = question_response.text.strip()

            # prompt to generate the chosen/rejected pair with 'answer' and 'reasoning'
            generation_prompt = f"""
            For the verbal reasoning question: "{prompt}"

            Please generate two distinct answers, each with an 'answer' and 'reasoning' field:
            1.  A logically sound and well-explained correct answer.
            2.  A plausible-sounding but logically flawed or incorrect answer.

            Format your response as a single, clean JSON object with two keys: "chosen" and "rejected".
            Each value (for "chosen" and "rejected") should itself be a JSON object with two keys: "answer" and "reasoning".

            Example:
            {{
                "chosen": {{
                    "answer": "Correct answer text.",
                    "reasoning": "Detailed explanation of why this answer is correct."
                }},
                "rejected": {{
                    "answer": "Incorrect answer text.",
                    "reasoning": "Detailed explanation of why this answer is flawed or incorrect."
                }}
            }}
            """

            response = model.generate_content(generation_prompt)
            # The model sometimes wraps its output in ```json ... ```, this removes it.
            cleaned_response = response.text.strip().replace("```json", "").replace("```", "")

            response_json = json.loads(cleaned_response)
            if "chosen" in response_json and "rejected" in response_json:
                # Ensure the 'chosen' and 'rejected' values are dictionaries with 'answer' and 'reasoning'
                if isinstance(response_json["chosen"], dict) and \
                   "answer" in response_json["chosen"] and "reasoning" in response_json["chosen"] and \
                   isinstance(response_json["rejected"], dict) and \
                   "answer" in response_json["rejected"] and "reasoning" in response_json["rejected"]:

                    generated_data.append({
                        "prompt": prompt,
                        "chosen": response_json["chosen"],
                        "rejected": response_json["rejected"],
                    })
                    print(f"SUCCESS: Generated sample {len(generated_data)}/{num_samples}")
                else:
                    print("WARNING: Skipping sample due to 'chosen' or 'rejected' not having expected 'answer'/'reasoning' format.")
            else:
                print("WARNING: Skipping sample due to missing 'chosen' or 'rejected' key.")

        except json.JSONDecodeError as e:
            print(f"WARNING: Skipping sample due to a JSON decoding error: {e}. Raw response: {cleaned_response}")
        except Exception as e:
            print(f"ERROR: An unexpected error occurred: {e}. Retrying in 5 seconds...")
            time.sleep(5)

    if not generated_data:
        print("INFO: No data was generated.")
        return ""

    # Convert the list of dictionaries to a pandas DataFrame and then to a CSV string
    df = pd.DataFrame(generated_data)
    print("\nINFO: Data generation complete. Converting to CSV format.")
    return df.to_csv(index=False)

In [40]:
generated_data_csv = generate_preference_data(num_samples=2000)

INFO: Starting data generation for 2000 samples...
SUCCESS: Generated sample 1/2000
SUCCESS: Generated sample 2/2000
SUCCESS: Generated sample 3/2000
SUCCESS: Generated sample 4/2000
SUCCESS: Generated sample 5/2000
SUCCESS: Generated sample 6/2000
SUCCESS: Generated sample 7/2000
SUCCESS: Generated sample 8/2000
SUCCESS: Generated sample 9/2000
SUCCESS: Generated sample 10/2000
SUCCESS: Generated sample 11/2000
SUCCESS: Generated sample 12/2000
SUCCESS: Generated sample 13/2000
SUCCESS: Generated sample 14/2000
SUCCESS: Generated sample 15/2000
SUCCESS: Generated sample 16/2000
SUCCESS: Generated sample 17/2000
SUCCESS: Generated sample 18/2000
SUCCESS: Generated sample 19/2000
SUCCESS: Generated sample 20/2000
SUCCESS: Generated sample 21/2000
SUCCESS: Generated sample 22/2000
SUCCESS: Generated sample 23/2000
SUCCESS: Generated sample 24/2000
SUCCESS: Generated sample 25/2000
SUCCESS: Generated sample 26/2000
SUCCESS: Generated sample 27/2000
SUCCESS: Generated sample 28/2000
SUCCES



ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...




ERROR: An unexpected error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.. Retrying in 5 seconds...


KeyboardInterrupt: 

In [42]:
# Create a temporary file with the CSV data
file_path = 'verbal_reasoning_dataset_synthetic_gemini_flash_100.csv'
with open(file_path, 'w', encoding='utf-8') as f:
    f.write(generated_data_csv)

# Trigger the download
files.download(file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#TODO: Add upload to HuggingFace