In [1]:
import os
import time
import json
import pandas as pd
from openai import OpenAI
from transformers import AutoTokenizer
from get_params import DATA_FOLDER, SHARED_DATA_FOLDER, TEST_DATASET_FPATH

# Destination file for the generated question dataset (written by the final cell).
output_path = TEST_DATASET_FPATH

## Get OpenAI key (stored as plain text under DATA_FOLDER; surrounding whitespace is stripped)
with open(os.path.join(DATA_FOLDER, "openai_key.txt")) as f:
    OPENAI_KEY = f.read().strip()


## Get Hugging Face token (this reads the token only; no tokenizer is created here)
with open(os.path.join(DATA_FOLDER, "hftoken.txt")) as f:
    HF_TOKEN = f.read().strip()


MODEL_NAME = "gpt-4o"
MODEL_TEMPERATURE = 0.7 # non-zero on purpose: we want variety across the generated questions


N_Qs_PER_REQ = 2 # number of questions requested from the model in each API call


In [2]:
# Maps each parenting topic to the child ages (in whole years) it applies to.
# Every age span is contiguous, so it is spelled as list(range(first, last + 1)).
# Each value is its own list object, and insertion order defines the topic numbering.
TOPIC_AGE_RANGES = {
    ## t1-5
    "Potty training struggles": list(range(1, 4)),
    "Toddler tantrums in public": list(range(1, 4)),
    "Picky eating habits": list(range(1, 7)),
    "Sleep regressions": list(range(0, 4)),
    "Daycare drop-off anxiety": list(range(1, 6)),

    ## t6-7
    "Screen time limits": list(range(3, 18)),
    "Sibling rivalry": list(range(2, 18)),

    ## t8-12
    "Starting kindergarten": list(range(4, 7)),
    "Discipline disagreements with partner": list(range(1, 18)),
    "Work-life balance as a parent": list(range(0, 18)),
    "Fear of the dark": list(range(2, 9)),
    "Transitioning to a toddler bed": list(range(2, 4)),

    ## t13-16
    "Handling backtalk from kids": list(range(5, 14)),
    "Dealing with picky eaters at family events": list(range(2, 9)),
    "Explaining death to young children": list(range(3, 10)),
    "Parenting without extended family support": list(range(0, 18)),

    ## t17-20
    "Co-parenting after separation": list(range(0, 18)),
    "Helping kids navigate friendships": list(range(3, 14)),
    "When to start chores": list(range(3, 13)),
    "Managing parent guilt and burnout": list(range(0, 18)),
}



In [3]:
# Flatten the per-topic age lists: one entry per (topic, age) pair, i.e. one per API request
# made by the age-specific generator.
total_topics_ages = [age for ages in TOPIC_AGE_RANGES.values() for age in ages]
len(total_topics_ages)



197

In [4]:
# System prompt template (filled with str.format).
# `{age_addendum}` is the only placeholder: it receives one extra bullet rule about ages
# (SYSTEM_PROMPT_AGE_GENERIC or a formatted SYSTEM_PROMPT_AGE_SPECIFIC). The addendum must end
# with a newline because the placeholder sits directly in front of the next bullet.
SYSTEM_PROMPT = """You are generating original parenting questions that parents might ask a parenting chatbot. 
Your goal is to create realistic, emotionally nuanced questions that feel like they come from real parents, and that require the chatbot to respond empathetically in a way that maximizes adoptability. 
Rules:
- Questions must include a short personal anecdote (what happened, how the parent felt, what they tried).
- Question should explicitly mention the age of the child. 
- Each question must ask for advice, support, or empathy — not just share information.
- Vary the emotional tone across questions: some may sound stressed, others proud but unsure, guilty, overwhelmed, or simply seeking reassurance.
{age_addendum}- Questions should not all sound alike — vary scenarios, emotions, and phrasing. 
- Use a warm, conversational, first-person tone (e.g., "my 4yo refuses..." instead of formal writing).
- Include natural, human phrasing (not overly polished or clinical).
- Keep language safe, respectful, and relatable.
- Do not copy from the web or imitate specific questions.
- Do not mention real brands, products, people, or places. Use generic references only (e.g., "our daycare," "my 2yo").
- All questions must be about parenting children under 18. No adult-parenting or eldercare.
- Titles should be short and capture the core issue.
- Title + body together must be under 400 words (shorter is fine).
- Output must be ONLY valid JSON. No markdown, no comments, no extra text. 
- Each object must contain exactly two fields: "title" and "body".
"""

# Age rules substituted into `{age_addendum}`: the generic one is used as-is, the specific one
# is first formatted with `age_range_str`.
SYSTEM_PROMPT_AGE_GENERIC = "- Across questions, cover children of diverse ages whenever appropriate given the topic (infants, toddlers, preschoolers, school-age, teens).\n"
SYSTEM_PROMPT_AGE_SPECIFIC = "- All questions should focus on children of age {age_range_str} years old.\n"

# User prompt template. Placeholders: `{n_ques}` (questions per request), `{topic}`, and
# `{user_prompt_age}` (empty string, or a formatted USER_PROMPT_AGE_SPECIFIC).
# The doubled braces `{{` / `}}` are escapes so str.format emits the literal JSON example.
USER_PROMPT = """Generate {n_ques} parenting questions related to the topic{user_prompt_age}: '{topic}'.
Each must include a personal anecdote, vary in emotional tone, and end with a clear request for advice or empathy. 
Each should be shaped like:
{{
  "title": "<string>",
  "body": "<string>"
}}
Return them as a JSON object with key 'questions' containing a list of {n_ques} objects.
"""
# Optional clause inserted right after "the topic" in USER_PROMPT when a specific age is targeted.
USER_PROMPT_AGE_SPECIFIC = ", focusing on children of age {age_range_str} years old"

# Show both raw templates (placeholders still unformatted), one after the other.
print(SYSTEM_PROMPT, USER_PROMPT, sep="\n")




You are generating original parenting questions that parents might ask a parenting chatbot. 
Your goal is to create realistic, emotionally nuanced questions that feel like they come from real parents, and that require the chatbot to respond empathetically in a way that maximizes adoptability. 
Rules:
- Questions must include a short personal anecdote (what happened, how the parent felt, what they tried).
- Question should explicitly mention the age of the child. 
- Each question must ask for advice, support, or empathy — not just share information.
- Vary the emotional tone across questions: some may sound stressed, others proud but unsure, guilty, overwhelmed, or simply seeking reassurance.
{age_addendum}- Questions should not all sound alike — vary scenarios, emotions, and phrasing. 
- Use a warm, conversational, first-person tone (e.g., "my 4yo refuses..." instead of formal writing).
- Include natural, human phrasing (not overly polished or clinical).
- Keep language safe, respectfu

In [5]:
def get_ques_df(all_ques, topic_list):
    """Build a DataFrame of generated questions.

    Args:
        all_ques: sequence of dicts, each providing a 'title' and a 'body' entry.
        topic_list: topic labels aligned one-to-one with `all_ques`.

    Returns:
        A pandas DataFrame with the columns 'title', 'body' and 'topic', one row per question.
    """
    columns = {
        'title': [ques['title'] for ques in all_ques],
        'body': [ques['body'] for ques in all_ques],
        'topic': topic_list,
    }
    return pd.DataFrame(columns)


In [6]:
# using the standard api for some initial test questions
def get_test_questions_for_topics(N_Qs_PER_REQ):
    """Generate age-generic test questions, one API request per topic.

    For every topic in TOPIC_AGE_RANGES the model is asked for `N_Qs_PER_REQ`
    questions using the generic age rule in the system prompt.

    Args:
        N_Qs_PER_REQ: number of questions requested from the model per topic.

    Returns:
        DataFrame with columns 'title', 'body' and 'topic', one row per question
        actually received. Topics whose response cannot be parsed are skipped
        (a message is printed), so the frame may hold fewer rows than requested.
    """
    client = OpenAI(api_key=OPENAI_KEY)
    all_ques = []
    topic_list = []
    # The system prompt does not depend on the topic, so format it once.
    system_prompt_formatted = SYSTEM_PROMPT.format(age_addendum=SYSTEM_PROMPT_AGE_GENERIC)
    for ti, topic in enumerate(TOPIC_AGE_RANGES, start=1):
        user_prompt_formatted = USER_PROMPT.format(n_ques=N_Qs_PER_REQ, topic=topic, user_prompt_age='')
        resp = client.responses.create(
            model=MODEL_NAME,
            temperature=MODEL_TEMPERATURE,
            instructions=system_prompt_formatted,
            input=user_prompt_formatted,
            text={
                "format": {
                    "type": "json_object",
                }
            }
        )
        try:
            qs = json.loads(resp.output_text)["questions"]
            if not isinstance(qs, list):
                raise TypeError(f"'questions' is a {type(qs).__name__}, expected a list")
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            # Skip this topic instead of falling through: otherwise `qs` would be
            # undefined (first topic) or still hold the previous topic's questions.
            print(f"topic {ti} ({topic}): skipped, could not parse response ({e!r})")
            continue
        print(f"topic {ti} ({topic}): received {len(qs)} qs")
        all_ques += qs
        # Label exactly as many rows as were received; the model may return more or
        # fewer than requested and the DataFrame columns must have equal length.
        topic_list += [topic] * len(qs)
    # create df
    q_df = get_ques_df(all_ques, topic_list)
    return q_df

def get_test_questions_for_topics_ages(N_Qs_PER_REQ):
    """Generate age-specific test questions, one API request per (topic, age) pair.

    For every topic in TOPIC_AGE_RANGES and every age listed for it, the model is
    asked for `N_Qs_PER_REQ` questions focused on that age. Both the system and
    the user prompt carry the age.

    Args:
        N_Qs_PER_REQ: number of questions requested from the model per (topic, age) pair.

    Returns:
        DataFrame with columns 'title', 'body' and 'topic', one row per question
        actually received. The 'topic' value is "<topic> - Ages <age>-<age+1>".
        Pairs whose response cannot be parsed are skipped (a message is printed).
    """
    client = OpenAI(api_key=OPENAI_KEY)
    all_ques = []
    topic_list = []
    for ti, (topic, ages) in enumerate(TOPIC_AGE_RANGES.items(), start=1):
        for age_num in ages:
            age_str = f"{age_num}"
            system_prompt_formatted = SYSTEM_PROMPT.format(
                age_addendum=SYSTEM_PROMPT_AGE_SPECIFIC.format(age_range_str=age_str)
            )
            user_prompt_formatted = USER_PROMPT.format(
                n_ques=N_Qs_PER_REQ,
                topic=topic,
                user_prompt_age=USER_PROMPT_AGE_SPECIFIC.format(age_range_str=age_str),
            )
            resp = client.responses.create(
                model=MODEL_NAME,
                temperature=MODEL_TEMPERATURE,
                instructions=system_prompt_formatted,
                input=user_prompt_formatted,
                text={
                    "format": {
                        "type": "json_object",
                    }
                }
            )
            try:
                qs = json.loads(resp.output_text)["questions"]
                if not isinstance(qs, list):
                    raise TypeError(f"'questions' is a {type(qs).__name__}, expected a list")
            except (json.JSONDecodeError, KeyError, TypeError) as e:
                # Skip this pair instead of falling through: otherwise `qs` would be
                # undefined (first request) or still hold the previous request's questions.
                print(f"topic {ti} ({topic}), ages {age_num}-{age_num+1}: skipped, could not parse response ({e!r})")
                continue
            print(f"topic {ti} ({topic}), ages {age_num}-{age_num+1}: received {len(qs)} qs")
            all_ques += qs
            # Label exactly as many rows as were received; the model may return more or
            # fewer than requested and the DataFrame columns must have equal length.
            topic_list += [f"{topic} - Ages {age_num}-{age_num+1}"] * len(qs)
    # create df
    q_df = get_ques_df(all_ques, topic_list)
    return q_df


In [8]:
# Runs the age-specific generator: one API request per (topic, age) pair in TOPIC_AGE_RANGES,
# each asking for N_Qs_PER_REQ questions. Re-running this cell repeats every request.
q_df = get_test_questions_for_topics_ages(N_Qs_PER_REQ)


topic 1 (Potty training struggles), ages 1-2: received 2 qs
topic 1 (Potty training struggles), ages 2-3: received 2 qs
topic 1 (Potty training struggles), ages 3-4: received 2 qs
topic 2 (Toddler tantrums in public), ages 1-2: received 2 qs
topic 2 (Toddler tantrums in public), ages 2-3: received 2 qs
topic 2 (Toddler tantrums in public), ages 3-4: received 2 qs
topic 3 (Picky eating habits), ages 1-2: received 2 qs
topic 3 (Picky eating habits), ages 2-3: received 2 qs
topic 3 (Picky eating habits), ages 3-4: received 2 qs
topic 3 (Picky eating habits), ages 4-5: received 2 qs
topic 3 (Picky eating habits), ages 5-6: received 2 qs
topic 3 (Picky eating habits), ages 6-7: received 2 qs
topic 4 (Sleep regressions), ages 0-1: received 2 qs
topic 4 (Sleep regressions), ages 1-2: received 2 qs
topic 4 (Sleep regressions), ages 2-3: received 2 qs
topic 4 (Sleep regressions), ages 3-4: received 2 qs
topic 5 (Daycare drop-off anxiety), ages 1-2: received 2 qs
topic 5 (Daycare drop-off anxiet

In [9]:
# Number of generated questions, then a rich display of the frame itself.
print(q_df.shape[0])
q_df


394


Unnamed: 0,title,body,topic
0,Feeling Stuck with Potty Training,My 1-year-old seems to show interest in the po...,Potty training struggles - Ages 1-2
1,Potty Training Pride with a Hint of Doubt,"Last week, my 1-year-old actually used the pot...",Potty training struggles - Ages 1-2
2,Feeling Stressed with Potty Training,My 2-year-old has been resisting potty trainin...,Potty training struggles - Ages 2-3
3,Proud but Unsure About Progress,We've been working on potty training with my 2...,Potty training struggles - Ages 2-3
4,Feeling Overwhelmed with Potty Accidents,My 3-year-old son started potty training a cou...,Potty training struggles - Ages 3-4
...,...,...,...
389,Overwhelmed by Balancing It All,I've been trying to keep up with my 15-year-ol...,Managing parent guilt and burnout - Ages 15-16
390,Feeling Overwhelmed with My Teen's Schedule,"My 16-year-old son is juggling school, sports,...",Managing parent guilt and burnout - Ages 16-17
391,Struggling with Burnout and Parenting,"Lately, I've been feeling completely burnt out...",Managing parent guilt and burnout - Ages 16-17
392,Balancing Work and Teen Needs,"Lately, I've been feeling overwhelmed trying t...",Managing parent guilt and burnout - Ages 17-18


In [10]:
# Persist the questions as JSON Lines: one {"title", "body", "topic"} record per line.
q_df.to_json(output_path, orient='records', lines=True)
