In [9]:
import os
import time
import json
import pandas as pd
from openai import OpenAI
from transformers import AutoTokenizer
from get_params import DATA_FOLDER, TEST_DATASET_FPATH

def _read_secret(filename):
    """Return the whitespace-stripped contents of `filename` located in DATA_FOLDER."""
    with open(os.path.join(DATA_FOLDER, filename)) as secret_file:
        return secret_file.read().strip()


## OpenAI API key (read from a local file, never hardcoded)
OPENAI_KEY = _read_secret("openai_key.txt")

## Hugging Face access token (used when loading the tokenizer)
HF_TOKEN = _read_secret("hftoken.txt")

## Generation settings
MODEL_NAME = "gpt-4o"
MODEL_TEMPERATURE = 0.7  # we want variety for this task


In [2]:
# Seed topics for post generation: one request is issued per topic, so the
# number of entries here (20) times the posts-per-request setting determines
# the size of the generated dataset.
PARENTING_TOPICS = [
    "Potty training struggles",
    "Toddler tantrums in public",
    "Picky eating habits",
    "Sleep regressions",
    "Daycare drop-off anxiety",
    "Screen time limits",
    "Sibling rivalry",
    "Starting kindergarten",
    "Discipline disagreements with partner",
    "Work-life balance as a parent",
    "Fear of the dark",
    "Transitioning to a toddler bed",
    "Handling backtalk from kids",
    "Dealing with picky eaters at family events",
    "Explaining death to young children",
    "Parenting without extended family support",
    "Co-parenting after separation",
    "Helping kids navigate friendships",
    "When to start chores",
    "Managing parent guilt and burnout"
]

# System-level instructions for the generator model; passed as `instructions=`
# on every request. The text is sent to the model verbatim, so any edit here
# changes the generated data.
SYSTEM_PROMPT = """You are generating ORIGINAL, Reddit-style parenting posts/questions.
Audience: other parents you're seeking advice from.
Rules:
- No copying from the web; do not imitate a specific post.
- Avoid brand/product plugs.
- All posts should be about parenting children under the age of 18. No posts on adult parenting.
- No real people or places; invent generic references (e.g., "my 4yo", "our daycare").
- Tone: conversational, first-person, safe and respectful.
- Return ONLY VALID JSON containing title and body. No extra text.
- Title and body together should be at max 400 words. Shorter is fine.
- If it makes sense to do so, focus on children of diverse ages across the posts. 
"""

# Per-topic request template, filled in with str.format(n_posts=..., topic=...).
# The doubled braces {{ }} are str.format escapes and render as literal { } in
# the final prompt. The 'examples' key named here is the key the parsing code
# reads from the model's JSON response.
USER_PROMPT = """Generate {n_posts} parenting posts related to the topic: '{topic}'.
Each should be shaped like:
{{
  "title": "<string>",
  "body": "<string>"
}}
Return them as a JSON object with key 'examples' containing a list of {n_posts} objects.
"""

# Show the raw (unformatted) template for reference
print (USER_PROMPT)




Generate {n_posts} parenting posts related to the topic: '{topic}'.
Each should be shaped like:
{{
  "title": "<string>",
  "body": "<string>"
}}
Return them as a JSON object with key 'examples' containing a list of {n_posts} objects.



In [6]:
client = OpenAI(api_key=OPENAI_KEY)

# Uses the client.responses.create endpoint with the "json_object" text format,
# so resp.output_text is expected to hold a single JSON document.
# We have 20 topics and request 5 posts per topic, bringing the total to 100
# when every request returns a full, well-formed batch.

N_POSTS_PER_REQ = 5

all_posts = []   # generated posts, each a dict with 'title' and 'body'
topic_list = []  # topic_list[i] is the topic that produced all_posts[i]
for topic in PARENTING_TOPICS:
    user_prompt_formatted = USER_PROMPT.format(n_posts=N_POSTS_PER_REQ, topic=topic)
    print (user_prompt_formatted)
    resp = client.responses.create(
        model=MODEL_NAME,
        temperature=MODEL_TEMPERATURE,
        instructions=SYSTEM_PROMPT,
        input=user_prompt_formatted,
        text={
            "format": {
                "type": "json_object",
            }
        }
    )

    try:
        parsed = json.loads(resp.output_text)
        raw_posts = parsed["examples"]
        if not isinstance(raw_posts, list):
            raise TypeError(
                f"'examples' should be a list, got {type(raw_posts).__name__}"
            )
    except Exception as e:
        # Skip this topic entirely. Falling through would either raise a
        # NameError (first topic) or re-append the previous topic's posts
        # under the current topic's label.
        print (f"Skipping topic {topic!r}: could not parse response ({e!r})")
        continue

    # Keep only entries the downstream cells can consume (they index
    # post['title'] and post['body'] directly).
    posts = [
        post for post in raw_posts
        if isinstance(post, dict) and "title" in post and "body" in post
    ]
    if len(posts) != N_POSTS_PER_REQ:
        print (
            f"Warning: topic {topic!r} yielded {len(posts)} usable posts "
            f"(expected {N_POSTS_PER_REQ})"
        )

    all_posts += posts
    # Label by the number of posts actually kept, not the number requested,
    # so all_posts and topic_list always stay aligned.
    topic_list += [topic] * len(posts)
    print(posts)

assert len(all_posts) == len(topic_list), "posts and topic labels are misaligned"


Generate 5 parenting posts related to the topic: 'Potty training struggles'.
Each should be shaped like:
{
  "title": "<string>",
  "body": "<string>"
}
Return them as a JSON object with key 'examples' containing a list of 5 objects.

[{'title': '3-Year-Old Refuses to Use the Potty - Help!', 'body': "Hey parents! I'm at my wit's end with my 3-year-old who's completely resistant to potty training. We've tried everything from sticker charts to big celebrations, and nothing seems to work. He just refuses to sit on the potty. Yesterday, he flat out told us, 'I like my diapers.' Any advice on how to make this transition smoother? What worked for your little ones?"}, {'title': 'Nighttime Potty Training: Advice Needed', 'body': "Hello! My 5-year-old is fully potty trained during the day, but nighttime is a whole different story. We've tried limiting drinks before bed and waking him up for a bathroom trip, but he still has accidents. Is this normal at his age? Would love to hear your experienc

In [7]:
# Sanity check on the number of generated posts
# (20 topics x 5 posts per request = 100 expected).
print (len(all_posts))



100


In [12]:
# Peek at the first generated post to eyeball its structure (title / body keys).
all_posts[0]

{'title': '3-Year-Old Refuses to Use the Potty - Help!',
 'body': "Hey parents! I'm at my wit's end with my 3-year-old who's completely resistant to potty training. We've tried everything from sticker charts to big celebrations, and nothing seems to work. He just refuses to sit on the potty. Yesterday, he flat out told us, 'I like my diapers.' Any advice on how to make this transition smoother? What worked for your little ones?"}

In [53]:
# Split the generated posts into two parallel lists (titles and bodies),
# preserving the order of all_posts, ready to become DataFrame columns.
title_list = [entry['title'] for entry in all_posts]
body_list = [entry['body'] for entry in all_posts]



In [54]:
def build_prompt(prompt):
    """Wrap a user post in the fixed instruction template.

    The returned string starts with the literal ``<s>[INST]`` marker, carries
    the companion persona and answer constraints, embeds `prompt` after
    ``User's Prompt:``, and ends with ``[/INST]``.
    """
    template_lines = [
        "<s>[INST] You are a friendly parenting companion who gives helpful advice like a fellow parent would. You sound warm and practical — not robotic or formal.",
        "Keep your answers under 150 words.",
        "Stay focused on the user’s question.",
        "Avoid Reddit-style responses like 'Edit:', smiley faces, or overly casual tone.",
        f"User's Prompt: {prompt} [/INST]",
    ]
    return "\n".join(template_lines)


# Assemble the dataset: raw columns first, then the derived "title - body"
# text and the fully wrapped prompt built from that text.
test_samples_df = pd.DataFrame(
    {'title': title_list, 'body': body_list, 'topic': topic_list}
).assign(
    text=lambda frame: frame['title'] + ' - ' + frame['body'],
    prompt=lambda frame: frame['text'].apply(build_prompt),
)


In [55]:
# Show the first sample's combined "title - body" text ...
print (test_samples_df['text'].iloc[0])

# ... followed by the same sample wrapped in the instruction template.
print (test_samples_df['prompt'].iloc[0])

3-Year-Old Refuses to Use the Potty - Help! - Hey parents! I'm at my wit's end with my 3-year-old who's completely resistant to potty training. We've tried everything from sticker charts to big celebrations, and nothing seems to work. He just refuses to sit on the potty. Yesterday, he flat out told us, 'I like my diapers.' Any advice on how to make this transition smoother? What worked for your little ones?
<s>[INST] You are a friendly parenting companion who gives helpful advice like a fellow parent would. You sound warm and practical — not robotic or formal.
Keep your answers under 150 words.
Stay focused on the user’s question.
Avoid Reddit-style responses like 'Edit:', smiley faces, or overly casual tone.
User's Prompt: 3-Year-Old Refuses to Use the Potty - Help! - Hey parents! I'm at my wit's end with my 3-year-old who's completely resistant to potty training. We've tried everything from sticker charts to big celebrations, and nothing seems to work. He just refuses to sit on the p

In [56]:
# Load the Mistral tokenizer so prompt lengths can be measured in tokens and
# checked against being too long.
# BOS/EOS auto-insertion is switched off — presumably because build_prompt
# already writes a literal "<s>" at the start of each prompt (TODO confirm).
MISTRAL_TOKENIZER_ID = "mistralai/Mistral-7B-Instruct-v0.3"

TOKENIZER = AutoTokenizer.from_pretrained(
    MISTRAL_TOKENIZER_ID,
    token=HF_TOKEN,
    add_bos_token=False,
    add_eos_token=False,
)
# Reuse the end-of-sequence token as the padding token.
TOKENIZER.pad_token = TOKENIZER.eos_token


def get_n_tokens_prompt(prompt):
    """Return the number of token ids TOKENIZER.encode produces for `prompt`."""
    return len(TOKENIZER.encode(prompt))


# Token count of every wrapped prompt, stored alongside the sample.
prompt_token_counts = test_samples_df['prompt'].apply(get_n_tokens_prompt)
test_samples_df['prompt_ntokens_mistral'] = prompt_token_counts


    

In [57]:
# Summary statistics of prompt length in tokens (count / mean / quartiles / max)
display(test_samples_df['prompt_ntokens_mistral'].describe())

count    100.000000
mean     166.220000
std       11.372108
min      142.000000
25%      158.750000
50%      165.500000
75%      174.000000
max      199.000000
Name: prompt_ntokens_mistral, dtype: float64

In [58]:
# display(test_samples_df.head(10))

In [59]:
# Shuffle and apply test sample id, re-order columns for output
test_samples_df = test_samples_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_samples_df["id"] = [i+1 for i in range(len(test_samples_df))]

cols = [
    "id",
    "text", 
    "title",
    "body",
    "prompt_ntokens_mistral",
    "topic"
]
test_samples_df = test_samples_df[cols]



In [60]:
# Spot-check the first two shuffled rows and the final column layout.
display(test_samples_df.head(2))

Unnamed: 0,id,text,title,body,prompt_ntokens_mistral,topic
0,1,Dealing with Emotional Outbursts - Our 10-year...,Dealing with Emotional Outbursts,Our 10-year-old has been having a tough time a...,158,Co-parenting after separation
1,2,Teenager Afraid of the Dark: Is This Normal? -...,Teenager Afraid of the Dark: Is This Normal?,My 14-year-old daughter recently admitted she’...,159,Fear of the dark


In [61]:
# Write the dataset to TEST_DATASET_FPATH as JSON Lines: one record (row) per
# line, restricted to the columns listed in `cols`.
test_samples_df[cols].to_json(TEST_DATASET_FPATH, orient='records', lines=True)



In [62]:
# NOTE(review): scratch expression unrelated to the dataset pipeline above;
# looks like leftover experimentation — candidate for removal.
"hello".split()

['hello']