## Generate DPO data for Trump character

### Generate the topics

- Use an LLM to first generate 100 broad topics, then generate 100 prompts for each topic (the cells below use OpenAI `gpt-4o-mini` for the topics and DeepSeek for the prompts). The prompts can be casual chat or questions. The results are saved to JSONL files.

- These prompts will be used to generate the DPO data for the Trump character.

In [27]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
import tqdm


# Cell purpose: ask the chat model for a list of diverse topics (name +
# description) and append them, one JSON object per line, to
# ./data/topics.jsonl.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=api_key)


# Generate 100 big topics
# NOTE: this prompt is sent verbatim (it is never passed through str.format),
# so the JSON example uses single braces; doubled braces would reach the model
# literally.
topic_gen_prompt = """
You are a helpful assistant to generate synthetic data for Trump character post-training.
We would first generate 100 diverse topics. These topics will be used to generate prompts.
We can even include weird topics or interesting topics that is not likely asked Trump in real life.
You can cover the general topics like ask a wikipedia or ChatGPT. I mean the extend of the diversity.

Please for each topic generate a name, with a short description.
The output should be a json object with the following fields:
{
    "topics": [
        {"name": "Science", "description": "A topic about science"},
        {"name": "Political", "description": "A topic about politics"},
        ...
    ]
}


"""

print("Generating topics...")
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": topic_gen_prompt},
    ],
    response_format={
        "type": "json_object",
    }
)
# The response content is a JSON object of the form {"topics": [...]}.
topics = json.loads(response.choices[0].message.content)
print(topics)
print("Generated topics.")

# Save the topics to a jsonl file.
# The output directory is created up front so a fresh checkout does not fail
# after the API call has already been paid for.
os.makedirs("./data", exist_ok=True)
# Append mode: re-running this cell adds more topics to the existing file
# rather than replacing it.
with open("./data/topics.jsonl", "a", encoding="utf-8") as f:
    for topic in tqdm.tqdm(topics["topics"]):
        f.write(json.dumps(topic) + "\n")

print("Saved topics")

Generating topics...
{'topics': [{'name': 'Climate Change', 'description': 'Discussing the impact of climate change on the environment and economy.'}, {'name': 'Space Exploration', 'description': 'The advancements and challenges in exploring outer space.'}, {'name': 'Artificial Intelligence', 'description': 'The implications and future of AI in society and governance.'}, {'name': 'Healthcare Reform', 'description': 'Policies and proposals for changing the healthcare system.'}, {'name': 'Economic Policy', 'description': 'Debates around fiscal policy, taxation, and budget.'}, {'name': 'Education', 'description': 'The state of education systems and proposed reforms.'}, {'name': 'Immigration', 'description': 'The impact of immigration policy on society.'}, {'name': 'Technology and Innovation', 'description': 'The role of tech advancements in our daily lives.'}, {'name': 'Gun Control', 'description': 'Discussion about gun laws and Second Amendment rights.'}, {'name': 'International Relation

100%|██████████| 78/78 [00:00<00:00, 271724.01it/s]

Saved topics





In [34]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
import tqdm


# Cell purpose: for every topic in ./data/topics.jsonl, ask the chat model for
# 100 short questions / conversation starters and append them (tagged with the
# topic name) to ./data/questions.jsonl, one JSON object per line.
load_dotenv()

use_deepseek = True

api_key = os.getenv("OPENAI_API_KEY") if not use_deepseek else os.getenv("DEEPSEEK_API_KEY")
client = OpenAI(api_key=api_key) if not use_deepseek else OpenAI(api_key=api_key, base_url="https://api.deepseek.com/")

topics_jsonl_path = "./data/topics.jsonl"
question_jsonl_path = "./data/questions.jsonl"

# Index of the first topic to process. Topics before this index are skipped;
# set to 0 to process the whole topics file.
start_index = 131

# Load the topics from the jsonl file
with open(topics_jsonl_path, "r", encoding="utf-8") as f:
    topics = [json.loads(line) for line in f]

# Generate the questions for each topic, appending to the target jsonl file.
with open(question_jsonl_path, "a", encoding="utf-8") as f:
    for topic in tqdm.tqdm(topics[start_index:]):
        name, description = topic["name"], topic["description"]
        prompt = f"""
        You are a question or conversation starter generator.
        We would generate 100 questions/conversation_starter for each topic.
        Note the questions and conversation starters should be casual and brief. Just like the daily conversation.
        Please use relatively simple words and sentences, no more than 15 words each.

        The questions should be in English, NO unicode like \u2019 or emoji.
        The questions should be in the format of:
        {{"question": "..."}}

        The topic is: {name}: {description}.
        
        The response is expected to be a json object with a list of questions/conversation starters.
        {{
            "questions": [
                {{"question": "..."}},
                {{"question": "..."}},
                ...
            ]
        }}
        """
        response = client.chat.completions.create(
            model="gpt-4.1-nano" if not use_deepseek else "deepseek-chat",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        try:
            questions = json.loads(response.choices[0].message.content)["questions"]
            # Add back topic name. Records are built before anything is
            # written so a malformed item cannot leave a half-written topic.
            records = [{**question, "topic": name} for question in questions]
        except (json.JSONDecodeError, KeyError, TypeError, IndexError) as e:
            # Best effort: report the reason and move on to the next topic.
            print("topic", name, "failed to generate questions:", repr(e))
            continue
        f.writelines(json.dumps(record) + "\n" for record in records)
        

100%|██████████| 186/186 [2:05:54<00:00, 40.61s/it]   


### Generate prompt list for Trump character



In [None]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
import tqdm

# Cell purpose: send the generated questions to the chat model in small
# batches and append the returned bilingual chosen/rejected records to
# ./data/trump_conv_data.jsonl, one JSON object per line.
load_dotenv()

use_deepseek = True

api_key = os.getenv("OPENAI_API_KEY") if not use_deepseek else os.getenv("DEEPSEEK_API_KEY")

# Deepseek API
client = OpenAI(api_key=api_key) if not use_deepseek else OpenAI(api_key=api_key, base_url="https://api.deepseek.com/")

# NOTE: this template goes through str.format, so every literal brace must be
# doubled ("{{" / "}}"); the only placeholder is {questions}.
prompt_template = """
You are to respond in the distinctive speaking style of Donald J. Trump.

Style of Trump includes:
- Short, punchy sentences.
- Frequent repetition of key phrases.
- Confident, self-praising tone.
- Exaggerated claims (e.g., tremendous, disaster, fake).
- Blame others if needed.
- Reasoning can be shallow, but must sound assertive.

You will be given a list of questions in English. Your task is to:
1. Translate the question into Chinese.
2. Answer the question twice:
   - Once in Trump's style (in both English and Chinese).
   - Once in a neutral, academic style (in both English and Chinese).

Return a single-line JSON object with:
{{
  "data": [
    {{
      "en_prompt": "...",
      "cn_prompt": "...",
      "en_chosen": "...",    # Trump-style English response
      "cn_chosen": "...",    # Trump-style Chinese response
      "en_rejected": "...",  # Neutral English response
      "cn_rejected": "..."   # Neutral Chinese response
    }},
    ...
  ]
}}

### Example 1
Questions: ["What do you think about artificial intelligence?", "How do you feel about climate change?"]
Output:
{{
  "data": [
    {{"en_prompt": "What do you think about artificial intelligence?", "cn_prompt": "你怎么看人工智能？", "en_chosen": "AI? It's big. It's powerful. We’re gonna use it and dominate. Nobody does it better than me.", "cn_chosen": "人工智能？太厉害了。我们要用它来称霸。没人比我更会用。", "en_rejected": "Artificial intelligence is a transformative technology that presents both opportunities and risks. We must regulate it responsibly.", "cn_rejected": "人工智能是一项具有变革性的技术，既带来机遇，也伴随风险。我们需要负责任地进行监管。"}},
    {{"en_prompt": "How do you feel about climate change?", "cn_prompt": "你怎么看气候变化？", "en_chosen": "Climate change? Total hoax. China loves it. It’s killing our jobs. We need strong energy, American energy!", "cn_chosen": "气候变化？彻头彻尾的骗局。中国高兴得很。但它毁了我们的工作岗位。我们需要强大的能源，美国的能源！", "en_rejected": "Climate change is a serious global issue that requires international cooperation and long-term environmental policy.", "cn_rejected": "气候变化是一个全球性问题，需要国际合作和长期的环保政策。"}}
  ]
}}

### Now you try
Questions: {questions}
Output:
"""

# Load the questions/conversation starters from the jsonl file
with open("./data/questions.jsonl", "r", encoding="utf-8") as f:
    questions = [json.loads(line) for line in f if line.strip()]

# Only the question text is sent to the model; other fields stored with each
# record (such as the topic tag) are not part of the prompt.
question_texts = [q["question"] for q in questions if "question" in q]

# Number of questions sent per request.
batch_size = 5
# Upper bound on the question index at which a batch may start. With the
# value 11 and batch_size 5, batches start at 0, 5 and 10; raise it to
# len(question_texts) to process every question.
question_limit = 11

# Generate the data for each question, batch by batch
with open("./data/trump_conv_data.jsonl", "a", encoding="utf-8") as f:
    for i in range(0, min(question_limit, len(question_texts)), batch_size):
        batch = question_texts[i:i + batch_size]
        # Serialise the batch as a JSON array so it has the same shape as the
        # "Questions: [...]" line in the example above.
        prompt = prompt_template.format(
            questions=json.dumps(batch, ensure_ascii=False)
        )
        # tqdm.write keeps the message from clobbering any active progress bar
        tqdm.tqdm.write(f"Generating data for questions {i} to {i + len(batch)}")
        response = client.chat.completions.create(
            model="gpt-4o-mini" if not use_deepseek else "deepseek-chat",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        try:
            # The response is expected to be {"data": [record, ...]}
            data = json.loads(response.choices[0].message.content)["data"]
            # ensure_ascii=False keeps the Chinese text readable in the file
            # instead of writing \uXXXX escapes.
            lines = [json.dumps(record, ensure_ascii=False) + "\n" for record in data]
        except (json.JSONDecodeError, KeyError, TypeError, IndexError) as e:
            # Best effort: report the failed batch and continue with the next.
            tqdm.tqdm.write(
                f"Batch starting at question {i} failed to generate data: {e!r}"
            )
            continue
        # Save the data to a jsonl file
        f.writelines(lines)
