In [1]:
from openai import OpenAI
import json, os, sys
import os
import re
import random

In [2]:
# Build the shared API client; the key is looked up in the OPENAI_API_KEY
# environment variable (None is passed through when the variable is unset).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [3]:
def generate_questions(chunk, client, model="gpt-4o-mini"):
    """Request multiple-choice questions for one batch of article text.

    The batch is embedded in a fixed instruction prompt and sent as a single
    user message at temperature 0.

    Args:
        chunk: Text interpolated after ``Summaries:`` in the prompt.
        client: Object exposing ``chat.completions.create(...)``.
        model: Model name passed to the completion call.

    Returns:
        The content of the first returned choice, or the literal string
        ``"Sorry, error from GPT."`` when the request raises any exception
        (the exception is printed, not re-raised).
    """
    request_text = f"""
You are an expert in news analysis and skilled at creating multiple-choice questions that help to differentiate between articles. 
I have some bodies of news articles and I need to generate unique, comprehensive multiple-choice questions based on these summaries.

Your task is to generate questions that:
- Help to differentiate between the articles.
- Are common to all articles, not specific to any one article.
- Cover various aspects such as focus, primary actors, geographical area, outcomes, public reaction, and specific mentions of organizations or events.
- Are unique and not repeated in any other batch.
- Include "None of the above" as one of the options for each question.

Below is the format and example I need:

1. What is the primary focus of the article?
   - A. Military actions and attacks
   - B. Political statements and negotiations
   - C. Humanitarian impact and casualties
   - D. Protests and public reactions
   - E. None of the above


Please generate as many unique and comprehensive questions as possible based on the given article bodies below. 
Make sure each question is designed to highlight differences between the articles.

Summaries: {chunk}

Generate as many as you can unique questions based on the above summaries.
"""
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": request_text}],
            temperature=0,
        )
        return completion.choices[0].message.content
    except Exception as err:  # any failure of the request or of reading the reply
        print(f"Error: {err}")
        return "Sorry, error from GPT."


In [6]:
def process_batches(input_file, output_file, client, model, batch_size=100, seed=None):
    """Generate questions for every article in ``input_file``, batch by batch.

    The input JSON is read as a list of objects, each of which must carry a
    ``'body'`` key. The list is shuffled, the bodies are grouped into batches,
    every batch is serialised to a JSON string and passed to
    ``generate_questions``, and the collected results are written to
    ``output_file`` as a JSON list of
    ``{"batch_start", "batch_end", "questions"}`` records (indices refer to
    the shuffled order, ``batch_end`` inclusive).

    Args:
        input_file: Path of the JSON file holding the articles.
        output_file: Path the results are written to (overwritten).
        client: Forwarded unchanged to ``generate_questions``.
        model: Model name forwarded to ``generate_questions``.
        batch_size: Number of article bodies per request (default 100).
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random.shuffle`` is used, so the order follows
            the global random state.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If an article has no ``'body'`` key.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        articles = json.load(f)

    # Shuffle the data; a dedicated generator keeps a seeded run from
    # disturbing the global random state.
    if seed is None:
        random.shuffle(articles)
    else:
        random.Random(seed).shuffle(articles)

    # Extract the text of each article
    article_summaries = [article['body'] for article in articles]
    num_summaries = len(article_summaries)

    responses = []

    # Process the article texts in consecutive batches
    for i in range(0, num_summaries, batch_size):
        batch = article_summaries[i:i+batch_size]
        chunk = json.dumps(batch)  # Convert the batch to a JSON string

        questions = generate_questions(chunk, client, model=model)
        responses.append({
            "batch_start": i,
            "batch_end": i + len(batch) - 1,  # inclusive; the last batch may be short
            "questions": questions
        })

    # Write the generated questions to the output JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(responses, f, indent=4)

In [7]:
# Generate questions for the synthetic articles and store the raw per-batch
# model output.
input_file = 'gpt_generated_articles.json'
output_file = 'output_questions_synthetic_data.json'
process_batches(
    input_file=input_file,
    output_file=output_file,
    client=client,
    model="gpt-4o-mini",
)

In [8]:
def extract_questions(batch_questions):
    """Parse the raw model output of every batch into unique questions.

    Each batch's ``"questions"`` text is split on blank lines; a chunk is
    kept only if it has the shape ``N. question`` followed by five indented
    ``- A.`` to ``- E.`` option lines. Chunks that do not match are ignored.

    Args:
        batch_questions: Iterable of dicts with a ``"questions"`` entry
            holding the model's text. Batches whose entry is missing or not
            a string (e.g. ``null`` in the JSON file) are skipped.

    Returns:
        Dict mapping question text to ``{"A": ..., ..., "E": ...}``. When the
        same question text occurs more than once, the first occurrence wins.
    """
    # Compiled once instead of on every chunk.
    question_pattern = re.compile(
        r"\d+\. (.+?)\n\s+- A\. (.+?)\n\s+- B\. (.+?)\n\s+- C\. (.+?)\n\s+- D\. (.+?)\n\s+- E\. (.+?)$",
        re.DOTALL,
    )

    questions = {}
    for batch in batch_questions:
        raw_text = batch.get("questions")
        if not isinstance(raw_text, str):
            continue

        # Normalise line endings: with "\r\n" the blank-line split below never
        # fires and option E would absorb every following question.
        text = raw_text.replace('\r\n', '\n').replace('\r', '\n')

        for chunk in text.split('\n\n'):
            match = question_pattern.match(chunk.strip())
            if not match:
                continue
            question, *options = (part.strip() for part in match.groups())
            # setdefault keeps the first set of choices seen for a question.
            questions.setdefault(question, dict(zip("ABCDE", options)))
    return questions

def convert_to_json2_format(unique_questions):
    """Re-key a question mapping as ``Q1``, ``Q2``, ... in iteration order.

    Args:
        unique_questions: Dict mapping question text to its choices.

    Returns:
        Dict mapping ``"Q<n>"`` (numbered from 1) to
        ``{"question": <text>, "choices": <choices>}``. The choices objects
        are reused, not copied.
    """
    return {
        f"Q{position}": {"question": text, "choices": options}
        for position, (text, options) in enumerate(unique_questions.items(), start=1)
    }

def process_json(input_json):
    """Turn the per-batch model output into the numbered question format.

    Runs ``extract_questions`` on ``input_json`` and passes its result to
    ``convert_to_json2_format``, returning whatever that produces.
    """
    return convert_to_json2_format(extract_questions(input_json))

In [9]:
# Read back the raw per-batch model output written by the generation step
with open('output_questions_synthetic_data.json', 'r') as f:
    json1 = json.loads(f.read())

# Parse the questions and key them as Q1, Q2, ...
json2_format = process_json(json1)

# Persist the structured questions to a new JSON file
with open('formated_output_questions_synthetic_data.json', 'w') as f:
    f.write(json.dumps(json2_format, indent=4))