In [1]:
from openai import OpenAI
import json
import os
from collections import defaultdict
import re

In [2]:
# Shared OpenAI client used by every request in this notebook; the API key is
# read from the OPENAI_API_KEY environment variable (None if it is not set).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Narrative of countries of interest

In [21]:
# Three-letter country codes whose speeches are kept when grouping the UN data
# (they look like ISO 3166-1 alpha-3 codes; must match the 'country' field).
countries_of_interest = [
    'IND',
    'USA',
    'CHN',
    'FRA',
    'IRQ',
    'SAU',
    'IRN',
    'GBR',
]

In [None]:
def generate_narrative(model, speech_text, country_name):
    """Ask a chat model for a country's narrative on the Iraq War.

    Builds a single user message that embeds ``speech_text`` followed by the
    instructions, and sends it through the notebook-level ``client`` with
    ``temperature=0``.

    Args:
        model: Model name forwarded to ``client.chat.completions.create``.
        speech_text: Speech text that is interpolated into the prompt.
        country_name: Label of the speaker country, interpolated into the prompt.

    Returns:
        The response object returned by ``client.chat.completions.create``,
        or ``None`` when the request raised. The error is printed in that case.
    """
    # Backslash-newline pairs below are line continuations inside the string
    # literal; they must not be followed by trailing whitespace, otherwise a
    # literal backslash ends up in the prompt.
    narrative_prompt = f"""
    \"\"\" {speech_text}
    You are the best at reading UN speeches and giving the narrative of the country on a certain topic.\
    Above is the speech from {country_name} from the year 2002 to 2007. Give me the narrative of the speaker country \
    on the Iraq War. Make sure you cover the sentiment, the key points, and the reasoning of the speaker country. \
    Later I will be using this text to prepare MCQs to find narrative differences between different countries. If there is no content on Iraq War just
    give the text "No Narrative"\"\"\"
    """
    try:
        return client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": narrative_prompt}
            ],
            temperature=0
        )
    except Exception as e:
        # Report the failure and return an explicit sentinel; previously the
        # function fell through to `return response` with `response` unbound.
        print(f"Error: {e}")
        return None

In [None]:
def process_speeches(json_file, countries=None):
    """Group 2002-2007 speeches by country and generate one narrative per country.

    The JSON file is expected to contain a list of dict-like speech records
    that are read through the keys ``'country'``, ``'year'`` and ``'text'``.
    Records are kept when their country is in ``countries``, their year can be
    read as an integer between 2002 and 2007 (inclusive) and they have text.
    All kept texts of a country are concatenated and sent to
    ``generate_narrative`` with the ``"gpt-4o"`` model.

    Args:
        json_file: Path of the JSON file holding the speech records.
        countries: Optional collection of country codes to keep. Defaults to
            the notebook-level ``countries_of_interest``.

    Returns:
        A list of ``{"country": ..., "Narrative": ...}`` dicts, one per
        country for which a response was obtained, in order of first
        appearance in the file.
    """
    if countries is None:
        countries = countries_of_interest

    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(json_file, 'r', encoding='utf-8') as file:
        speeches = json.load(file)

    # Collect the pieces per country and join once at the end.
    parts_by_country = defaultdict(list)
    for speech in speeches:
        country = speech.get('country')
        speech_text = speech.get('text')
        if country not in countries or not speech_text:
            continue
        # Records without a usable year are skipped rather than crashing the
        # range comparison (None or a non-numeric string).
        try:
            year = int(speech.get('year'))
        except (TypeError, ValueError):
            continue
        if 2002 <= year <= 2007:
            parts_by_country[country].append(speech_text)

    narratives = []
    for country, parts in parts_by_country.items():
        # Each speech is prefixed with a single space, as in the original
        # incremental concatenation, so the prompt text is unchanged.
        combined_speech_text = "".join(f" {part}" for part in parts)
        response = generate_narrative("gpt-4o", combined_speech_text, country)
        if response is None:
            # No response object to read from; keep going with other countries.
            print(f"Skipping {country}: no narrative response")
            continue
        narratives.append({
            "country": country,  # the 'country' field is used directly as the label
            "Narrative": response.choices[0].message.content,
        })

    return narratives

In [39]:
def _extract_section(narrative, heading):
    """Return the stripped text under a ``#### <heading>:`` line, or "" if absent."""
    # The section runs until the next "####" heading or the end of the text.
    section_pattern = r'#### ' + re.escape(heading) + r':\n(.+?)(\n####|\Z)'
    found = re.search(section_pattern, narrative, re.DOTALL)
    return found.group(1).strip() if found else ""


def transform_narrative(narrative):
    """Split a generated narrative into sentiment, key points and reasoning.

    The text is expected to use ``#### Sentiment:`` / ``#### Reasoning:``
    headings and key points written as ``**Title**: description``.

    Args:
        narrative: Narrative text produced by the model. ``None`` or an empty
            string yields empty fields.

    Returns:
        A dict with the keys ``"sentiment"`` (str), ``"key points"`` (list of
        ``"Title: description"`` strings) and ``"reasoning"`` (str). Parts
        that are not found are returned empty.
    """
    # A missing message content would otherwise make the regex calls raise.
    narrative = narrative or ""

    # A key point ends at the next numbered item (any number of digits), at
    # the next "####" heading, or at the end of the text. Without the heading
    # alternative the last point would swallow the sections that follow it.
    key_points_match = re.findall(
        r'\*\*([^\*]+)\*\*\s*:\s*(.+?)(?=\n\d+\.|\n####|\Z)',
        narrative,
        re.DOTALL,
    )
    key_points = [
        f"{title.strip()}: {description.strip()}"
        for title, description in key_points_match
    ]

    return {
        "sentiment": _extract_section(narrative, "Sentiment"),
        "key points": key_points,
        "reasoning": _extract_section(narrative, "Reasoning"),
    }

In [35]:
# Location of the processed UN speech data (a JSON file), given relative to
# the notebook's working directory.
json_file = '../data/processed/UN_data.json'
# Builds the per-country narratives; process_speeches calls the chat model
# once for every country that has matching speeches.
narratives = process_speeches(json_file)

In [42]:
# Parse each raw narrative into its structured form while keeping the
# country label next to it.
transformed_data = [
    {
        "country": entry['country'],
        "Narrative": transform_narrative(entry['Narrative']),
    }
    for entry in narratives
]

# Save the structured narratives as pretty-printed JSON.
output_file = '../data/processed/narrative_try.json'
output_json = json.dumps(transformed_data, indent=2)
with open(output_file, 'w') as out_fh:
    out_fh.write(output_json)

# Forming the questions with Narratives

In [44]:
def generate_questions(Narrative_json_text, model="gpt-4o-mini"):
    """Ask a chat model for MCQs that separate countries by their narrative.

    The narratives are placed at the top of a single user prompt, followed by
    the instructions and an example question, and sent through the
    notebook-level ``client`` with ``temperature=0``.

    Args:
        Narrative_json_text: The narratives to embed in the prompt. A string
            is used as-is; any other object (e.g. a list of dicts) is
            serialized to JSON text first.
        model: Model name forwarded to ``client.chat.completions.create``.

    Returns:
        The text content of the first choice of the response, or the string
        ``"Sorry, error from GPT."`` when the request raised. The error is
        printed in that case.
    """
    if not isinstance(Narrative_json_text, str):
        # Embed real JSON rather than a Python repr; `default=str` keeps the
        # serialization from failing on values JSON cannot represent.
        Narrative_json_text = json.dumps(
            Narrative_json_text, indent=2, ensure_ascii=False, default=str
        )

    # Backslash-newline pairs are line continuations inside the literal; they
    # must not be followed by trailing whitespace, otherwise a literal
    # backslash ends up in the prompt.
    prompt = f""" {Narrative_json_text}
    Above are the narrative of the various countries about Iraq war, I want the design set of MCQ questions to differentiate\
    the country speeches by their narrative. Give the questions in the following format. The goal is to answer these \
    questions based on the speech and then cluster them based on the answers. The answers of the similar narrative countries \
    should have similar answers and the ones with different narrative should have different answer. Make sure there are only \
    2 options + 1 option being not relevant. Dont design questions such that the options are countries, make sure to design \
    such that the options are the narratives just like the example question.
    Example:  
    1. "Does the speech emphasize the importance of the United Nations' role in legitimizing military interventions?",
    "A": "Yes, the speech strongly supports the need for UN authorization before military action.",
    "B": "No, the speech justifies military action regardless of UN authorization.",
    "C": "The speech does not address this issue."
    """
    messages = [{"role": "user", "content": prompt}]
    try:
        response = client.chat.completions.create(
            model=model, messages=messages, temperature=0
        )
        return response.choices[0].message.content
    except Exception as e:  # if the model fails to return a response
        print(f"Error: {e}")
        return "Sorry, error from GPT."

In [45]:
# Ask the model for the MCQ set; `questions` holds the returned text.
# NOTE(review): the raw `narratives` list is passed here, not `transformed_data`
# or a pre-serialized JSON string - confirm that is the intended input.
questions = generate_questions(narratives, model="gpt-4o-mini")

In [47]:
# One numbered question followed by its three quoted options. The "-" bullet
# before each option and the commas between items are optional, so both the
# plain layout shown in the prompt's example and a bulleted layout are parsed.
pattern = re.compile(
    r'(\d+)\.\s*"([^"]+)"\s*,?\s*'
    r'(?:-\s*)?"A":\s*"([^"]+)"\s*,?\s*'
    r'(?:-\s*)?"B":\s*"([^"]+)"\s*,?\s*'
    r'(?:-\s*)?"C":\s*"([^"]+)"'
)

# Find all matches
matches = pattern.findall(questions)
if not matches:
    # Make the failure visible instead of silently writing an empty file
    # (e.g. when the model call failed or used an unexpected layout).
    print("Warning: no questions could be parsed from the model output.")

# Transform matches into the desired JSON structure, keyed "Q<number>"
output_dict = {}
for number, question, choice_a, choice_b, choice_c in matches:
    output_dict[f"Q{number}"] = {
        "question": question,
        "choices": {
            "A": choice_a,
            "B": choice_b,
            "C": choice_c
        }
    }

# Convert to JSON text and write it to the questions file
output_json = json.dumps(output_dict, indent=4)
output_file = '../data/processed/narrative_questions.json'
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(output_json)