In [32]:
from openai import OpenAI
import json, os, sys
import os
import re
import random
import pandas as pd

In [33]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [43]:
#processing the clusters to find the narrative differences
def narrative_difference(chunk, client, model="gpt-4o"):

    prompt = f"""
    You are an expert in news analysis and skilled at capturing narrative difference.
    I have some news articles in English and Hindi and I need to you to help me identify the narrative differences between them.
    Please provide a summary of the narrative differences between the English and Hindi articles.
    Along with your response, please provide a few examples from the articles to support your analysis.
    {chunk}
    """
    messages = [{"role": "user", "content": prompt}]
    try:
        response = client.chat.completions.create(
            model=model, messages=messages, temperature=0
        )

        content = response.choices[0].message.content
        return content
    except Exception as e:  # if the model fails to return a response
        print(f"Error: {e}")
        return "Sorry, error from GPT."

In [41]:
def process_clusters(input_file, output_file, client, model, batch_size=25):
    # Read the articles from the input JSON file
    with open(input_file, 'r') as f:
        articles = json.load(f)
    
    # Shuffle the data
    random.shuffle(articles)
    
    # Convert the list of articles to a DataFrame
    articles_df = pd.DataFrame(articles)
    
    # Check if 'cluster' column exists in the DataFrame
    if 'cluster' not in articles_df.columns:
        raise ValueError("The input JSON file does not contain 'cluster' labels for the articles.")
    
    responses = []

    # Group articles by their cluster labels
    grouped_articles = articles_df.groupby('cluster')

    # Process each cluster separately
    for cluster, group in grouped_articles:
        article_bodies = group['body'].tolist()
        
        # Split articles into batches
        for i in range(0, len(article_bodies), batch_size):
            batch = article_bodies[i:i+batch_size]
            chunk = json.dumps(batch)  # Convert the batch to a JSON string

            questions = narrative_difference(chunk, client, model=model)
            responses.append({
                "cluster": int(cluster),
                "part": i // batch_size + 1,
                "narrative_difference": questions
            })
    
    # Write the generated questions to the output JSON file
    with open(output_file, 'w') as f:
        json.dump(responses, f, indent=4)

In [36]:
#clip the json file to the desired length
def clip_json(input_file, output_file, length):
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    # Clip the data to the desired length
    clipped_data = data[:length]
    
    # Write the clipped data to the output file
    with open(output_file, 'w') as f:
        json.dump(clipped_data, f, indent=4)

clip_json("data/summary_cluster_all.json","summary_eng_hin.json", 200)

In [44]:
process_clusters("summary_eng_hin.json", "Ukraine_Russia_narrative_english_prompt.json", client, model="gpt-4o")

Error: Error code: 400 - {'error': {'message': "This model's maximum context length is 128000 tokens. However, your messages resulted in 137853 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


In [17]:
#read the json file
with open("Ukraine_Russia_narrative_by_cluster.json") as f:
    data = json.load(f)
# Convert JSON data to DataFrame
df = pd.json_normalize(data)

# Save DataFrame to CSV
output_file = 'formatted_articles.csv'
df.to_csv(output_file, index=False)

print(f"Data saved to {output_file}")

Data saved to formatted_articles.csv
