In [None]:
# pandas is imported here as well as in the main import cell below: this cell
# runs first, so without it a fresh kernel (Restart & Run All) fails with
# NameError on `pd`. Re-importing an already loaded module is a no-op.
import pandas as pd

# Load the article export from the working directory.
df = pd.read_csv('big_database_all_ab.csv')

# Drop the first two columns by position.
# NOTE(review): presumably these are leftover index columns from earlier CSV
# round-trips; confirm against the file before relying on this.
df = df.iloc[:, 2:]
df

In [None]:
import os
import pandas as pd
import json
from openai import OpenAI
from tqdm import tqdm  # For a progress bar

# ---------------------------------------------------------
# OPENAI CLIENT
# The API key is taken from the OPENAI_API_KEY environment
# variable, so it is never written into the notebook itself.
# ---------------------------------------------------------
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# ---------------------------------------------------------
# THE SYSTEM PROMPT
# Sent as the "system" message by analyze_article() on every request.
# It defines the coding rules for the three scores (SP, CU, INT), the
# free-text extraction fields, and the exact JSON keys the model must
# return. The loop that collects results uses the same eight keys for
# its all-None placeholder row, so keep the two lists in sync when
# editing the OUTPUT FORMAT section.
# This string is runtime text: any change to it changes model behavior.
# ---------------------------------------------------------
system_instruction = """
You are an expert sociologist and data coder specializing in newspaper analysis. 
Your task is to analyze a newspaper article (Title + Abstract) and code it into specific categories based on strict rules.

Return the result as a valid JSON object.

### CODING RULES:

1. **SP (Social Protest) Score**:
   - **2 (Yes)**: Vertical/Hierarchical conflict. Protestors must be in a subordinate position targeting a superordinate (e.g., Workers vs Owners, Citizens vs State, Students vs School, Soldiers vs Superiors). Includes boycotting elections.
   - **9 (Horizontal/Individual)**: Horizontal conflicts (e.g., ethnic conflict without state target), or protests by individuals without a clear group structure.
   - **1 (No)**: No protest, or false positives like electoral rallies/campaigns.
   
2. **CU (Current Event)**:
   - Compare the Event Date (inferred from text) to the Publication Date provided.
   - **2 (Yes)**: Event occurred within 365 days of publication.
   - **1 (No)**: Event occurred > 365 days before publication.

3. **INT (International/India)**:
   - Question: Did the event occur in India?
   - **2 (Yes)**: The event happened in India.
   - **1 (No)**: The event happened outside India.

4. **Extraction Fields**:
   - **City**: The specific city mentioned.
   - **Other_Location**: Other locations mentioned (state, region).
   - **Actors**: Who is protesting? (e.g., "Factory workers", "Students").
   - **Demands**: What are they asking for? (e.g., "Higher wages", "Release of prisoners").
   - **Action_Type**: What did they do? (e.g., "Hunger strike", "Riot", "March").

### OUTPUT FORMAT:
Return ONLY a JSON object with keys: "SP", "CU", "INT", "City", "Other_Location", "Actors", "Demands", "Action_Type".
"""

def _clean_text(value):
    """Return `value` as stripped text, mapping missing values (None / float NaN) to ""."""
    # `value != value` is only true for NaN, which is how pandas marks an empty CSV cell.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value).strip()


def analyze_article(title, abstract, pub_date, source="Newspaper", model="gpt-4o"):
    """
    Send one article to the chat model and return its coded fields.

    Parameters:
        title: Article headline. None / NaN is treated as empty text.
        abstract: Article abstract. None / NaN is treated as empty text.
        pub_date: Publication date, interpolated into the prompt as-is.
        source: Currently unused; kept so existing callers keep working.
        model: Chat model name to request (e.g. "gpt-3.5-turbo" for lower cost).

    Returns:
        dict parsed from the model's JSON reply, or None when the entry has
        neither a title nor an abstract, or when the request / JSON parsing
        fails (the error is printed, not raised, so a batch loop can continue).
    """
    title_text = _clean_text(title)
    abstract_text = _clean_text(abstract)

    # Nothing to code: skip the API call instead of asking the model to
    # label the literal text "nan".
    if not title_text and not abstract_text:
        return None

    user_content = f"""
    **Publication Date**: {pub_date}
    **Title**: {title_text}
    **Abstract**: {abstract_text}
    
    Analyze this entry based on the system rules.
    """

    try:
        response = client.chat.completions.create(
            model=model,
            response_format={"type": "json_object"},  # Forces valid JSON back
            messages=[
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": user_content}
            ],
            temperature=0  # Keep it as deterministic as the API allows
        )

        # Parse the JSON string from the AI into a Python dictionary
        return json.loads(response.choices[0].message.content)

    except Exception as e:
        # Deliberate best-effort: one failed article must not abort the whole run.
        print(f"Error processing article: {e}")
        return None

In [None]:
# analyze_article() interpolates the publication date straight into the prompt
# text, so store the column as strings up front. This overwrites the column on
# the shared `df` in place.
# NOTE(review): missing dates presumably become the literal text "nan" after
# this cast and are sent to the model like that — confirm and handle if needed.
df['PubDate'] = df['PubDate'].astype(str)

# Quick sanity check on how many articles were loaded, then preview the frame.
print(f"Loaded {len(df)} articles.")
df.head()

In [None]:
# Number of leading rows to code in this trial run. Each row is one API call.
N_TRIAL_ROWS = 1000

# Keys the system prompt asks the model to return. Used to give every row the
# same columns, including rows where the request failed.
RESULT_KEYS = [
    "SP", "CU", "INT",
    "City", "Other_Location",
    "Actors", "Demands", "Action_Type",
]

trial_df = df.iloc[:N_TRIAL_ROWS]

# Report the real row count; the old message claimed "First 2 rows".
print(f"Starting AI Analysis (Trial: First {len(trial_df)} rows)...")

# One record per article, in the same order as trial_df.
results = []

for _, row in tqdm(trial_df.iterrows(), total=len(trial_df)):

    analysis = analyze_article(row['Title'], row['Abstract'], row['PubDate'])

    # Start from an all-None record so failed calls and replies that omit a
    # key still produce a complete row; keep anything extra the model returned.
    record = dict.fromkeys(RESULT_KEYS)
    if isinstance(analysis, dict):
        record.update(analysis)
    results.append(record)

# Merge results: positional join, so the source index is reset to 0..n-1 to
# line up with the freshly built results frame.
results_df = pd.DataFrame(results)
final_df = pd.concat([trial_df.reset_index(drop=True), results_df], axis=1)

final_df.head()

In [None]:
# Persist the labeled trial rows. index=False keeps the meaningless 0..n-1
# positional index out of the file, so re-reading it does not produce an
# extra "Unnamed: 0" column that has to be sliced off again.
final_df.to_csv('first_1000_labeled.csv', index=False)