In [3]:
import pandas as pd
import json
from io import StringIO

# Step 1: Read the whole .txt export into memory as UTF-8 text
with open("tt_subset_em.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

# Step 2: Let pandas parse that text (read_csv defaults) via an in-memory buffer
csv_buffer = StringIO(raw_text)
df = pd.read_csv(csv_buffer)

# Step 3: Parse the `text_final` field which contains embedded JSON
def safe_parse_json(json_string):
    """Decode JSON text that may be wrapped in a Markdown code fence.

    Surrounding whitespace and backticks are removed, then a literal leading
    ``json`` language tag (any letter case) is dropped before decoding.

    Args:
        json_string: Text expected to contain JSON, optionally fenced with
            backticks and a ``json`` tag.

    Returns:
        The decoded JSON value (not necessarily a dict), or
        ``{"error": <message>}`` when the input is not a string or cannot be
        decoded.
    """
    try:
        cleaned = json_string.strip().strip("`").strip()
        # str.strip("json") would remove any of the characters j/s/o/n from
        # both ends (mangling e.g. "null" and missing "``` json"), so the
        # language tag is matched as a real prefix instead.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        return json.loads(cleaned)
    except Exception as e:
        # Best-effort parsing: report the problem in-band instead of raising.
        return {"error": str(e)}

# NOTE(review): a later cell splits `text_final` on "||" as plain paragraphs,
# so this JSON parse may yield only {"error": ...} dicts — confirm that
# `text_final` is the intended column here.
df["parsed_json"] = df["text_final"].apply(safe_parse_json)

# Step 4: Extract `actions` and `count`
# safe_parse_json can return any JSON value (list, number, None, ...), not only
# a dict, so guard the lookups instead of calling .get() on a non-dict.
df["actions"] = df["parsed_json"].apply(lambda x: x.get("actions", []) if isinstance(x, dict) else [])
df["count"] = df["parsed_json"].apply(lambda x: x.get("count", 0) if isinstance(x, dict) else 0)

# Optional: Save to JSON
df.to_json("tt_subset_em_parsed.json", orient="records", indent=2)

print("✅ Done! Saved parsed output to 'tt_subset_em_parsed.json'")


✅ Done! Saved parsed output to 'tt_subset_em_parsed.json'


In [5]:
import json
import pandas as pd

# Step 1: Read the JSON records back from disk
with open("tt_subset_em_parsed.json", encoding="utf-8") as f:
    data = json.loads(f.read())

# Step 2: Convert the loaded records to a DataFrame
df = pd.DataFrame(data)

# Step 3: Parse 'best_res_perplex'
def parse_best_res_perplex(cell):
    """Decode one `best_res_perplex` cell that may be a fenced JSON block.

    Surrounding whitespace and backticks are removed, then a literal leading
    ``json`` language tag (any letter case) is dropped before decoding.

    Args:
        cell: Cell value expected to be a string holding JSON, optionally
            wrapped in a Markdown code fence with a ``json`` tag.

    Returns:
        The decoded JSON value (not necessarily a dict), or
        ``{"error": <message>}`` when the cell is not a string or cannot be
        decoded.
    """
    try:
        cleaned = cell.strip().strip("`").strip()
        # lstrip("```json") strips the character set {`, j, s, o, n} rather
        # than the literal prefix, which mangles values such as "null" and
        # misses "``` json"; match the tag explicitly instead.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        return json.loads(cleaned)
    except Exception as e:
        # Best-effort parsing: report the problem in-band instead of raising.
        return {"error": str(e)}

def _actions_from(parsed):
    """Return the ``actions`` value of a parsed response, or [] when it is not a dict."""
    if isinstance(parsed, dict):
        return parsed.get("actions", [])
    return []


def _count_from(parsed):
    """Return the ``count`` value of a parsed response, or 0 when it is not a dict."""
    if isinstance(parsed, dict):
        return parsed.get("count", 0)
    return 0


df["parsed_best"] = df["best_res_perplex"].apply(parse_best_res_perplex)

# Step 4: Pull the two fields of interest out of the parsed payload
df["actions"] = df["parsed_best"].apply(_actions_from)
df["count"] = df["parsed_best"].apply(_count_from)

# Step 5: Remove the raw and intermediate columns
helper_columns = ["best_res_perplex", "parsed_best"]
df.drop(columns=helper_columns, inplace=True)

# Step 6: Write the expanded records to disk
df.to_json("tt_subset_em_expanded.json", orient="records", indent=2)
print("✅ Cleaned data saved to 'tt_subset_em_expanded.json'")


✅ Cleaned data saved to 'tt_subset_em_expanded.json'


In [17]:
import json
import os

import numpy as np
from openai import OpenAI

# Load JSON
def load_json(json_path):
    """Load and return the JSON document stored at ``json_path``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding (the other cells also read and write
    UTF-8).

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value.
    """
    with open(json_path, 'r', encoding="utf-8") as f:
        return json.load(f)

# Prompt generator for each entry
def generate_gpt_prompt_from_json(entry):
    """Build the paragraph-mapping prompt for a single entry.

    Args:
        entry: Mapping with a ``text_final`` string whose paragraphs are
            separated by ``||`` and an ``actions`` list of dicts that each
            provide ``id`` and ``summary``.

    Returns:
        Tuple ``(prompt, paragraphs)``; ``paragraphs`` is the raw result of
        splitting ``text_final`` on ``||`` (not stripped).
    """
    source_text, action_items = entry["text_final"], entry["actions"]
    paragraphs = source_text.split("||")

    # One "Action ID / Summary" stanza per action, each followed by a blank line.
    action_chunks = [
        f"Action ID: {item['id']}\nSummary: {item['summary']}\n\n"
        for item in action_items
    ]
    actions_section = "### Actions:\n" + "".join(action_chunks)

    # Paragraphs are numbered from 0 and flattened to a single line each.
    numbered_paragraphs = [
        f"Para {idx}: " + raw.strip().replace("\n", " ") + "\n"
        for idx, raw in enumerate(paragraphs)
    ]
    paragraphs_section = "### Text (paragraphs are separated by ||):\n" + "".join(numbered_paragraphs)

    final_prompt = f"""### Task:
You are a social media design and feature expert analyzing a platform’s news release. Your goal is to identify and group all **distinct** digital well-being changes and map them to **all supporting paragraphs**, including those that may **loosely or indirectly** refer to each change.

### Your Process:
1. Read all paragraphs carefully, even if they appear generic or promotional.
2. Read the actions and summaries of all the actions.
3. Group together all paragraphs that refer to the same change — even indirectly.
   - Include introductory, descriptive, technical, policy-related, or promotional language as supporting evidence.
   - Be generous in mapping — include all paragraphs that help understand the change or its purpose.
4. Provide an uncertainty score (0–1) indicating your confidence in the classification.

{actions_section}
{paragraphs_section}

### Output Format (for each action):
- Action ID: [e.g., A1]
- Paragraphs: [list of paragraph numbers or excerpts]
- Uncertainty Score: [0–1]
"""
    return final_prompt, paragraphs

# Call GPT
def call_openai_gpt(prompt, client, model="gpt-4"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one user message of the request.
        client: Object exposing ``chat.completions.create``.
        model: Model name forwarded to the API.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3,
    }
    completion = client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

# Parse GPT output robustly
import re
def parse_gpt_output(gpt_output):
    """Extract ``{action_id: [paragraph numbers]}`` from the model's reply.

    The reply is split on "Action ID:" headers. For each block, the first line
    is the action id and the first line mentioning "paragraph" supplies the
    paragraph numbers (every run of digits on that line).

    Args:
        gpt_output: Reply text from the model; ``None`` or "" yields ``{}``.

    Returns:
        Dict mapping each action id to a list of ints. A later block with the
        same id overwrites an earlier one.
    """
    if not gpt_output:
        # The API may return no content; treat it as "nothing matched".
        return {}

    # Accept the header with or without a list bullet and Markdown bold,
    # e.g. "- Action ID: A1", "Action ID: A1", "**Action ID:** A1".
    header = r"(?:[-*]\s*)?\**Action ID\**\s*:\s*\**\s*"
    action_blocks = re.split(header, gpt_output)
    results = {}
    for block in action_blocks[1:]:  # text before the first header is preamble
        lines = block.strip().split("\n")
        # The prompt shows the id as "[e.g., A1]", so replies often echo
        # brackets or emphasis; remove them so the id matches action["id"].
        action_id = lines[0].strip().strip("[]*`").strip()
        para_line = next((line for line in lines if "paragraph" in line.lower()), "")
        results[action_id] = [int(p) for p in re.findall(r"\d+", para_line)]
    return results

# Top-7 similarity
def get_top_7_similar_paragraph_texts(paragraphs, summary, client, model="text-embedding-3-small", top_k=7):
    """Rank paragraphs by cosine similarity of their embedding to ``summary``.

    Args:
        paragraphs: List of paragraph strings; positions are used as the
            "Para i" labels.
        summary: Text whose embedding the paragraphs are compared against.
        client: Object exposing ``embeddings.create``.
        model: Embedding model name forwarded to the API.
        top_k: Maximum number of paragraphs to return (7 by default).

    Returns:
        Up to ``top_k`` strings of the form ``"Para <i>: <stripped text>"``,
        most similar first. Blank paragraphs are never returned; an empty
        list is returned without any API call when nothing is embeddable.
    """
    # The embeddings endpoint rejects empty strings, so blank paragraphs
    # (e.g. from a trailing "||") are skipped. Original positions are kept so
    # the labels still match the paragraph numbering.
    candidates = [(idx, text) for idx, text in enumerate(paragraphs) if text and text.strip()]
    if not candidates or top_k <= 0:
        return []

    summary_embedding = client.embeddings.create(
        input=[summary],
        model=model
    ).data[0].embedding

    paragraph_embeddings = client.embeddings.create(
        input=[text for _, text in candidates],
        model=model
    ).data

    def cosine_similarity(vec1, vec2):
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

    scored = [
        (cosine_similarity(summary_embedding, para_emb.embedding), idx, text)
        for (idx, text), para_emb in zip(candidates, paragraph_embeddings)
    ]

    # sorted() is stable, so equally similar paragraphs keep document order.
    ranked = sorted(scored, key=lambda item: item[0], reverse=True)[:top_k]
    return [f"Para {idx}: {text.strip()}" for _, idx, text in ranked]

# Update one entry
def annotate_entry(entry, client):
    """Add GPT-matched and embedding-ranked paragraphs to each action of ``entry``.

    Each action dict gains ``matched_paragraphs`` (paragraph numbers parsed
    from the GPT reply, ``[]`` when its id is absent) and
    ``top_7_similar_paragraphs``.

    All results are computed before the entry is modified, so if any API call
    or parsing step raises, ``entry`` is left exactly as it was passed in.

    Args:
        entry: Mapping with ``text_final`` and an ``actions`` list of dicts
            that each provide ``id`` and ``summary``.
        client: OpenAI client used for the chat and embedding requests.

    Returns:
        The same ``entry`` object, with its actions updated in place.
    """
    gpt_prompt, paragraphs = generate_gpt_prompt_from_json(entry)
    gpt_output = call_openai_gpt(gpt_prompt, client)
    para_matches = parse_gpt_output(gpt_output)

    # Phase 1: compute everything (this is where failures can happen).
    # NOTE: the paragraphs are re-embedded once per action by the helper.
    pending = []
    for action in entry["actions"]:
        matched = para_matches.get(action["id"], [])
        similar = get_top_7_similar_paragraph_texts(paragraphs, action["summary"], client)
        pending.append((action, matched, similar))

    # Phase 2: apply the results; nothing below can fail half-way.
    for action, matched, similar in pending:
        action["matched_paragraphs"] = matched
        action["top_7_similar_paragraphs"] = similar
    return entry

# === Run full pipeline ===
json_path = "tt_subset_em_expanded.json"
output_path = "all_entries_annotated.json"

# Credentials must not be stored in the notebook: read the key from the
# environment. The key that was previously hard-coded on this line has been
# exposed and must be revoked and rotated.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this cell."
    )

client = OpenAI(api_key=api_key)

# Load and annotate all entries with actions
all_data = load_json(json_path)
annotated_data = []

for entry in all_data:
    if entry.get("actions"):
        # .get() so a record without the index column cannot abort the run
        # before the try block is even entered.
        entry_label = entry.get("Unnamed: 0")
        print(f"🔍 Processing Unnamed: {entry_label}")
        try:
            updated_entry = annotate_entry(entry, client)
            annotated_data.append(updated_entry)
        except Exception as e:
            print(f"❌ Failed on Unnamed: {entry_label}: {e}")
            annotated_data.append(entry)  # Keep the record so the output covers every input entry
    else:
        # Entries without actions are passed through unchanged.
        annotated_data.append(entry)

# Save everything
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(annotated_data, f, indent=2)

print(f"✅ Done. Saved annotated file to: {output_path}")


🔍 Processing Unnamed: 0
🔍 Processing Unnamed: 4
🔍 Processing Unnamed: 6
🔍 Processing Unnamed: 8
🔍 Processing Unnamed: 12
🔍 Processing Unnamed: 13
🔍 Processing Unnamed: 15
🔍 Processing Unnamed: 16
🔍 Processing Unnamed: 17
🔍 Processing Unnamed: 20
🔍 Processing Unnamed: 22
🔍 Processing Unnamed: 23
🔍 Processing Unnamed: 25
🔍 Processing Unnamed: 26
🔍 Processing Unnamed: 27
🔍 Processing Unnamed: 28
🔍 Processing Unnamed: 29
🔍 Processing Unnamed: 30
🔍 Processing Unnamed: 31
🔍 Processing Unnamed: 33
🔍 Processing Unnamed: 34
🔍 Processing Unnamed: 35
✅ Done. Saved annotated file to: all_entries_annotated.json
