In [6]:
import instructor, asyncio, random
from pydantic import BaseModel, Field
from typing import List, Literal, Optional
from collections import Counter, defaultdict
from openai import AsyncOpenAI
import yaml
import os
import json

In [14]:
os.environ['OPENAI_API_KEY'] = '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'
client = instructor.from_openai(AsyncOpenAI(api_key=os.environ['OPENAI_API_KEY']))

In [26]:
def read_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:  # skip empty lines
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping line {lineno} in {os.path.abspath(path)} due to JSON error: {e}")
                continue

def write_jsonl(path, data):
    with open(path, "w") as f:
        for ex in data:
            f.write(json.dumps(ex) + "\n")

config_file_path = 'prompts/moral.yaml'
with open(config_file_path, 'r') as f:
    config = yaml.safe_load(f)
#print(config["system_prompt"])

In [27]:
class ValueItem(BaseModel):
    value: Literal[
        "Care/Harm",
        "Equality/Inequality",
        "Proportionality/Exploitation",
        "Loyalty/Betrayal",
        "Authority/Subversion",
        "Sanctity/Degradation",
        "Liberty/Oppression",
        "Honor/Shame",
        "Ownership/Theft"
        "None"
    ]
    side: Literal["Virtue", "Violation"] = Field(
        description="Virtue if the positive side of the pair is expressed (e.g., Care, Equality). "
                    "Violation if the negative side is expressed (e.g., Harm, Inequality)."
    )
    rationale: str = Field(min_length=1)
    confidence: float = Field(default=1, ge=0, le=1)

class Extraction(BaseModel):
    values: List[ValueItem]

# --- Step 3: One extraction run ---
async def extract_once(frame: str, prblem_def: str, sys: str) -> Extraction:
    return await client.chat.completions.create(
        model="o4-mini",
        response_model=Extraction,
        messages=[
            {"role": "system", "content": sys},
            {"role": "user", "content": f"FRAME:\n{frame}\nAssociated misogyny problems with defintion:\n{prblem_def}"}
        ],
        temperature=1.0,
        seed=random.randint(1, 1_000_000_000),
    )

# --- Step 4: Self-consistency aggregation ---
def aggregate_self_consistency(parsed_runs: List[Extraction], top_k=3) -> dict:
    votes = Counter()
    rationales = defaultdict(list)

    for run in parsed_runs:
        seen_this_run = set()
        for item in run.values:
            key = f"{item.value}:{item.side}"  # combine value + side
            if key in seen_this_run:
                continue
            seen_this_run.add(key)
            votes[key] += 1
            rationales[key].append(item.rationale)

    total_runs = len(parsed_runs)
    ranked = votes.most_common()
    selected = ranked[:top_k]

    return {
        "consensus_values": [
            {
                "value_side": v,
                "vote_count": c,
                "vote_share": round(c / total_runs, 3),
                "sample_rationales": rationales[v][:3],
            }
            for v, c in selected
        ],
        "all_votes": votes,
        "runs": [r.dict() for r in parsed_runs],
    }

# --- Step 5: Multi-run wrapper ---
async def extract_values_self_consistency(
    frame: str,
    prblem_def: str,
    sys: str = config["system_prompt"],
    k: int = 5,
    top_k: int = 3,
    concurrency: int = 3,
) -> dict:
    sem = asyncio.Semaphore(concurrency)

    async def _task():
        async with sem:
            return await extract_once(frame, prblem_def, sys)

    results = await asyncio.gather(*[_task() for _ in range(k)])
    return aggregate_self_consistency(results, top_k=top_k)


# --- Usage Example ---
# frame_text = "Treating women well is a mistake as it will inevitably backfire on men."
# out = asyncio.run(extract_values_self_consistency(frame_text, k=9))
# print(out["consensus_values"])

In [28]:
with open("cleaned_misogyny_problem_definitions.json", "r") as f:
    PROBLEM_DEFINITIONS = json.load(f)
frame_problems_dict = {}

for f in read_jsonl('frame_problem_info.jsonl'):
    frame_problems_dict[f['id']] = {
        "frame": f['frame'],
        "problems": f['problems']
    }

In [29]:
for f in sorted(os.listdir('final_frames')):
    folder_path = os.path.join('final_frames', f)
    if os.path.isdir(folder_path) and os.path.exists(os.path.join(folder_path, 'moral.jsonl')):
        continue 

    ffilename = os.path.join('final_frames', f, 'frames.jsonl')

    moral_values=[]
    for frame in read_jsonl(ffilename):
        idf= frame['id']
        problems = frame_problems_dict[idf]['problems']
        problems_definitions_str = "\n".join(
    f"{problem.replace('_', ' ').title()}: {PROBLEM_DEFINITIONS.get(problem, 'No definition available')}"
    for problem in problems
        
)
        out = await extract_values_self_consistency(frame['frame'], problems_definitions_str, config['system_prompt'], k=3)
        moral_values.append({
            "id": idf,
            "frame": frame['frame'],
            "values": out["consensus_values"],
            "all_votes": out["all_votes"],
            "runs": out["runs"]

        })
    
    write_jsonl(os.path.join('final_frames', f, 'moral.jsonl'), moral_values) 
    print(f"Done {f} problem") 
    

⚠️ Skipping line 1 in /mnt/c/Users/raksh/Downloads/wd/cultural/cultural/final_frames/trivializing_consent/frames.jsonl due to JSON error: Expecting value: line 1 column 2184 (char 2183)
⚠️ Skipping line 2 in /mnt/c/Users/raksh/Downloads/wd/cultural/cultural/final_frames/trivializing_consent/frames.jsonl due to JSON error: Extra data: line 1 column 11 (char 10)
⚠️ Skipping line 3 in /mnt/c/Users/raksh/Downloads/wd/cultural/cultural/final_frames/trivializing_consent/frames.jsonl due to JSON error: Extra data: line 1 column 11 (char 10)
⚠️ Skipping line 4 in /mnt/c/Users/raksh/Downloads/wd/cultural/cultural/final_frames/trivializing_consent/frames.jsonl due to JSON error: Extra data: line 1 column 10 (char 9)
⚠️ Skipping line 5 in /mnt/c/Users/raksh/Downloads/wd/cultural/cultural/final_frames/trivializing_consent/frames.jsonl due to JSON error: Extra data: line 1 column 11 (char 10)
⚠️ Skipping line 6 in /mnt/c/Users/raksh/Downloads/wd/cultural/cultural/final_frames/trivializing_consent/f

/tmp/ipykernel_68729/2513359587.py:67: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  "runs": [r.dict() for r in parsed_runs],


Done trivializing_consent problem
Done trivializing_eating_disorders problem
Done trivializing_infidelity problem
Done trivializing_mental_health_issues problem
Done trivializing_oppression problem
Done trivializing_prostitution problem
Done trivializing_serious_issues problem
Done trivializing_sexual_assault problem
Done trivializing_the_need_for_representation problem
Done trivializing_women_s_issues problem
Done trivializing_women_s_sexual_satisfaction problem
Done undermining_women_s_capabilities problem
Done undermining_women_s_rights_movements problem
Done unrealistic_beauty_standards problem
Done victim_blaming problem
Done violence problem
Done wage_disparity problem
Done women_subjugation problem
Done women_valuation problem
Done workplace_discrimination problem
