### Import Libraries

In [None]:
import pandas as pd
import re
import itertools
import random
import numpy as np
import ast

In [None]:
# Pick the bias category to process; the name selects the template sheet
# that is loaded here.
category = 'Sexual_orientation'
template_path = f'Templates/{category}.csv'
df = pd.read_csv(template_path)
# Preview the first five template rows.
df.head(5)

In [None]:
# Phrasings of the "cannot be determined" answer option. One of them is
# drawn at random for each generated question.
UNKNOWN_ANSWERS = [
    "can't answer",
    "unknown",
    "not enough info",
    "not enough information",
    "cannot answer",
    "can't be determined",
]

In [None]:
def parse_mapping(mapping_str):
    """
    Parse a placeholder-mapping string into a dict of placeholder -> values.

    Example:
    '{{NAME1}}: [grandson, granddaughter]; {{NAME2}}: [grandfather, grandmother]'
    becomes
    {'NAME1': ['grandson', 'granddaughter'], 'NAME2': ['grandfather', 'grandmother']}

    Entries are separated by ';'. Each entry is split into key and value at
    its FIRST ':' only, so a value may itself contain colons (e.g. "10:30").
    Keys are reduced to their ASCII letters/digits and upper-cased. Values
    are always returned as a list of strings.

    Args:
        mapping_str: The raw mapping text, or a missing value (NaN/None).

    Returns:
        dict mapping each cleaned key to its list of string values; an empty
        dict when mapping_str is missing.

    Raises:
        ValueError: If a non-empty entry contains no ':' separator.
    """
    mapping = {}
    if pd.isna(mapping_str):
        return mapping
    for part in mapping_str.split(";"):
        part = part.strip()
        if not part:
            continue
        # Split on the first colon only; later colons belong to the value.
        key, sep, val = part.partition(":")
        if not sep:
            raise ValueError(f"Mapping entry has no ':' separator: {part!r}")
        key_clean = re.sub(r"[^A-Za-z0-9]", "", key).upper()
        try:
            values = ast.literal_eval(val.strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Unquoted items such as [grandson, granddaughter] are not Python
            # literals, so fall back to a plain comma split.
            values = [v.strip() for v in val.strip("[] ").split(",")]
        else:
            # Normalise literal results to a list of strings so they match
            # what the fallback path produces and can be used as replacement
            # text later.
            if isinstance(values, (list, tuple)):
                values = [str(v) for v in values]
            else:
                values = [str(values)]
        mapping[key_clean] = values
    return mapping

def get_all_combinations(names_map):
    """
    Expand a placeholder -> candidate-values dict into every possible
    assignment.

    Returns a list of dicts, each holding exactly one value per placeholder
    (the Cartesian product of the value lists, in itertools.product order).
    An empty input yields a single empty dict.
    """
    placeholders = list(names_map)
    value_lists = [names_map[name] for name in placeholders]
    return [
        dict(zip(placeholders, picked))
        for picked in itertools.product(*value_lists)
    ]

def replace_placeholders(text, mapping):
    """
    Replace placeholders like {{NAME}} in text with actual values.

    Matching is case-insensitive and tolerates whitespace inside the braces
    (e.g. "{{ name1 }}"). Placeholder names and values are both treated as
    literal text.

    Args:
        text: The template string, or a missing value (NaN/None).
        mapping: dict of placeholder name -> replacement value.

    Returns:
        The text with every known placeholder substituted; a missing text is
        returned unchanged.
    """
    if pd.isna(text):
        return text
    for placeholder, value in mapping.items():
        # Escape the name so regex metacharacters in it match literally.
        pattern = r"\{\{\s*" + re.escape(placeholder) + r"\s*\}\}"
        replacement = str(value)
        # A function replacement is inserted verbatim; a plain string would
        # have its backslashes interpreted as escapes or group references.
        text = re.sub(
            pattern,
            lambda _match, _repl=replacement: _repl,
            text,
            flags=re.IGNORECASE,
        )
    return text

def generate_examples(df, seed=None):
    """
    Expand every template row of df into concrete multiple-choice examples.

    For each row, the placeholder mappings in 'Names' (and, when present and
    non-missing, 'Lexical_diversity') are combined into every possible
    assignment. Each assignment produces four examples: an ambiguous ("amb")
    and a disambiguated ("disamb") context for both the negative and the
    non-negative question. Every example has three shuffled options: the
    negative answer, the non-negative answer and one randomly chosen phrase
    from UNKNOWN_ANSWERS.

    Columns read from each row: 'Names', 'Answer_negative',
    'Answer_non_negative', 'Ambiguous_Context', 'Disambiguating_Context',
    'Question_negative', 'Question_non_negative', 'Pairing', 'Q_id', and
    optionally 'Lexical_diversity' and 'Proper Nouns'.

    Args:
        df: DataFrame of templates.
        seed: Optional seed. When given, a private random generator is used
            so the output is reproducible; when None the module-level
            `random` generator is used.

    Returns:
        DataFrame with one row per example. 'Label' and 'Target' are 1-based
        option positions: 'Label' is the unknown option for "amb" examples
        and the answer matching the question polarity for "disamb" examples;
        'Target' is the negative answer when 'Pairing' is "biased" and the
        non-negative answer otherwise.
    """
    import warnings

    rng = random.Random(seed) if seed is not None else random
    examples = []

    for _, row in df.iterrows():
        # Parse Names
        names_map = parse_mapping(row['Names'])

        # Merge in Lexical_diversity if present. This stays best-effort, but
        # a malformed cell is now reported instead of silently ignored.
        if 'Lexical_diversity' in row and not pd.isna(row['Lexical_diversity']):
            try:
                names_map.update(parse_mapping(row['Lexical_diversity']))
            except Exception as exc:
                warnings.warn(
                    f"Q_id {row.get('Q_id')}: ignoring unparsable "
                    f"Lexical_diversity ({exc})"
                )

        # Row-level flag, identical for every example built from this row.
        proper_raw = row.get('Proper Nouns', 0)
        if 'Proper Nouns' not in row or pd.isna(proper_raw) or proper_raw == 0:
            proper_noun = 0
        else:
            proper_noun = 1

        # Cross product of all placeholder values
        for combo in get_all_combinations(names_map):
            # Prepare replaced answers
            ans_neg = replace_placeholders(row['Answer_negative'], combo)
            ans_non_neg = replace_placeholders(row['Answer_non_negative'], combo)

            # Prepare replaced contexts
            amb_context = replace_placeholders(row['Ambiguous_Context'], combo)
            disamb_context = replace_placeholders(row['Disambiguating_Context'], combo)
            full_context = amb_context + " " + disamb_context

            # Anything other than "biased" is treated as the anti pairing.
            is_biased = row['Pairing'].strip().lower() == "biased"

            for q_polarity, question_col in [
                ("neg", 'Question_negative'),
                ("non-neg", 'Question_non_negative'),
            ]:
                question = replace_placeholders(row[question_col], combo)

                # Shuffle options together with their role, so positions are
                # looked up by role rather than by text. Text lookup gives the
                # wrong index when two options share the same wording.
                tagged = [
                    ("neg", ans_neg),
                    ("non-neg", ans_non_neg),
                    ("unknown", rng.choice(UNKNOWN_ANSWERS)),
                ]
                rng.shuffle(tagged)
                options_list = [opt for _, opt in tagged]
                position = {role: idx + 1 for idx, (role, _) in enumerate(tagged)}

                # Label (correct answer index) for the disambiguated context
                label_index = position[q_polarity]
                # Target logic
                target_index = position["neg"] if is_biased else position["non-neg"]
                # In the ambiguous context the unknown option is correct
                amb_label_index = position["unknown"]

                for context_type, context, label in (
                    ("amb", amb_context, amb_label_index),
                    ("disamb", full_context, label_index),
                ):
                    examples.append({
                        "Qid": row['Q_id'],
                        "Context_type": context_type,
                        "Context": context,
                        "Question": question,
                        "Question_polarity": q_polarity,
                        "Option1": options_list[0],
                        "Option2": options_list[1],
                        "Option3": options_list[2],
                        "Options": {f"Option{i+1}": opt for i, opt in enumerate(options_list)},
                        "Label": label,
                        "Target": target_index,
                        "Pairing": row['Pairing'],
                        "Proper_Noun": proper_noun,
                    })

    return pd.DataFrame(examples)

In [None]:
# Expand every template row in `df` into concrete examples. The bare name on
# the last line is the cell's final expression, presumably there so the
# notebook displays the resulting frame.
examples_df = generate_examples(df)
examples_df

In [None]:
# Tag every generated example with the category it was built from.
examples_df['Category'] = category
examples_df['id'] = examples_df.index + 1  # Assign unique IDs starting from 1
# Preview up to five random rows. Capping the size avoids the ValueError that
# sample(5) raises when fewer than five examples were generated.
examples_df.sample(min(5, len(examples_df)))

In [None]:
# Write the examples to disk. The output folder is created first because
# to_csv does not create missing directories.
from pathlib import Path

output_path = f'Examples/{category}_examples.csv'
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
examples_df.to_csv(output_path, index=False)