In [3]:

!pip install OpenAI tenacity sentence_transformers

Collecting OpenAI
  Downloading openai-1.45.0-py3-none-any.whl.metadata (22 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Collecting httpx<1,>=0.23.0 (from OpenAI)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from OpenAI)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->OpenAI)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->OpenAI)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.45.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.1/374.1 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24

In [4]:
import json
import base64
import numpy as np
import heapq
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

#another method of calling openai
from openai import OpenAI, BadRequestError
from openai.types.chat import ChatCompletion
from tenacity import retry, stop_after_attempt, wait_random_exponential
import time
from typing import Optional


  from tqdm.autonotebook import tqdm, trange


In [5]:

def read_jsonl(path):
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if line:
                ex = json.loads(line)
                yield ex

def write_jsonl(path, data):
    with open(path, "w") as f:
        for i, ex in enumerate(data):
            try:
                f.write(json.dumps(ex) + "\n")
            except TypeError as e:
                print(f"Error writing element at index {i}: {ex}")
                print(f"TypeError: {e}")

def write_to_jsonl(data, filename):
    with open(filename, 'w') as file:
        for key, value in data.items():
            for entry in value:
                record = {"problem": key}
                record.update(entry)
                file.write(json.dumps(record) + "\n")

def read_from_jsonl_ind(filename):
    data = defaultdict(list)

    with open(filename, 'r') as file:
        for line in file:
            record = json.loads(line.strip())
            problem = record.pop("problem")
            data[problem].append(record)

    # Convert defaultdict back to a regular dict
    return dict(data)

In [6]:

# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

class MinimumDelay:
    def __init__(self, delay: float | int):
        self.delay = delay
        self.start = None

    def __enter__(self):
        self.start = time.time()

    def __exit__(self, exc_type, exc_val, exc_tb):
        end = time.time()
        seconds = end - self.start
        if self.delay > seconds:
            time.sleep(self.delay - seconds)

@retry(wait=wait_random_exponential(min=1, max=90), stop=stop_after_attempt(3))
def chat(client: OpenAI, delay: float | int, **kwargs) -> ChatCompletion | None:
    try:
        with MinimumDelay(delay):
            return client.chat.completions.create(**kwargs)
    except BadRequestError as e:
        print(f"Bad Request: {e}")
        if "safety" in e.message:
            return None
        raise e
    except Exception as e:
        print(f"Exception: {e}")
        raise e

In [7]:
def handle_completion(client2, message, max_tokens=512, temperature=1.0, top_p=0.7, seed=0):

    chat_completion= client2.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=message,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            seed=seed,
        )
    extracted_data = chat_completion.choices[0].message.content
    return extracted_data

In [8]:
from collections import defaultdict
import json
updated_merged_dict=read_from_jsonl_ind('updated_merged_dict_afterp.jsonl')
subproblems=list(read_jsonl('subproblems_1c.jsonl'))

subproblems_list=[]
for subproblem in subproblems:

  if 'subproblem(' not in subproblem['response']:
    continue
  pt=(subproblem['response'].split('subproblem(')[1]).split(')')[0].split(', ')
  if len(pt)>1:
    subproblems_list.append(tuple(pt))

In [None]:
from collections import defaultdict
problems=list(updated_merged_dict.keys())
hierarchical_graph = defaultdict(list)

for child, parent in subproblems_list:
    hierarchical_graph[parent].append(child)

# Dictionaries to store the relationships
parent_to_children = defaultdict(list)
child_to_parent = {}

# Build the graph relationships
for child, parent in subproblems_list:
    parent_to_children[parent].append(child)
    child_to_parent[child] = parent

# Optional: Add isolated problems without parents (nodes with no incoming edges)
for problem in problems:
    if problem not in hierarchical_graph and problem not in [child for child, parent in subproblems_list]:
        hierarchical_graph[problem] = []

# Function to recursively print the hierarchy
def print_hierarchy(graph, parent, level=0):
    print("  " * level + parent)
    for child in graph[parent]:
        print_hierarchy(graph, child, level + 1)

# Find root nodes (problems that have no parents)
roots = [problem for problem in hierarchical_graph if problem not in [child for child, parent in subproblems_list]]

'''# Print the hierarchical graph
for root in roots:
    print_hierarchy(hierarchical_graph, root)'''


'# Print the hierarchical graph\nfor root in roots:\n    print_hierarchy(hierarchical_graph, root)'

In [9]:
# Function to find all subproblems (children) recursively
def find_all_subproblems(problem, parent_to_children):
    subproblems = []
    if problem in parent_to_children:
        for child in parent_to_children[problem]:
            subproblems.append(child)
            subproblems.extend(find_all_subproblems(child, parent_to_children))
    return subproblems

# Function to find all superproblems (parents) recursively
def find_all_superproblems(problem, child_to_parent):
    superproblems = []
    while problem in child_to_parent:
        parent = child_to_parent[problem]
        superproblems.append(parent)
        problem = parent  # Move up the hierarchy
    return superproblems

def find_problem_by_frame(problem_dict, target_frame):
    for problem, details in problem_dict.items():
      for x in details:
        if x["frame"] == target_frame:
          return problem

    return None


def get_frame_ids(problem, updated_merged_dict):
    if problem in updated_merged_dict:
        frame_ids = [v.get('frame_id') for v in updated_merged_dict[problem] if 'frame_id' in v]
        return frame_ids
    else:
        return None

def filter_tuples_by_index(list1, list2):
    indices_to_remove = {i for _, i in list2}

    filtered_list = [(dist, i) for dist, i in list1 if i not in indices_to_remove]

    return filtered_list

def process_frame(current_index, frames, a_embs, current_mask, top_k, frame_ids_to_consider, threshold=None):
    frame_ids_to_consider = [frame_id - 1 for frame_id in frame_ids_to_consider]

    current_emb = a_embs[current_index]
    distances = [
        (calculate_distance(current_emb, a_emb), i)
        for i, a_emb in enumerate(a_embs)
        if i in frame_ids_to_consider and current_mask[i] == 0 and i != current_index
    ]
    # Filter distances by threshold if provided
    trels=[]
    if threshold is not None:
        tdistances = [(dist, i) for dist, i in distances if dist < threshold]
        if tdistances is not None:
          for dist, i in tdistances:
            pr="The relation is paraphrase because the similarity distance is less than the threshold."
            rt="paraphrases"
            relation= {"type": rt,
                  "x": i,
                  "y": current_index,
                  "reasoning":pr,}
            trels.append(relation)

          distances = filter_tuples_by_index(distances, tdistances)

    # If top_k is greater than the number of distances, use all distances
    if top_k > len(distances):
        top_k_distances = distances
    else:
        top_k_distances = heapq.nsmallest(top_k, distances)


    lines = ["Similar known framings:"]
    f_map = {}
    i = 1
    for dist, f_idx in top_k_distances:
        f_text = frames[f_idx]
        lines.append(f"{i}: {f_text}")
        f_map[i] = f_idx
        i += 1
    lines.append("New framing:")
    text = frames[current_index]
    f_map[i] = current_index
    lines.append(f"{i}: {text}")

    return "\n".join(lines), f_map, top_k_distances, trels

def extract_relations2(content, f_map):
    relations = []
    reasoning = None  # Initialize reasoning to avoid undefined variable issues
    try:
        for line in content.split("\n"):
            line = line.strip()
            if not line:
                continue

            mt = line.split(":")[0].strip()
            line_content = line[len(mt) + 1 :].strip()

            if "a" in mt:
                reasoning = line_content  # Set the reasoning when you encounter an "a" line
            elif "b" in mt:
                try:
                    # Extract relation type and frame indices
                    rt, c = line_content.split("(")
                    rt = rt.lower().strip()
                    x, y = c[:-1].split(",")

                    # Add the relation to the list
                    relations.append(
                        {
                            "type": rt,
                            "x": f_map[int(x.strip())],
                            "y": f_map[int(y.strip())],
                            "reasoning": reasoning,  # Use the current reasoning
                        }
                    )

                    # Reset reasoning after using it for a relation
                    reasoning = None

                except (ValueError, KeyError) as e:
                    # Log the error and continue
                    print(f"Error processing line: {line} - {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return relations





def rel_order(rel):
    if rel["type"] == "paraphrases":
        return -1
    elif rel["type"] == "specializes":
        return 0
    elif rel["type"] == "contradicts":
        return 1
    else:
        return 2

def update_mask_and_relations(relations, current_index, frames, current_mask):
    for rel in sorted(relations, key=lambda x: rel_order(x)):
        if rel["type"] == "paraphrases":
            if len(frames[rel["x"]]) < len(frames[rel["y"]]):
                current_mask[rel["x"]] = 1.0
                current_mask[rel["y"]] = 0.0
            break
        elif rel["type"] in ["specializes", "contradicts"]:
            current_mask[current_index] = 1.0
            break
        else:
            print(f'Unknown relation type: {rel["type"]}')
    return current_mask

def extract_frames(message):
    if message == '':
        return []
    found_frames = []
    message = re.sub(r'\n\n', r'\n', message)
    message=re.sub(r'\\n', r'\n', message)
    message_list = message.split("\n") if "\n" in message else message.split("\\n")
    for line in message_list:
        line = line.strip()
        if not line:
            continue
        try:
            m_id = line.split(":")[0]
            if m_id is not None:
                try:
                    mf_id, mt = m_id.split(".")
                except:
                    mt=m_id
                    mf_id=1
        except:
            return []
        content = line[len(m_id) + 1 :].strip()
        if mt == "b":
            found_frames.append(content)
    return found_frames


In [10]:

def extract_relations(content, f_map):
    relations = []
    lines = content.split("\n")
    reasoning_list=[]
    rel_list=[]
    for i, line in enumerate(lines):
        line = line.strip()
        if not line:
            continue

        mt = line.split(":")[0].strip()
        line_content = line[len(mt) + 1:].strip()

        if "a" in mt:
            reasoning_list.append(line_content)
            reasoning = line_content  # Set the reasoning for the upcoming relation
        elif "b" in mt:
          rel_list.append(line_content)

    if len(rel_list) == 0 or len(reasoning_list)==0:
      return relations
    for reason,rel in zip(reasoning_list,rel_list):
      try:
        rt, c = rel.split("(")
        rt = rt.lower().strip()
        x, y = c[:-1].split(",")
      except:
        continue
      try:
        relations.append(
            {
                "type": rt,
                "x": f_map[int(x.strip())],
                "y": f_map[int(y.strip())],
                "reasoning": reason,  # Use the current reasoning
            })

      except:
        continue


    return relations



In [None]:
#this is necessary because '\n' I wrote in excel are read as string '\\n' not as newline character '\n'. Either don't use '\n'  or use this code to replace '\\n' with '\n'
import pandas as pd
import json
import numpy as np
# Read data from Excel
df = pd.read_excel('relations_demo3.xlsx')

# Convert DataFrame to dictionary (or list of dictionaries)
data = df.to_dict(orient='records')

# Function to handle newline characters
def handle_newlines(record):
    for key, value in record.items():
        if isinstance(value, str):
            record[key] = value.replace('\\n', '\n')
    return record

def handle_nan(record):
    for key, value in record.items():
        if isinstance(value, float) and np.isnan(value):
            record[key] = None
    return record

# Apply the function to handle newlines
processed_data = [handle_newlines(record) for record in data]
processed_data = [handle_nan(record) for record in data]

# Write to JSONL file
with open('relations_demo3.jsonl', 'w') as file:
    for record in processed_data:
        file.write(json.dumps(record) + '\n')


In [None]:
#relation code
# Function to calculate the Euclidean distance
frame_file='predictions_whole/articulations-unique.jsonl'
total_files=list(read_jsonl(frame_file))
frames=[]
for j in total_files:
    frames.append(j['text'])
len(frames)
def calculate_distance(emb1, emb2):
    return np.sum((emb1 - emb2) ** 2)
embed = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
a_embs = embed.encode([f for f in frames], show_progress_bar=True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/288 [00:00<?, ?it/s]

In [None]:
all_relations = []
current_index = 1
responses = []
current_mask = np.zeros(shape=[len(frames)], dtype=np.float32)
current_mask[0] = 1.0
top_k=20

In [None]:
#define your api_key
client2 = OpenAI(api_key=api_key, timeout=90)

sys = '''You are an expert linguistic assistant.
Frames of communication select particular aspects of an issue and make them salient in communicating a message.
Frames of communication are ubiquitous in social media discourse and can impact how people understand issues and, more importantly, how they form their opinions.
Misogyny is defined as dislike of, contempt for, or ingrained prejudice against women.'''


message=[{"role": "system", "content": sys},]

num_demos=6
demos_path='relations_demo3.jsonl'
demos=list(read_jsonl(demos_path))
demos=demos[:num_demos]

def add_m(message,frame=None,demo=True,msg=None,):


    message.append({"role":"user","content":[{"type":"text","text": frame}]})
    if demo:
        message.append({"role":"assistant","content":[{"type":"text","text":msg}]})
    return message



for d in demos:
    user_p='You will be tasked with identifying relationships between the given misogyny frames. You should discuss your reasoning first, and then provide a final decision. Each framing provided may or may not be involved in a single relationship with one framing from a provided set of similar framings. We will consider two possible relationships:\n1. Paraphrases(X,Y): X and Y say essentially the same exact thing, with different words or phrasing. If one person agreed with X, they would agree with Y, and vice versa. Frames should share the same cause and the same problem to be considered paraphrases.\n2. Contradicts(X,Y): X and Y contradict each other, such that they frame the same exact issue from opposing perspectives. If one person agreed with X, they would disagree with Y, and vice versa. The two frames X and Y should essentially paraphrase each other, sharing the same problem and cause but from opposing perspectives.\n3. No relationship: There are no relationships between the new framing and any of the provided framings.\nYour first step for each framing is\n(a) Reason about if the framing holds one of the above relationships with any of the provided framings.\nMultiple relationships could be true, but prioritize in the order provided: If a paraphrase relationship holds, it must be provided.\nIf there is no paraphrase, then look for contradicts.\nFinally, if there is no contradicts relationship, answer no relationship.\nIf a relationship is identified, then\n(b) State that relationship, using the IDs for each framing.'

    if d["relation"] is None:
        msg=d["rationale"]
    else:
        msg=d["rationale"]+d["relation"]
    message=add_m(message,frame= user_p+'\n'+ d["frames"],demo=True,msg=msg)
print(message)
message1=message.copy()

[{'role': 'system', 'content': 'You are an expert linguistic assistant.\nFrames of communication select particular aspects of an issue and make them salient in communicating a message.\nFrames of communication are ubiquitous in social media discourse and can impact how people understand issues and, more importantly, how they form their opinions.\nMisogyny is defined as dislike of, contempt for, or ingrained prejudice against women.'}, {'role': 'user', 'content': [{'type': 'text', 'text': 'You will be tasked with identifying relationships between the given misogyny frames. You should discuss your reasoning first, and then provide a final decision. Each framing provided may or may not be involved in a single relationship with one framing from a provided set of similar framings. We will consider two possible relationships:\n1. Paraphrases(X,Y): X and Y say essentially the same exact thing, with different words or phrasing. If one person agreed with X, they would agree with Y, and vice ver

In [None]:
with tqdm(total=len(frames)) as pbar:

    for i, frame in enumerate(frames):
      problem=find_problem_by_frame(updated_merged_dict, frame)
      if problem is None:
        continue
      frame_ids=get_frame_ids(problem, updated_merged_dict)
      line, f_map, top_k_distances, trels=process_frame(i, frames, a_embs, current_mask, top_k, frame_ids, threshold=0.2)
      message=add_m(message,line,demo=False)
      response=handle_completion(client2, message)
      #print(response)
      responses.append(response)
      relations = extract_relations(response, f_map)
      #print(relations)

      if len(trels)>0:
        relations.extend(trels)
      if len(relations) == 0:
        current_mask[i] = 1.0
      else:
        update_mask_and_relations(relations, i, frames, current_mask)
        all_relations.extend(relations)

      message=message1.copy()
      pbar.update(1)


 95%|█████████▌| 8762/9192 [3:18:52<09:45,  1.36s/it]


In [None]:
from collections import defaultdict

def format_reasoning(r):
    bt_idx = r.find(" between")
    as_idx = r.find(", as")
    if bt_idx != -1 and as_idx != -1:
        r = r[:bt_idx] + r[as_idx:]
    return r.strip()

save_path = "relations_3.jsonl"
cleaned_relations = []
for rel in all_relations:
    cleaned_relations.append(
        {
            "type": rel["type"],
            "x": int(rel["x"]),
            "y": int(rel["y"]),
            "reasoning": format_reasoning(rel["reasoning"]),
        }
        )
rc = defaultdict(int)
for rel in cleaned_relations:
    rc[rel["type"]] += 1
for k, v in sorted(rc.items(), key=lambda x: x[1], reverse=True):
    print(k, v)

write_jsonl(save_path,cleaned_relations)
write_jsonl("responses_rel3.jsonl",responses)

In [None]:
import json
def read_jsonl(path):
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if line:
                ex = json.loads(line)
                yield ex

frame_file='predictions_whole/articulations-unique.jsonl'
total_files=list(read_jsonl(frame_file))
frames=[]
for j in total_files:
    frames.append(j['text'])
len(frames)

def remove_duplicate_relations(relations):
    seen = set()
    unique_relations = []

    for rel in relations:
        # Create a tuple of (x, y) to identify duplicates
        xy_pair = (frames[rel['x']], frames[rel['y']])
        yx_pair = (frames[rel['y']], frames[rel['x']])
        if xy_pair in seen:
            continue
        if yx_pair in seen:
            continue

        if (rel['x']!=rel['y']):
            seen.add(xy_pair)
            unique_relations.append(rel)

    return unique_relations

relations=list(read_jsonl('relations_3.jsonl'))
unique_relations = remove_duplicate_relations(relations)
print(len(unique_relations))
write_jsonl('unique_relations_all.jsonl',unique_relations)


In [None]:
import json
import re
relations=list(read_jsonl('unique_relations_all.jsonl'))
frame_file='predictions_whole/articulations-unique.jsonl'
total_files=list(read_jsonl(frame_file))
for relation in relations:
  relation['framex']=total_files[relation['x']]['text']
  relation['framey']=total_files[relation['y']]['text']
write_jsonl('unique_relations_all_frames.jsonl',relations)

In [11]:
import networkx as nx
from collections import defaultdict

def reduce_paraphrases(frames, relations):
    g = nx.Graph()

    # Add nodes for each frame
    for f_idx, frame in enumerate(frames):
        g.add_node(f_idx)

    # Add edges for paraphrase relations
    for edge in relations:
        if edge["type"] == "paraphrases":
            g.add_edge(edge["x"], edge["y"])

    kept_nodes = set()
    node_map = {}
    reduced_count = defaultdict(int)

    most_connected_frame = None
    least_connected_frame = None
    max_degree = -1
    min_degree = float('inf')

    # Process each connected component
    for c in nx.connected_components(g):
        max_node = None
        max_deg = -1

        # Find the node with the highest degree in the component
        for n in c:
            d = g.degree[n]
            if d > max_deg:
                max_deg = d
                max_node = n

            # Track the most and least connected frames overall
            if d > max_degree:
                max_degree = d
                most_connected_frame = n
            if d < min_degree:
                min_degree = d
                least_connected_frame = n

        kept_nodes.add(max_node)

        # Map all nodes in the component to the chosen max_node
        for n in c:
            node_map[n] = max_node
            reduced_count[max_node] += frames[n]["count"]

    reduced_relations = []
    seen_relations = set()  # Set to track seen relations

    # Remap relations to use kept nodes
    for edge in relations:
        if edge["type"] != "paraphrases":
            remapped_x = node_map[edge["x"]]
            remapped_y = node_map[edge["y"]]
            # Ensure that remapped relations are unique
            if (remapped_x, remapped_y, edge["type"]) not in seen_relations:
                reduced_relations.append(
                    {
                        "type": edge["type"],
                        "x": remapped_x,
                        "y": remapped_y,
                        "reasoning": edge["reasoning"],
                    }
                )
                seen_relations.add((remapped_x, remapped_y, edge["type"]))
                seen_relations.add((remapped_y, remapped_x, edge["type"]))  # Add reverse to avoid duplicates

    # Keep only the reduced frames
    reduced_frames = {f_idx: f for f_idx, f in enumerate(frames) if f_idx in kept_nodes}

    # Print most and least connected frames
    print(f"Most connected frame: Frame {most_connected_frame} with {max_degree} connections")
    print(f"Least connected frame: Frame {least_connected_frame} with {min_degree} connections")

    return reduced_frames, reduced_relations, kept_nodes, reduced_count


In [None]:
import os
def clean_reasoning(r):
    as_idx = r.find(", as ")
    if as_idx != -1:
        as_idx += len(", as ")
        r = r[as_idx:]
    return r.strip().capitalize()

def read_jsonl(path):
    examples = []
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    ex = json.loads(line)
                    examples.append(ex)
                except Exception as e:
                    print(e)
    return examples

out_dir='relevance_whole_final'
ann_dir='annotations_whole_final'
in_dir='predictions_whole'
os.makedirs(out_dir,exist_ok=True)
os.makedirs(ann_dir,exist_ok=True)
frames = read_jsonl(os.path.join(in_dir, 'articulations-unique.jsonl'))
cleaned_relations=read_jsonl('unique_relations_all_frames.jsonl')
embed = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
#count_problems(frames)

print(f"Found {len(frames)} frames before paraphrase reduction")
print(f"Found {len(cleaned_relations)} relations before paraphrase reduction")
reduced_frames, reduced_relations, kept_nodes, reduced_count = reduce_paraphrases(
    frames, cleaned_relations
)
print(f"Found {len(reduced_frames)} frames after paraphrase reduction")
print(f"Found {len(reduced_relations)} relations after paraphrase reduction")


In [None]:

from collections import defaultdict
updated_merged_dict=read_from_jsonl_ind('updated_merged_dict_afterp.jsonl')
subproblems=list(read_jsonl('subproblems_1c.jsonl'))

subproblems_list=[]
for subproblem in subproblems:

  if 'subproblem(' not in subproblem['response']:
    continue
  pt=(subproblem['response'].split('subproblem(')[1]).split(')')[0].split(', ')
  if len(pt)>1:
    subproblems_list.append(tuple(pt))

In [None]:
def count_total_frames(child_to_parent, parent_to_child, updated_merged_dict):
    # Step 1: Find roots (nodes with no parents)
    all_nodes = set(parent_to_child.keys()).union(set(child_to_parent.keys()))
    roots = [node for node in all_nodes if node not in child_to_parent]

    # Step 2: DFS to count total frames for each node and its descendants
    def dfs_count(node):
        # Get the number of frames at the current node
        num_frames = len(updated_merged_dict.get(node, []))

        # Recursively count frames for all children
        for child in parent_to_child.get(node, []):
            num_frames += dfs_count(child)

        return num_frames

    # Step 3: Traverse from each root and print the total frames count
    for root in roots:
        total_frames = dfs_count(root)
        print(f"Root: {root}, Total frames (including descendants): {total_frames}")

# Example usage
count_total_frames(child_to_parent, parent_to_children, updated_merged_dict)


Root: disrespect towards women, Total frames (including descendants): 26
Root: stereotyping, Total frames (including descendants): 148
Root: pseudoscience, Total frames (including descendants): 2
Root: violence, Total frames (including descendants): 55
Root: exploitation of women by men, Total frames (including descendants): 6
Root: patriarchal attitudes, Total frames (including descendants): 150
Root: dehumanization of women, Total frames (including descendants): 33
Root: objectification, Total frames (including descendants): 155
Root: trivializing serious issues, Total frames (including descendants): 70
Root: shaming, Total frames (including descendants): 30
Root: discrimination of women, Total frames (including descendants): 79
