In [None]:
import os
import xml.etree.ElementTree as ET
import csv


# Main elements in the PropBank frame-files, variables:
def extract_target_rolesets(repository_path, output_tsv):
    """
    Extracts rolesets from the PB Lexicon github repository (or whatever repo is desired)
    and writes a one-row-per-roleset summary to a TSV file.

    Parameters:
    repository_path (str): The path to the directory containing XML files. It is
        walked recursively; only files ending in ".xml" are read.
    output_tsv (str): Path of the TSV file to (over)write.

    Returns:
    dict: Maps roleset id -> entry dict with the keys "definition",
        "parent_predicate", "source_file_name", "aliases", "argaliases", "roles",
        "MWE_descriptions", "entailments" (list of str), "usagenotes", "lexlinks",
        "examples" and "notes".

    Notes:
    - Files that fail to parse are reported with print() and skipped.
    - A roleset id seen more than once is reported with print(); the data of the
      later occurrence is merged into the existing entry.
    """
    lexicon = {}

    for root_dir, dir_names, files in os.walk(repository_path):
        # Sort in place so the traversal order (and therefore which duplicate
        # roleset is seen first) does not depend on the file system.
        dir_names.sort()
        for file_name in sorted(files):
            if not file_name.endswith(".xml"):  # Process only XML files
                continue

            file_path = os.path.join(root_dir, file_name)
            try:
                root = ET.parse(file_path).getroot()
            except ET.ParseError as e:
                print(f"Error parsing file {file_path}: {e}")
                continue

            for predicate in root.findall(".//predicate"):
                predicate_lemma = predicate.get("lemma")

                for roleset in predicate.findall(".//roleset"):
                    roleset_id = roleset.get("id")

                    if roleset_id in lexicon:
                        print(f"{roleset_id} has duplicate in lexicon.")
                    else:
                        lexicon[roleset_id] = _new_roleset_entry(
                            roleset.get("name"), predicate_lemma, file_name
                        )

                    _collect_roleset(roleset, roleset_id, lexicon[roleset_id])

    # csv.writer does all quoting/escaping itself, so multi-line cells are passed
    # through as plain strings (wrapping them in '"' by hand would be escaped
    # again and end up as literal quote characters in the cell).
    with open(output_tsv, "w", newline="", encoding="utf-8") as tsvfile:
        writer = csv.writer(tsvfile, delimiter="\t")
        writer.writerow(_TSV_HEADER)
        for roleset_id, data in lexicon.items():
            writer.writerow(_format_row(roleset_id, data))

    return lexicon


# One header per column produced by _format_row, in the same order.
_TSV_HEADER = [
    "Roleset ID", "Definition", "Parent Predicate", "Source File",
    "Aliases", "Argaliases", "Roles", "Usage Notes", "Lexlinks",
    "MWE Descriptions", "Examples", "Notes", "Entailments",
]


def _new_roleset_entry(roleset_name, predicate_lemma, file_name):
    """Return an empty lexicon entry for a roleset."""
    return {
        "definition": roleset_name,
        "parent_predicate": predicate_lemma,
        "source_file_name": file_name,
        "aliases": {},
        "argaliases": {},
        "roles": {},
        "MWE_descriptions": {},
        # TODO: "MCP_descriptions" (mcp-descriptions: syntaxdesc slots + morph
        # entries, source/target meanings) are not extracted yet.
        "entailments": [],
        "usagenotes": {},
        "lexlinks": [],
        "examples": {},
        "notes": [],
    }


def _stripped_or_unknown(element):
    """Return the element's stripped text, or "UNKNOWN" when it has no text."""
    return element.text.strip() if element.text else "UNKNOWN"


def _collect_roleset(roleset, roleset_id, entry):
    """Merge everything found under one <roleset> element into `entry`."""
    # Aliases, keyed by "<lemma>-<pos>"
    for alias in roleset.findall(".//alias"):
        alias_name = f"{alias.text}-{alias.get('pos', 'unknown')}"
        if alias_name in entry["aliases"]:
            print(f"{roleset_id} alias {alias_name} has duplicate.")
        else:
            entry["aliases"][alias_name] = {
                "alias_pos": alias.get("pos"),
                "alias_lemma": alias.text,
            }

    # Argaliases, keyed by "<lemma>-<pos>"
    for argalias in roleset.findall(".//argalias"):
        argalias_name = f"{argalias.text}-{argalias.get('pos', 'unknown')}"
        if argalias_name in entry["argaliases"]:
            print(f"{roleset_id} argalias {argalias_name} has duplicate.")
        else:
            entry["argaliases"][argalias_name] = {
                "argalias_pos": argalias.get("pos"),
                "argalias_lemma": argalias.text,
                "argalias_arg": argalias.get("arg"),
            }

    # Roles, keyed by "ARG<n>"; rolelinks of a repeated role are appended to
    # the role that is already stored.
    for role in roleset.findall(".//role"):
        role_id = "ARG" + role.get("n", "")
        if role_id in entry["roles"]:
            print(f"{roleset_id} role {role_id} has duplicate.")
        else:
            entry["roles"][role_id] = {
                "function_tag": role.get("f"),
                "role_def": role.get("descr"),
                "rolelinks": [],
            }
        for rolelink in role.findall(".//rolelink"):
            entry["roles"][role_id]["rolelinks"].append({
                "class": rolelink.get("class"),
                "resource": rolelink.get("resource"),
                "version": rolelink.get("version"),
                "role": rolelink.text,
            })

    # MWE descriptions for aliases (not connected to the alias entry currently)
    for mwe in roleset.findall(".//mwp-descriptions"):
        mwe_id = f"{mwe.get('id')}-{mwe.get('pos')}"
        if mwe_id in entry["MWE_descriptions"]:
            print(f"{roleset_id} MWE {mwe_id} has a duplicate.")
        else:
            entry["MWE_descriptions"][mwe_id] = {
                "literal": mwe.findtext(".//source", "UNKNOWN"),
                "figurative": mwe.findtext(".//target", "UNKNOWN"),
                "slots": None,
                "tokens": [],
            }
        mwe_entry = entry["MWE_descriptions"][mwe_id]
        for desc in mwe.findall(".//syntaxdesc"):
            mwe_entry["slots"] = desc.get("slots", "UNKNOWN")
            for token in desc.findall(".//token"):
                mwe_entry["tokens"].append({
                    "token": token.text or "UNKNOWN",
                    "arg": token.get("arg", "UNKNOWN"),
                    "dep": token.get("dep", "UNKNOWN"),
                    "head": token.get("head", "UNKNOWN"),
                    "pos": token.get("pos", "UNKNOWN"),
                    "slot": token.get("slot", "UNKNOWN"),
                })

    # Entailments (hobbsian spatial entailments): keep every one, in document
    # order, instead of overwriting the field with the last one found.
    for entailment in roleset.findall(".//hobbsian"):
        entailment_text = (entailment.text or "").strip()
        if entailment_text:
            entry["entailments"].append(entailment_text)

    # Usage notes, keyed by "<resource>-<version>"
    for usage in roleset.findall(".//usage"):
        usagenote_id = f"{usage.get('resource')}-{usage.get('version')}"
        if usagenote_id in entry["usagenotes"]:
            print(f"{roleset_id} usagenote {usagenote_id} has duplicate.")
        else:
            entry["usagenotes"][usagenote_id] = usage.get("inuse")

    # Lexlinks
    for lexlink in roleset.findall(".//lexlink"):
        entry["lexlinks"].append({
            "resource": lexlink.get("resource") or "-",
            "version": lexlink.get("version") or "-",
            "class": lexlink.get("class") or "-",
            "confidence": lexlink.get("confidence") or "-",
            "src": lexlink.get("src") or "-",
        })

    # Examples, numbered consecutively within the entry
    for example in roleset.findall(".//example"):
        example_id = f"example_{len(entry['examples']) + 1}"
        example_entry = {
            "example_name": example.get("name"),
            "example_src": example.get("src"),
            "sentence": example.findtext(".//text", ""),
            "PB_annot": {"relations": [], "arguments": []},
            "AMR_annot": [],
            "UMR_annot": [],
        }
        entry["examples"][example_id] = example_entry

        # PropBank annotation of the example
        for rel in example.findall(".//rel"):
            example_entry["PB_annot"]["relations"].append({
                "relation": _stripped_or_unknown(rel),
                "relloc": rel.get("relloc", "UNKNOWN"),
            })
        for arg in example.findall(".//arg"):
            example_entry["PB_annot"]["arguments"].append({
                "text": _stripped_or_unknown(arg),
                "arg": arg.get("type", "UNKNOWN"),
                "start": arg.get("start", "UNKNOWN"),
                "end": arg.get("end", "UNKNOWN"),
            })

        # AMR / UMR graphs: search inside this example only. Searching the
        # whole roleset would attach every graph of the roleset to every
        # example.
        for amr in example.findall(".//amr"):
            example_entry["AMR_annot"].append({
                "amr_id": f"AMR-{amr.get('version')}",
                "graph": _stripped_or_unknown(amr),
            })
        for umr in example.findall(".//umr"):
            example_entry["UMR_annot"].append({
                "umr_id": f"UMR-{umr.get('version')}",
                "graph": _stripped_or_unknown(umr),
            })

    # Notes
    for note in roleset.findall(".//note"):
        if note.text:
            entry["notes"].append(note.text.strip())


def _format_row(roleset_id, data):
    """Render one lexicon entry as the list of TSV cells matching _TSV_HEADER."""
    mwe_blocks = []
    for mwe_id, mwe_data in data["MWE_descriptions"].items():
        block = f"MWE: {mwe_id} (slots= {mwe_data.get('slots', 'UNKNOWN')}) "
        if mwe_data["tokens"]:
            block += "\n" + "\n".join(
                f"  TOKEN: {token['token']} (slot={token['slot']}, arg={token['arg']}, dep={token['dep']}, head={token['head']}, pos={token['pos']})"
                for token in mwe_data["tokens"]
            )
        block += f"\nLiteral mapping: \n{mwe_data.get('literal', 'UNKNOWN')}"
        block += f"\nFigurative mapping: {mwe_data.get('figurative', 'UNKNOWN')}"
        mwe_blocks.append(block)

    example_blocks = []
    for example_id, example_data in data["examples"].items():
        lines = [
            f"{example_id} (src= {example_data.get('example_src', 'UNKNOWN')}): \n  {example_data.get('sentence', 'UNKNOWN')}"
        ]
        lines.extend(
            f"    REL: {rel['relation']} (loc={rel['relloc']})"
            for rel in example_data["PB_annot"]["relations"]
        )
        lines.extend(
            f"    {arg['arg']}: {arg['text']} (start={arg['start']}, end={arg['end']})"
            for arg in example_data["PB_annot"]["arguments"]
        )
        lines.extend(
            f"  {amr['amr_id']}: \n        {amr['graph']}"
            for amr in example_data["AMR_annot"]
        )
        lines.extend(
            f"  {umr['umr_id']}: \n        {umr['graph']}"
            for umr in example_data["UMR_annot"]
        )
        example_blocks.append("\n".join(lines))

    role_blocks = []
    for role_id, role_data in data["roles"].items():
        lines = [f"{role_id}: {role_data['role_def']}"]
        lines.extend(
            f"    {link['role']}, {link['class']}, {link['resource']}-{link['version']}"
            for link in role_data["rolelinks"]
        )
        role_blocks.append("\n".join(lines))

    argalias_lines = [
        f"{argalias_name}: ARG{argalias_data['argalias_arg']}"
        for argalias_name, argalias_data in data["argaliases"].items()
    ]
    usagenote_lines = [
        f"{usagenote_id or 'NONE'}: {inuse or 'NONE'}"
        for usagenote_id, inuse in data["usagenotes"].items()
    ]
    lexlink_lines = [
        f"{link['resource']}-{link['version']}: {link['class']} (Confidence: {link['confidence']}, Source: {link['src']})"
        for link in data["lexlinks"]
    ]

    return [
        roleset_id,
        data["definition"],
        data["parent_predicate"],
        data["source_file_name"],
        "\n".join(data["aliases"].keys()),
        "\n".join(argalias_lines),
        "\n".join(role_blocks),
        "\n".join(usagenote_lines),
        "\n".join(lexlink_lines),
        "\n".join(mwe_blocks),
        "\n".join(example_blocks),
        "\n".join(data["notes"]),
        "\n".join(data["entailments"]),
    ]


SyntaxError: invalid syntax (3329039527.py, line 347)

In [None]:
# Location of the PropBank frame files and of the TSV to write. Both can be
# overridden with environment variables so the notebook also runs on machines
# other than the author's; the defaults are the original hard-coded values.
repository_path = os.environ.get(
    "PROPBANK_FRAMES_DIR",
    "C:/Users/littl/OneDrive/Documents/GitHub/PROPBANK-FRAMES/frames",
)
output_tsv = os.environ.get("PROPBANK_OUTPUT_TSV", "output_PBdict_friday.tsv")

# os.walk silently yields nothing for a missing directory, which would produce
# an empty TSV; fail loudly instead.
if not os.path.isdir(repository_path):
    raise FileNotFoundError(f"Frames directory not found: {repository_path}")

data = extract_target_rolesets(repository_path, output_tsv)

up.11 has duplicate in lexicon.
