In [1]:
from tqdm import tqdm
import joblib
import re
import spacy
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
import sys
import json
#import stanza
#stanza.download('pt')
import numpy as np
# install and import amr utils
sys.path.append('../')
#!pip install ../amr-utils
from amr_utils.amr_readers import AMR_Reader

In [2]:
folder = '../POS-tagger-portuguese-nltk/trained_POS_taggers/'
tagger_nltk = joblib.load(folder+'POS_tagger_brill.pkl')

In [3]:
def extract_feat_spacy(text,nlp_model):
    
    
    doc = nlp_model(text)
    
    tokens = []

    dict_an_snt = {}

    for i,token in enumerate(doc):
        
        tokens.append(token.text)
        
        dict_an_snt.update({
            
            
            i: {"text": token.text,
            "lemma": token.lemma_,
            "pos":token.pos_,
            "tag":token.tag_,
            "dep":token.dep_,
            "shape":token.shape_,
            "is_alpha":token.is_alpha,
            "is_stop":token.is_stop,
            "morph": str(token.morph),
            "head_index": token.head.i,
            "ner": None 
            }
        })
        
    for ent in doc.ents:
        
        for i in range(ent.start,ent.end): 
            dict_an_snt[i]["ner"] = ent.label_
            dict_an_snt[i]["ner_start_end"] = (ent.start,ent.end)
            
    response = {
        "sentence": text,
        "tokens": tokens,
        "annotated_sentence": dict_an_snt
    }
            
        
    return response

In [4]:
def amr_to_dict(amr):
    
    dict_amr = {}
    id_snt = amr.id
    nodes = amr.nodes
    metadata = amr.metadata
    tokens = amr.tokens
    graph = amr.graph_string()
    
    dict_edges = {}
    id = 0
    # cria dict dos nos
    for node1_id, edge_value, node2_id in amr.edges:
        id +=1
        
        dict_edges.update({
            f'edge {id}': {
                'nodes_ids': (node1_id,node2_id),
                'nodes': (nodes.get(node1_id),nodes.get(node2_id)),
                'value': edge_value
            }})
        
    if tokens == []:
        snt = metadata['snt']
        tokens_nltk = word_tokenize(snt, language='portuguese')
    else: 
        snt = " ".join(tokens)
        tokens_nltk = word_tokenize(snt, language='portuguese')
      
    
    # verfica se ha tokens na anotacao (se nao tiver tokens, ele considera snt como tokens)
    if "tok pt" not in amr.amr_string():
        tokens = []
        
    dict_amr.update({'id': id_snt})
    dict_amr.update({'nodes': nodes})
    dict_amr.update({"edges": dict_edges})
    dict_amr.update(metadata)
    dict_amr.update({"graph": graph})
    dict_amr.update({"tok pt": tokens})
    dict_amr.update({"tokens_nltk": tokens_nltk})
    dict_amr.update({"snt": snt})
    return dict_amr

    


def ner_with_spacy(
    text,
    model_name = "pt_core_news_sm"
):
    nlp = spacy.load(model_name)
    
    doc = nlp(text)
    
    entidades = []

    for ent in doc.ents:
        entidade = (ent.text,  ent.label_,(ent.start,ent.end))
        entidades.append(entidade)
        
    return entidades

def pos_tagger_nltk(
    tokenized_sentence,
    tagger_nltk
):
    
    # anota os tokens 
    pos_tags_annotation = tagger_nltk.tag(tokenized_sentence)
    
    # cria lista apenas com os tags, sem o token
    pos_tags = [tag[1] for tag in pos_tags_annotation]
    
    return pos_tags

def remove_num_text(string):
    
    if type(string) != str:
        return None 
    
    
    
    # Expressão regular para corresponder ao padrão [palavra]-[numero]
    padrao = re.compile(r'^(\w+)-\d+$')
    
    # Tentativa de encontrar correspondência
    correspondencia = padrao.match(string)
    
    if correspondencia:
        # Se encontrar, retorna apenas a palavra (primeiro grupo da regex)
        return correspondencia.group(1)
    else:
        # Caso contrário, retorna a string original
        return string


In [5]:
path_lp = '../data/raw/little_prince.txt'
path_opisums = '../data/raw/opisums.txt'
path_news = '../data/raw/news.txt'
path_sci = '../data/raw/science.txt'

list_paths = [
    ('sci',path_sci),
    
    ('lp', path_lp),
    ('opisums',path_opisums),
    ('news',path_news),
]

In [6]:
def parse_alignment(text):
    alignment_line = ""
    
    # Procura pela linha que contém os alinhamentos bramr
    for line in text.split('\n'):
        if line.startswith("# ::alignments-bramr"):
            alignment_line = line
            break
    
    # Verifica se a linha de alinhamento está vazia ou não foi encontrada
    if not alignment_line or alignment_line.strip() == "# ::alignments-bramr":
        return None
    
    # Remove o prefixo para obter os alinhamentos
    alignment_line = alignment_line.replace("# ::alignments-bramr ", "").strip()
    
    # Inicializa o dicionário de resultados
    alignment_dict = {}
    
    # Processa cada par de alinhamento
    for pair in alignment_line.split():
        token_index, node = pair.split('-')
        alignment_dict[node] = int(token_index)
    
    return alignment_dict

In [7]:
df_features = pd.DataFrame({
    'sentence_id': [],
    "corpus_name": [],
    'parent': [],
    'child': [],
    'parent_pos': [],
    'child_pos': [],
    'parent_ner': [],
    'child_ner': [],
    'dependency_role': [],
    'parent_position': [],
    'child_position': [],
    'label':[]
})

nlp_model_spacy = spacy.load("pt_core_news_lg")

list_dict_an = []
for corpus_name, path in list_paths:
    
    print(f"""##################################################
# Running: {corpus_name}
##################################################""")
        
    # usa reader de amr para fazer o parsing
    reader = AMR_Reader()
    amrs = reader.load(path, remove_wiki=True)
    
    # cria bloco de textos anotados
    with open(path, 'r') as file:
        str_corpus = file.read()
    blocos = str_corpus.split('\n\n')
    blocos = [bloco for bloco in blocos if bloco != ""]
        
    list_dicts = []
    # para cada sentenca, cria dict com os nós e vértices
    for i,amr in tqdm(enumerate(amrs), total = len(amrs)):
        
        texto_anotado = blocos[i]
        dict_aligments = parse_alignment(texto_anotado)
        
        amr_string = amr.amr_string() # obtem string formatada amr        
        
        dict_annotation = amr_to_dict(amr) 
        dict_annotation.update({'corpus_name': corpus_name})
        dict_annotation.update({'dict_aligments': dict_aligments})
        list_dicts.append(dict_annotation)
        
    print('Anotando texto ...')    
    for dict_an in tqdm(list_dicts):
        
        tokens = dict_an['tokens_nltk']
        
        if tokens == []:
            tokens = dict_an['tok pt']
        
        # tok_pos = pos_tagger_nltk(
        # tokenized_sentence = tokens,
        # tagger_nltk = tagger_nltk
        # )
          
        dict_spacy = extract_feat_spacy(
            text = dict_an['snt'],
            nlp_model = nlp_model_spacy)
        
                
        dict_an.update(dict_spacy)
        #dict_an.update({'tok pos': tok_pos})
        dict_an.update({"corpus_name": corpus_name})
        
        list_dict_an.append(dict_an)

    print()
    
with open('../data/processed/annotated_text.json', 'w') as f:
    json.dump(list_dict_an, f)

cannot deinvert attribute: ('s2', ':op2-of', 'and')


##################################################
# Running: sci
##################################################
[amr] Loading AMRs from file: ../data/raw/science.txt


100%|██████████| 160/160 [00:00<00:00, 4368.78it/s]


Anotando texto ...


100%|██████████| 160/160 [00:00<00:00, 180.68it/s]



##################################################
# Running: lp
##################################################
[amr] Loading AMRs from file: ../data/raw/little_prince.txt


100%|██████████| 1525/1525 [00:00<00:00, 4464.05it/s]


Anotando texto ...


100%|██████████| 1525/1525 [00:08<00:00, 185.73it/s]
ignoring epigraph data for duplicate triple: ('p', ':instance', 'pai')



##################################################
# Running: opisums
##################################################
[amr] Loading AMRs from file: ../data/raw/opisums.txt


100%|██████████| 404/404 [00:00<00:00, 4352.10it/s]


Anotando texto ...


100%|██████████| 404/404 [00:02<00:00, 178.60it/s]



##################################################
# Running: news
##################################################
[amr] Loading AMRs from file: ../data/raw/news.txt


100%|██████████| 870/870 [00:00<00:00, 6612.88it/s]


Anotando texto ...


100%|██████████| 870/870 [00:04<00:00, 200.58it/s]





In [9]:
dict_annotation

{'id': 'ciencia-4-171',
 'nodes': {'1': 'possible-01',
  '1.1': 'chamar-01',
  '1.1.1': 'nós',
  '1.1.2': 'fenômeno',
  '1.1.3': 'efeito',
  '1.1.3.1': 'coca-cola'},
 'edges': {'edge 1': {'nodes_ids': ('1', '1.1'),
   'nodes': ('possible-01', 'chamar-01'),
   'value': ':ARG1'},
  'edge 2': {'nodes_ids': ('1.1', '1.1.1'),
   'nodes': ('chamar-01', 'nós'),
   'value': ':ARG0'},
  'edge 3': {'nodes_ids': ('1.1', '1.1.2'),
   'nodes': ('chamar-01', 'fenômeno'),
   'value': ':ARG1'},
  'edge 4': {'nodes_ids': ('1.1', '1.1.3'),
   'nodes': ('chamar-01', 'efeito'),
   'value': ':ARG2'},
  'edge 5': {'nodes_ids': ('1.1.3', '1.1.3.1'),
   'nodes': ('efeito', 'coca-cola'),
   'value': ':mod'}},
 'alignments': 'bren',
 'graph': '(p/possible-01\n\t:ARG1 (c/chamar-01\n\t\t:ARG0 (n/nós)\n\t\t:ARG1 (f/fenômeno)\n\t\t:ARG2 (e/efeito\n\t\t\t:mod (c2/coca-cola))))',
 'tok pt': [],
 'tokens_nltk': ['Podemos',
  'chamar',
  'o',
  'fenômeno',
  'de',
  '``',
  'efeito',
  'Coca-Cola',
  '``',
  '.'],
 'sn

In [8]:
para

NameError: name 'para' is not defined

In [None]:
list_dict_an

[{'id': '1',
  'nodes': {'1': 'ter-01',
   '1.1': 'person',
   '1.1.1': 'name',
   '1.1.2': 'explicar-01',
   '1.1.2.2': 'coisa',
   '1.1.2.2.1': 'esse',
   '1.1.2.2.2': 'resultar-01',
   '1.1.1.1': '"Meyer"',
   '1.1.2.3': '2'},
  'edges': {'edge 1': {'nodes_ids': ('1.1.1', '1.1.1.1'),
    'nodes': ('name', '"Meyer"'),
    'value': ':op1'},
   'edge 2': {'nodes_ids': ('1.1.2', '1.1.2.3'),
    'nodes': ('explicar-01', '2'),
    'value': ':quant'},
   'edge 3': {'nodes_ids': ('1', '1.1'),
    'nodes': ('ter-01', 'person'),
    'value': ':ARG0'},
   'edge 4': {'nodes_ids': ('1.1', '1.1.1'),
    'nodes': ('person', 'name'),
    'value': ':name'},
   'edge 5': {'nodes_ids': ('1.1', '1.1.2'),
    'nodes': ('person', 'explicar-01'),
    'value': ':ARG1'},
   'edge 6': {'nodes_ids': ('1.1.2', '1.1'),
    'nodes': ('explicar-01', 'person'),
    'value': ':ARG0'},
   'edge 7': {'nodes_ids': ('1.1.2', '1.1.2.2'),
    'nodes': ('explicar-01', 'coisa'),
    'value': ':ARG1'},
   'edge 8': {'nodes_

In [None]:
# pos que não podem estar contidas no amr (descritas no artigo)
# punct tambem removi pq tbm nao pode e esta no artigo de certa forma
pos_filter = ["SCONJ", "DET", "ADJ", "PUNCT"]

In [None]:
def find_nodes_ids(
    annotated_sentence,
    edge_amr_info
):
    
    parent, child = edge_amr_info['nodes']
    
    
    parent = remove_num_text(parent)
    child = remove_num_text(child) 
    
    

    if parent != None:
        
        # acha a posicao de todos os tokens no texto que sao iguais a parent
        parent_w_app = [id for id, an in annotated_sentence.items() if (an["lemma"].casefold() == parent.casefold() or an["text"].casefold() == parent.casefold()) and an["pos"] not in pos_filter]
        
        # todo nome pessoal vira person na anotação, então precisa achar qual nome tem de pessoa tem no texto
        if len(parent_w_app) == 0 and parent == "person":
            
            parent_w_app = [id for id, an in annotated_sentence.items() if an["ner"] == "PER"]
            
            if len(parent_w_app) >1:
                
                # nomes compostos vao ocupar dois tokens. Para verificar se é um nome composto parta olha o inicio e fim do ner
                inicio_fim = annotated_sentence[parent_w_app[0]]['ner_start_end'] # basta pegar o inicio e fim do primeiro token
                
                # verifica se o inicio e fim bate com os ids encontrados
                contains = True
                for i in parent_w_app:
                    if i not in range(inicio_fim[0], inicio_fim[1]):
                        contains = False
                

                if contains:
                    
                    parent_w_app = [tuple(parent_w_app)]
        
    else:

        parent_w_app = None

    if child != None:
        
        # acha a posicao de todos os tokens no texto que sao iguais a child
        child_w_app = [id for id, an in annotated_sentence.items() if (an["lemma"].casefold() == child.casefold() or  an["text"].casefold() == child.casefold()) and an["pos"] not in pos_filter]

        # todo nome pessoal vira person na anotação, então precisa achar qual nome tem de pessoa tem no texto
        if len(child_w_app) == 0 and child == "person":
            
            child_w_app = [id for id, an in annotated_sentence.items() if an["ner"] == "PER"]
            
            if len(child_w_app) >1:
                
                # nomes compostos vao ocupar dois tokens. Para verificar se é um nome composto parta olha o inicio e fim do ner
                inicio_fim = annotated_sentence[child_w_app[0]]['ner_start_end'] # basta pegar o inicio e fim do primeiro token
                
                # verifica se o inicio e fim bate com os ids encontrados
                contains = True
                for i in child_w_app:
                    if i not in range(inicio_fim[0], inicio_fim[1]):
                        contains = False
                

                if contains:
                    
                    child_w_app = [tuple(child_w_app)]
                
        
    else: 
        
        child_w_app = None
        
    len_parent_w_app = len(parent_w_app) if parent_w_app is not None else None
    len_child_w_app = len(child_w_app) if child_w_app is not None else None
    
    if len_parent_w_app == 1 and len_child_w_app == 1:

        
        # uso de set para não importar a ordem da tupla
        pair = (parent_w_app[0], child_w_app[0])
        parent_id, child_id = pair
        
        if type(parent_id) != tuple:
            parent_to_child = False
            if annotated_sentence[parent_id]["head_index"] == child_id:
                parent_to_child = True
                
        else:
            parent_to_child = False
            for id in parent_id:
                if annotated_sentence[id]["head_index"] == child_id:
                    parent_to_child = True 
            
        
        if type(child_id) != tuple:
            child_to_parent = False
            if annotated_sentence[child_id]["head_index"] == parent_id:
                child_to_parent = True
        else:
            child_to_parent = False
            for id in child_id:
                if annotated_sentence[id]["head_index"] == parent_id:
                    child_to_parent = True                 

            
        return {"response": {"pair":pair, "dep_parent_to_child": parent_to_child, "dep_child_to_parent": child_to_parent, "type": f"parent_{len_parent_w_app}_child_{len_child_w_app}"}}
    
    # se achou 1 e outro não. Vale também para quando um deles não existe (None)
    if (len_parent_w_app == 1 and len_child_w_app == 0) or (len_parent_w_app == 0 and len_child_w_app == 1) or (len_parent_w_app == 1 and len_child_w_app == None) or (len_parent_w_app == None and len_child_w_app == 1): 
        
        if len(parent_w_app) == 0:
            
            parent_id = None
        else: 
            
            parent_id = parent_w_app[0]
        if len(child_w_app) == 0:
            child_id = None
        else:
            child_id = child_w_app[0]
        
        pair = (parent_id, child_id)
        parent_to_child = False
        child_to_parent = False
        return {"response": {"pair":pair, "dep_parent_to_child": parent_to_child, "dep_child_to_parent": child_to_parent, "type": f"parent_{len_parent_w_app}_child_{len_child_w_app}"}}
        #return "miss_one"
        
        
    
    return {"response": {"pair":(None, None), "dep_parent_to_child": None, "dep_child_to_parent": None, "type": f"parent_{len_parent_w_app}_child_{len_child_w_app}"}}
def get_dependency_direction(
    annotated_sentence,
    parent_id, 
    child_id
):
    
    parent_to_child = False     
    if parent_id is not None:
        if annotated_sentence[parent_id]["head_index"] == child_id:
            parent_to_child = True
     
    child_to_parent = False       
    if child_id is not None:
        if annotated_sentence[child_id]["head_index"] == parent_id:
            child_to_parent = True
        
    return parent_to_child, child_to_parent

import os


list_rows = []
list_errors_match = [] #(debug)
list_n_match = [] # (debug)
for dict_an in tqdm(list_dict_an):
    
    edges = dict_an['edges']
    
    # create dep pairs
    dep_pairs = []
    for tk_id, an in dict_an['annotated_sentence'].items():
        p = an
        p_head_id = an['head_index']
        p_head = dict_an['annotated_sentence'].get(p_head_id)
        # nao salva os pares que possuem os pos que nao podem ser amr
        if p['pos'] in pos_filter or p['pos'] in p_head: continue
        
        dep_pairs.append((tk_id,p_head_id))
    
    for edge_id, edge_info in edges.items():
        
        parent, child = edge_info['nodes']
        
        value = edge_info['value']
        
        if "arg" not in value.casefold(): continue
        
        if dict_an['dict_aligments'] is None:
            
            # acha os ids dos nós
            response = find_nodes_ids(
            annotated_sentence = dict_an['annotated_sentence'],
            edge_amr_info = edge_info)
            
            pair = response['response']['pair']
            
            
            parent_id, child_id = pair
                
            
            dep_parent_to_child = response['response']['dep_parent_to_child']
            dep_child_to_parent = response['response']['dep_child_to_parent']
            
            
            type_response = response['response']['type']
            
            
        else:
            
            parent_id = dict_an['dict_aligments'].get(edge_info['nodes_ids'][0], None)
            child_id = dict_an['dict_aligments'].get(edge_info['nodes_ids'][1], None) 

            
            pair = (parent_id, child_id)
            
            dep_parent_to_child, dep_child_to_parent = get_dependency_direction(annotated_sentence=dict_an['annotated_sentence'], parent_id=parent_id, child_id=child_id)
            
            type_response = "match_aligments"
            
            print(dep_parent_to_child, dep_child_to_parent)

        ann_sent = dict_an['annotated_sentence']
        
        if not dep_parent_to_child and  not dep_child_to_parent:
            #print("dep", dep_parent_to_child, dep_child_to_parent)
            dep = "nao_tem_dep"
            
        else: 
            
            if dep_parent_to_child and not dep_child_to_parent:
                
                dep = ann_sent[child_id]['dep']
                
            elif dep_child_to_parent and not dep_parent_to_child:
                
                if type(child_id) != tuple:
                
                    dep = ann_sent[child_id]['dep']
                else:
                    dep_list = []
                    for id in child_id:
                        dep_list.append(ann_sent[id]['dep'])
                        
                    # remove a dep que indica que o token é continuação de um nome (ex: anotonio eduardo, eduardo tem dep flat:name)
                    dep_list = [dep for dep in dep_list if dep != "flat:name"]
                    
                    
                    if len(dep_list) == 1:
                        dep = dep_list[0]
                    else:
                        raise Exception("erro")    
                    
                    
                    
            else:
                # a palavra tem dependencia com ela mesma
                if parent_id == child_id:
                    
                    # pega a dependencia (nao importa se pega do pai ou do filho)
                    dep = ann_sent[child_id]['dep']
                    
                else:
                
                    raise Exception("Há dependencia de duas mãos") 
                
                
                            
        #### lembrar de colocar o parent como o token do parent, nao oq veio do amr
        new_row = {
            "corpus_name": dict_an['corpus_name'],
            "id": dict_an['id'],
            "parent": remove_num_text(parent) if parent is not None else None,
            "child":remove_num_text(child) if child is not None else None,
            "label": edge_info["value"],
            "dep": dep
        }
        
        # anotacoes inuteis
        useless_cols = ["head_index", "dep", "id"]
        
        if parent_id is not None:
            
            if type(parent_id) != tuple:
                ann_parent = ann_sent[parent_id]
                dict_parent = {f'parent_{key}':value for key, value in ann_parent.items() if key not in useless_cols}
                
            else:
                
                for key, value in ann_parent.items():
                    
                    if key not in useless_cols:
                        
                        values = [ann_sent[id][key] for id in parent_id]
                        
                        # checa se todos os elementos são iguais
                        if all(i == values[0] for i in values):
                            # todos os elementos sao iguais
                            final_value = values[0]
                        else:
                            # sao diferentes
                            final_value = " ".join(values)
                            
                        
                        dict_parent.update({f'parent_{key}': final_value})
            
        else: 
            dict_parent = {f'parent_{key}':"token_nao_encontrado_no_texto" for key in ['text', 'lemma', 'pos', 'tag', 'shape', 'is_alpha', 'is_stop', 'morph', 'ner']}
        
        
        # cria dict com as features
        if child_id is not None:
            
            if type(child_id) != tuple:
                ann_child = ann_sent[child_id]
                dict_child = {f'child_{key}':value for key, value in ann_child.items() if key not in useless_cols}
                
            else:
                
                for key, value in ann_child.items():
                    
                    if key not in useless_cols:
                        
                        values = [ann_sent[id][key] for id in child_id]
                        
                        # checa se todos os elementos são iguais
                        if all(i == values[0] for i in values):
                            # todos os elementos sao iguais
                            final_value = values[0]
                        else:
                            # sao diferentes
                            final_value = " ".join([str(value) for value in values])
                            
                        
                        dict_child.update({f'child_{key}': final_value})
            
        else:
            dict_child = {f'child_{key}':"token_nao_encontrado_no_texto" for key in ['text', 'lemma', 'pos', 'tag', 'shape', 'is_alpha', 'is_stop', 'morph', 'ner']}


        new_row.update(dict_parent)
        new_row.update(dict_child)        
        list_rows.append(new_row)
        list_errors_match.append(type_response)
            
        
df = pd.DataFrame(list_rows)
df

100%|██████████| 2959/2959 [00:00<00:00, 19414.34it/s]

True False
False True
False True
False True
False True
False False
False True
False False
False True
False False
True False
False False
True False
True False
False True
False False
False False
False True
False True
False True
False False
False True
True False
False False
True True
False True
False False
True False
False False
True False
False True
False True
True True
False True
False True
False True
False True
True False
False True
False True
False True
False True
False True
False True
False True
False False
False False
False False
False True
False False
False True
False True
False True
True True
False False
False False
False False
False False
False True
False True
False False
False False
False True
False False
False True
False False
False True
False False
False False
False True
False False
False False
False True
False True
False True
False True
False False
False False
True False
False True
False True
False False
False True
False True
False False
False True
False True
True False
False




Unnamed: 0,corpus_name,id,parent,child,label,dep,parent_text,parent_lemma,parent_pos,parent_tag,...,child_lemma,child_pos,child_tag,child_shape,child_is_alpha,child_is_stop,child_morph,child_ner,child_ner_start_end,parent_ner_start_end
0,sci,1,ter,person,:ARG0,nsubj,tem,ter,VERB,VERB,...,Meyer,PROPN,PROPN,Xxxxx,True,False,Gender=Masc|Number=Sing,PER,"(0, 1)",
1,sci,1,person,explicar,:ARG1,nao_tem_dep,Meyer,Meyer,PROPN,PROPN,...,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,,"(0, 1)"
2,sci,1,explicar,person,:ARG0,nao_tem_dep,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,...,Meyer,PROPN,PROPN,Xxxxx,True,False,Gender=Masc|Number=Sing,PER,"(0, 1)",
3,sci,1,explicar,coisa,:ARG1,nao_tem_dep,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,...,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,,
4,sci,1,coisa,resultar,:ARG2-of,nao_tem_dep,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,...,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7041,news,esporte-4-169,person,confirmar,:ARG0-of,nao_tem_dep,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,...,confirmar,VERB,VERB,xxxx,True,False,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,,,
7042,news,ciencia-4-171,possible,chamar,:ARG1,nao_tem_dep,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,...,chamar,VERB,VERB,xxxx,True,False,VerbForm=Inf,,,
7043,news,ciencia-4-171,chamar,nós,:ARG0,nao_tem_dep,chamar,chamar,VERB,VERB,...,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,token_nao_encontrado_no_texto,,
7044,news,ciencia-4-171,chamar,fenômeno,:ARG1,obj,chamar,chamar,VERB,VERB,...,fenômeno,NOUN,NOUN,xxxx,True,False,Gender=Masc|Number=Sing,,,


In [None]:
import re

def create_alignment(text):
    # Extrai os tokens do texto anotado
    tokens_line = ""
    amr_line = ""
    
    for line in text.split('\n'):
        if line.startswith("# ::tok"):
            tokens_line = line
        elif not line.startswith("#"):
            amr_line = line
    
    if not tokens_line or not amr_line:
        return None
    
    # Remove o prefixo e separa os tokens
    tokens = tokens_line.replace("# ::tok ", "").strip().split()
    
    # Extrai os nós do AMR usando regex para encontrar padrões (nós)
    nodes = re.findall(r'\((\S+)', amr_line)
    
    # Se o número de tokens não coincide com o número de nós, retornamos None
    if len(tokens) != len(nodes):
        return None
    
    # Cria o dicionário de alinhamentos
    alignment_dict = {node: i for i, node in enumerate(nodes)}
    
    return alignment_dict

# Exemplos de uso
text_without_alignments = """# ::id poder-2-35-33
# ::tok Tem que voltar já .
# ::tok-en 
# ::alignments-bren 
(o / obligate-01~e.1 :ARG2 (v / voltar-01~e.2 :ARG1 (e / ele~e.0) :time (j / já~e.3)))"""

print(create_alignment(text_without_alignments))  # Saída esperada: {'o': 0, 'v': 1, 'e': 2, 'j': 3}


None


In [None]:
df.columns

Index(['corpus_name', 'id', 'parent', 'child', 'label', 'dep', 'parent_text',
       'parent_lemma', 'parent_pos', 'parent_tag', 'parent_shape',
       'parent_is_alpha', 'parent_is_stop', 'parent_morph', 'parent_ner',
       'child_text', 'child_lemma', 'child_pos', 'child_tag', 'child_shape',
       'child_is_alpha', 'child_is_stop', 'child_morph', 'child_ner',
       'child_ner_start_end', 'parent_ner_start_end'],
      dtype='object')

In [None]:
unique, counts = np.unique(list_errors_match, return_counts=True) 
print(np.asarray((unique, counts)).T)

[['match_aligments' '1074']
 ['parent_0_child_0' '1052']
 ['parent_0_child_1' '1309']
 ['parent_0_child_2' '77']
 ['parent_0_child_3' '8']
 ['parent_0_child_4' '2']
 ['parent_0_child_None' '1']
 ['parent_1_child_0' '1085']
 ['parent_1_child_1' '2212']
 ['parent_1_child_2' '113']
 ['parent_1_child_3' '15']
 ['parent_1_child_4' '5']
 ['parent_2_child_0' '23']
 ['parent_2_child_1' '46']
 ['parent_2_child_2' '10']
 ['parent_3_child_0' '3']
 ['parent_3_child_1' '4']
 ['parent_3_child_2' '5']
 ['parent_3_child_3' '1']
 ['parent_4_child_1' '1']]


In [None]:
# dropar todas as linhas que tenham token_nao_encontrado_no_texto
df = df.drop(df.index[df.applymap(lambda x: x == 'token_nao_encontrado_no_texto').any(axis=1)])

  df = df.drop(df.index[df.applymap(lambda x: x == 'token_nao_encontrado_no_texto').any(axis=1)])


In [None]:
df.to_csv("../data/processed/processed_dataset.csv", index=False)