In [2]:
import conllu
import pandas as pd
from tqdm import tqdm

http://www.nilc.icmc.usp.br/semanticnlp/index.php?id=index&id_sub=principal&dir_sub=includes/projects/propbankbr&dir=includes/projects/propbankbr&lang=pt-br#palmeretal2005

ID: Identificador do token na sentença. (Ex.: 1, 2, 3, ...)

FORM: Forma do token no texto. (Ex.: Brasília, Pesquisa_Datafolha)

LEMMA: Lema ou forma canônica do token. (Ex.: Brasília, Pesquisa_Datafolha)

CPOSTAG: Tag de parte do discurso grosseira (coarse-grained part-of-speech tag). (Ex.: PROP, N, V-PCP)

POSTAG: Tag de parte do discurso detalhada (fine-grained part-of-speech tag). (Ex.: F|S, PR|3S|IND)

FEATS: Características morfológicas do token. (Ex.: (S*, (FCL*)

HEAD: Cabeça do token (geralmente um identificador de outro token ao qual 
este está ligado sintaticamente). (Ex.: (FCL*)

DEPREL: Relação de dependência entre o token e sua cabeça. (Ex.: (FCL(NP*)

PHEAD: Cabeça de projeto (para anotações projetadas ou sugeridas). (Ex.: -)

PDEPREL: Relação de dependência de projeto. (Ex.: -)

SRL: Anotação de papel semântico, que identifica o papel do token em relação ao verbo principal (Ex.: (A0*)

Coref: Informação de co-referência, que conecta entidades referidas de maneiras diferentes ao longo do texto. (Ex.: (AM-ADV*)

In [3]:
path = '../data/raw/propbankbr_v1.1_conll/PropBankBr_v1.1_Const.conll.txt'

# Open the plain text file for reading; assign under 'data'
with open(path, mode="r", encoding="utf-8") as data:
    
    # Read the file contents and assign under 'annotations'
    annotations = data.read()

In [5]:
len(annotations)

12030114

In [20]:
from nltk.corpus.reader.xmldocs import XMLCorpusReader
from nltk.corpus.reader.propbank import PropbankCorpusReader

In [25]:
teste1 = XMLCorpusReader("../data/raw/", "PropBank.Br v.2.xml")

In [28]:
import xml.etree.ElementTree as ET
tree = ET.parse("../data/raw/PropBank.Br v.2.xml")
root = tree.getroot()

In [None]:
import xml.etree.ElementTree as ET
tree = ET.parse("../data/raw/PropBank.Br v.2.xml")
root = tree.getroot()

In [15]:
import pandas as pd

def read_conll(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Descobrir o número máximo de colunas
    max_columns = 0
    for line in lines:
        if line.strip():
            parts = line.split("\t")
            if len(parts) > max_columns:
                max_columns = len(parts)
    
    # Definir as colunas de acordo com o número máximo encontrado
    columns = ["ID", "FORM", "LEMMA", "CPOSTAG", "POSTAG", "FEATS", "HEAD", "DEPREL", "PHEAD", "PDEPREL", "SRL", "Coref"] + [f"Column{i+13}" for i in range(max_columns - 12)]
    
    data = []
    sentence_id = 0
    for line in lines:
        if line.strip():
            parts = line.split("\t")
            # Substituir "-" por None e preencher com None até o número máximo de colunas
            parts = [part if part != "-" else None for part in parts]
            while len(parts) < max_columns:
                parts.append(None)
            parts.append(sentence_id)
            data.append(parts)
        else:
            sentence_id += 1

    df = pd.DataFrame(data, columns=columns + ["SentenceID"])
    return df

df = read_conll(path)
print(df)

          ID                       FORM                      LEMMA  \
0      1      Brasília                   Brasília                    
1      2      Pesquisa_Datafolha         Pesquisa_Datafolha          
2      3      publicada                  publicar                    
3      4      hoje                       hoje                        
4      5      revela                     revelar                     
...      ...                        ...                        ...   
69755  31     de                         de                          
69756  32     J.R.Duran                  J.R.Duran                   
69757  33     ,                          -                           
69758  34     Alexandra_Brochen          Alexandra_Brochen           
69759  35     .                          -                           

          CPOSTAG           POSTAG       FEATS        HEAD           DEPREL  \
0      PROP        F|S              (S*         (FCL*       (FCL(NP*)         
1

In [16]:
df.head()

Unnamed: 0,ID,FORM,LEMMA,CPOSTAG,POSTAG,FEATS,HEAD,DEPREL,PHEAD,PDEPREL,SRL,Coref,Column13,Column14,Column15,Column16,Column17,SentenceID
0,1,Brasília,Brasília,PROP,F|S,(S*,(FCL*,(FCL(NP*),-,-,*,*,*,* \n,,,,0
1,2,Pesquisa_Datafolha,Pesquisa_Datafolha,N,F|S,*,*,(NP*,-,-,(A0*,*,*,* \n,,,,0
2,3,publicada,publicar,V-PCP,F|S,(S*,(ICL*,(ICL(VP*),-,-,*,*,*,* \n,,,,0
3,4,hoje,hoje,ADV,-,*),*),(ADVP*))),-,-,*),*,*,* \n,,,,0
4,5,revela,revelar,V-FIN,PR|3S|IND,*,*,(VP*),01,revelar,(V*),*,*,* \n,,,,0


In [17]:

list_rows = []
for sent_id in tqdm(df.SentenceID.unique()):
    
    df_sent = df[df.SentenceID == sent_id]
    
    root = df_sent[df_sent.SRL.apply(lambda x: x.replace(" ", "")) == "(V*)"]
    
    df_sent.drop(root.index, inplace = True)
    
    root = root.squeeze()
    
    parent_tk = root['FORM']
    parent_lemma = root['LEMMA']
    parent_pos = root['CPOSTAG']
    
    new_row = {}
    for idx, row in df_sent.iterrows():
        
        new_row['parent'] = parent_tk
        new_row['parent_lemma'] = parent_lemma
        new_row['parent_pos'] = parent_pos
        
        new_row['child'] = row['FORM']
        new_row['child_lemma'] = row['LEMMA']
        new_row['child_pos'] = row['CPOSTAG']
        
        new_row['label'] = row['SRL']
        
        new_row['sentence_id'] = row['SentenceID']
        
        list_rows.append(new_row)
        
        
        
df_final = pd.DataFrame(list_rows)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sent.drop(root.index, inplace = True)
100%|██████████| 3348/3348 [00:06<00:00, 502.64it/s]


In [18]:
df_final

Unnamed: 0,parent,parent_lemma,parent_pos,child,child_lemma,child_pos,label,sentence_id
0,revela,revelar,V-FIN,.,-,PU,*,0
1,revela,revelar,V-FIN,.,-,PU,*,0
2,revela,revelar,V-FIN,.,-,PU,*,0
3,revela,revelar,V-FIN,.,-,PU,*,0
4,revela,revelar,V-FIN,.,-,PU,*,0
...,...,...,...,...,...,...,...,...
68086,surpreendeu,surpreender,V-FIN,.,-,PU,*,3347
68087,surpreendeu,surpreender,V-FIN,.,-,PU,*,3347
68088,surpreendeu,surpreender,V-FIN,.,-,PU,*,3347
68089,surpreendeu,surpreender,V-FIN,.,-,PU,*,3347


In [19]:
df_final.label.value_counts()

label
*               40180
*         \n    25991
*)        \n     1437
*)                347
(A1*)     \n       56
(V*)      \n       30
(A2*)     \n       15
(AM-MNR*) \n       12
(A0*)     \n        9
(AM-NEG*) \n        9
(AM-TMP*) \n        5
Name: count, dtype: int64

In [20]:
df_final.label = df_final.label.apply(lambda x: "arg1" if "A1" in x else "arg2" if "A2" in x else "arg0" if "A0" in x else "nao_ha_arg")

In [21]:
df_final.label.value_counts()

label
nao_ha_arg    68011
arg1             56
arg2             15
arg0              9
Name: count, dtype: int64