In [59]:
import transformers
import pandas as pd

from data_utils import *

In [60]:
# Relative paths to the train / dev / test CoNLL-U files that are read by the
# loading cell below.
train_path = '../Data/en_ewt-up-train.conllu'
dev_path = '../Data/en_ewt-up-dev.conllu'
test_path = '../Data/en_ewt-up-test.conllu'

In [61]:
# Read every split and run it through conll_transform, keeping the
# train / dev / test order of the paths.
train_data, dev_data, test_data = (
    conll_transform(read_conll(split_path))
    for split_path in (train_path, dev_path, test_path)
)

In [76]:
def extract_predicate_argument_feats(df):
    """
    Add the predicate/argument feature columns to ``df`` and return it.

    The frame is modified in place (the returned object is ``df`` itself).
    It must provide the string columns ``predicate`` and ``argument_type``.

    Columns added:
        is_token_predicate: 1 where ``predicate`` differs from ``'_'``, else 0
            (possibly redundant with ``predicate`` itself).
        is_token_argument: 1 where ``argument_type`` starts with ``'ARG'``,
            else 0 -- target for argument identification.
        argument_label: ``argument_type`` where it starts with ``'ARG'``,
            otherwise ``'O'`` -- target for argument classification.
    """
    df['is_token_predicate'] = df['predicate'].ne('_').astype(int)

    # Both argument columns derive from the same prefix test, so compute it once.
    arg_types = df['argument_type']
    has_arg_prefix = arg_types.str.startswith('ARG')

    df['is_token_argument'] = has_arg_prefix.astype(int)
    df['argument_label'] = arg_types.where(has_arg_prefix, 'O')

    return df

In [78]:
# Add the predicate/argument feature columns to every split.
train_data, dev_data, test_data = (
    extract_predicate_argument_feats(split_frame)
    for split_frame in (train_data, dev_data, test_data)
)

In [80]:
# Drop the same set of columns from every split, in place.
for _split_frame in (train_data, dev_data, test_data):
    # errors='ignore' keeps this cell re-runnable: on a second execution the
    # columns are already gone and a plain drop would raise KeyError.
    _split_frame.drop(
        columns=['lemma', 'POS', 'morph_type', 'distance_head', 'dep_label', 'dep_rel'],
        inplace=True,
        errors='ignore',
    )

In [110]:
# represent each sentence as a list of tokens:
def extract_sentences(df: pd.DataFrame):
    """
    Group the token-level rows of ``df`` into per-sentence lists.

    A new sentence starts at every row whose ``token_id`` equals ``'1'``.
    For each sentence the token list is followed by one extra element,
    ``'[SEP] ' + token`` of the last predicate token seen in that sentence
    (rows with ``is_token_predicate == 1``). Sentences without any predicate
    get no such mark. Because of the mark, a token list can be one element
    longer than the matching label lists.

    Args:
        df: frame with the columns ``token_id``, ``token``, ``sent_id``,
            ``is_token_predicate``, ``is_token_argument`` and
            ``argument_label``, one row per token in reading order.

    Returns:
        A 5-tuple of parallel lists with one entry per sentence:
        ``(sentences, predicates, arguments, arg_label, sentence_ids)``.
        All five lists are empty when ``df`` has no rows.
    """
    sentences = []
    predicates = []
    arguments = []
    arg_label = []
    sentence_ids = []

    current_sent = []
    current_sent_predicates = []
    current_sent_arguments = []
    current_sent_arg_label = []
    current_id = None
    # None means "no predicate seen in the current sentence yet".
    baseline_mark = None

    def flush():
        """Store the sentence accumulated so far in the result lists."""
        if baseline_mark is not None:
            # add baseline mark for predicate
            current_sent.append(baseline_mark)

        sentences.append(current_sent)
        predicates.append(current_sent_predicates)
        arguments.append(current_sent_arguments)
        arg_label.append(current_sent_arg_label)
        sentence_ids.append(current_id)

    for _, row in df.iterrows():
        if row['token_id'] == '1' and current_sent:
            flush()

            current_sent = []
            current_sent_predicates = []
            current_sent_arguments = []
            current_sent_arg_label = []
            # Reset so a predicate-less sentence cannot inherit the mark of
            # the sentence before it.
            baseline_mark = None

        if row['is_token_predicate'] == 1:
            baseline_mark = '[SEP] ' + row['token']

        current_sent.append(row['token'])
        current_sent_predicates.append(row['is_token_predicate'])
        current_sent_arguments.append(row['is_token_argument'])
        current_sent_arg_label.append(row['argument_label'])
        current_id = row['sent_id']

    # The loop only flushes when the *next* sentence begins, so the final
    # sentence has to be stored explicitly.
    if current_sent:
        flush()

    return sentences, predicates, arguments, arg_label, sentence_ids

In [111]:
# Split the token-level training frame into parallel per-sentence lists.
sents, predicates, arguments, arg_label, sentence_ids = extract_sentences(train_data)

# Assemble one row per sentence; the keyword order fixes the column order.
formatted_train = pd.DataFrame(dict(
    sentence_id=sentence_ids,
    sentences=sents,
    is_predicate=predicates,  # 0/1 flag per token
    is_argument=arguments,    # 0/1 flag per token
    arg_labels=arg_label,     # one label string per token
))

In [113]:
# Display the per-sentence training frame assembled from extract_sentences' output.
formatted_train

Unnamed: 0,sentence_id,sentences,is_predicate,is_argument,arg_labels
0,weblog-juancole.com_juancole_20051126063000_EN...,"[Al, -, Zaman, :, American, forces, killed, Sh...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[O, O, O, O, O, ARG0, O, ARG1, O, O, O, O, O, ..."
1,weblog-juancole.com_juancole_20051126063000_EN...,"[[, This, killing, of, a, respected, cleric, w...","[0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[O, O, O, O, O, O, ARG1, O, O, O, O, O, O, O, ..."
2,weblog-juancole.com_juancole_20051126063000_EN...,"[DPA, :, Iraqi, authorities, announced, that, ...","[0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[O, O, O, ARG0, O, O, O, O, ARG1, O, O, O, O, ..."
3,weblog-juancole.com_juancole_20051126063000_EN...,"[Two, of, them, were, being, run, by, 2, offic...","[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,weblog-juancole.com_juancole_20051126063000_EN...,"[The, MoI, in, Iraq, is, equivalent, to, the, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[O, ARG1, O, O, O, ARG2, O, O, O, O, O, O, O, ..."
...,...,...,...,...,...
41469,reviews-319816-0028,"[The, employees, at, this, Sear's, are, comple...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...","[O, O, O, O, O, O, O, O, O, ARG1, O, ARGM-NEG,..."
41470,reviews-319816-0028,"[The, employees, at, this, Sear's, are, comple...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[O, O, O, O, O, O, O, O, O, ARG1, O, O, O, O, ..."
41471,reviews-319816-0028,"[The, employees, at, this, Sear's, are, comple...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
41472,reviews-319816-0028,"[The, employees, at, this, Sear's, are, comple...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [84]:
from transformers import AutoTokenizer

# Checkpoint name handed to AutoTokenizer.from_pretrained; the recorded output
# below shows the tokenizer files being fetched for it.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [87]:
# List the tokenizer's special tokens; '[SEP]' is the one used for the
# '[SEP] <predicate>' mark built in extract_sentences.
tokenizer.all_special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [107]:
# Probe: a literal "[SEP]" inside a raw string. In the recorded output it
# becomes a single id (102) between the two words -- presumably the separator
# token id rather than word pieces; verify with convert_ids_to_tokens.
tokenizer('I [SEP] went')

{'input_ids': [101, 1045, 102, 2253, 102], 'attention_mask': [1, 1, 1, 1, 1]}

In [106]:
# Probe: a nested list of strings. The recorded output has two consecutive 102
# ids after "I", which suggests the inner list was encoded as a
# (text, text_pair) pair rather than as pre-tokenized words.
# NOTE(review): pre-tokenized input normally needs is_split_into_words=True --
# confirm against the tokenizer documentation.
tokenizer([['I', '[SEP] went']])

{'input_ids': [[101, 1045, 102, 102, 2253, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1]]}

In [105]:
# Probe: "[SEP]" on its own. The recorded output is three ids, 101 then 102
# twice -- presumably [CLS], the literal [SEP], and the [SEP] the tokenizer
# appends itself; verify with convert_ids_to_tokens.
tokenizer('[SEP]')

{'input_ids': [101, 102, 102], 'attention_mask': [1, 1, 1]}