In [1]:
import transformers
import pandas as pd

from data_utils import *

Importing data and applying transformation for conll format:

In [2]:
train_path = '../Data/en_ewt-up-train.conllu'
dev_path = '../Data/en_ewt-up-dev.conllu'
test_path = '../Data/en_ewt-up-test.conllu'

In [3]:
train_data = conll_transform(read_conll(train_path))
dev_data = conll_transform(read_conll(dev_path))
test_data = conll_transform(read_conll(test_path))

In [4]:
def extract_predicate_argument_feats(df):
    """
    Fuction to extract argument and predicate features from transformed
    conll format data.
    
    params:
    df: Dataframe of transformed conll data
    """
    # feature to indicate if the token is a predicate; maybe redundant
    df['is_token_predicate'] = (df['predicate'] != '_').astype(int)
    # feature for classification task 1: argument identification
    df['is_token_argument'] = (df['argument_type'].str.startswith('ARG')).astype(int)
    # feature for classification task 2: argument classification
    df['argument_label'] = df['argument_type'].apply(lambda x: x if x.startswith('ARG') else 'O')
    
    return df

In [5]:
train_data = extract_predicate_argument_feats(train_data)
dev_data = extract_predicate_argument_feats(dev_data)
test_data = extract_predicate_argument_feats(test_data)

In [6]:
# get rid of unnecessary columns
train_data.drop(['lemma', 'POS','morph_type','distance_head','dep_label','dep_rel'], axis=1, inplace=True)
dev_data.drop(['lemma', 'POS','morph_type','distance_head','dep_label','dep_rel'], axis=1, inplace=True)
test_data.drop(['lemma', 'POS','morph_type','distance_head','dep_label','dep_rel'], axis=1, inplace=True)

Following function represents each sentence data (tokens, predicate/argument labels, etc) to lists, therefore,
info of each distinct sentence will be stored in designated 

In [7]:
# represent setence in a list:
def extract_sentences(df: pd.DataFrame):
    """
    extracts sentences from 
    """
    sentences = []
    predicates = []
    arguments = []
    arg_label = []
    sentence_ids = []
    
    current_sent = []
    current_sent_predicates = []
    current_sent_arguments = []
    current_sent_arg_label = []
    
    
    for _, row in df.iterrows():
        if row['token_id'] == '1' and current_sent:
            # add everything for baseline predicate mark at the end of sentence
            current_sent.append('[SEP]')
            current_sent.append(predicate_token)
            current_sent_predicates.append(0)
            current_sent_predicates.append(1)
            current_sent_arguments.append(-100)
            current_sent_arguments.append(-100)
            current_sent_arg_label.append(-100)
            current_sent_arg_label.append(-100)
            
            
            sentences.append(current_sent)
            predicates.append(current_sent_predicates)
            arguments.append(current_sent_arguments)
            arg_label.append(current_sent_arg_label)
            sentence_ids.append(current_id)
            
            current_sent = []
            current_sent_predicates = []
            current_sent_arguments = []
            current_sent_arg_label = []
        
        if row['is_token_predicate'] == 1:
            predicate_token = row['token']
        
        current_sent.append(row['token'])
        current_sent_predicates.append(row['is_token_predicate'])
        current_sent_arguments.append(row['is_token_argument'])
        current_sent_arg_label.append(row['argument_label'])
        current_id = row['sent_id']
    
           
    return sentences, predicates, arguments, arg_label, sentence_ids  

In [8]:
sents, predicates, arguments, arg_label, sentence_ids = extract_sentences(train_data)

# Create a new DataFrame with the grouped data
formatted_train = pd.DataFrame({
    'sentence_id': sentence_ids,
    'sentences': sents,
    'is_predicate': predicates, # binary - is_predicate
    'is_argument': arguments, # binary - is_argument
    'arg_labels': arg_label # multilabel
})

In [9]:
sents, predicates, arguments, arg_label, sentence_ids = extract_sentences(dev_data)

# Create a new DataFrame with the grouped data
formatted_dev = pd.DataFrame({
    'sentence_id': sentence_ids,
    'sentences': sents,
    'is_predicate': predicates, # binary - is_predicate
    'is_argument': arguments, # binary - is_argument
    'arg_labels': arg_label # multilabel
})

In [10]:
sents, predicates, arguments, arg_label, sentence_ids = extract_sentences(test_data)

# Create a new DataFrame with the grouped data
formatted_test = pd.DataFrame({
    'sentence_id': sentence_ids,
    'sentences': sents,
    'is_predicate': predicates, # binary - is_predicate
    'is_argument': arguments, # binary - is_argument
    'arg_labels': arg_label # multilabel
})

In [11]:
formatted_train.head()

Unnamed: 0,sentence_id,sentences,is_predicate,is_argument,arg_labels
0,weblog-juancole.com_juancole_20051126063000_EN...,"[Al, -, Zaman, :, American, forces, killed, Sh...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[O, O, O, O, O, ARG0, O, ARG1, O, O, O, O, O, ..."
1,weblog-juancole.com_juancole_20051126063000_EN...,"[[, This, killing, of, a, respected, cleric, w...","[0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[O, O, O, O, O, O, ARG1, O, O, O, O, O, O, O, ..."
2,weblog-juancole.com_juancole_20051126063000_EN...,"[DPA, :, Iraqi, authorities, announced, that, ...","[0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[O, O, O, ARG0, O, O, O, O, ARG1, O, O, O, O, ..."
3,weblog-juancole.com_juancole_20051126063000_EN...,"[Two, of, them, were, being, run, by, 2, offic...","[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,weblog-juancole.com_juancole_20051126063000_EN...,"[The, MoI, in, Iraq, is, equivalent, to, the, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[O, ARG1, O, O, O, ARG2, O, O, O, O, O, O, O, ..."


Saving processed to csv because running all cells sequentially exhausts all of the system ram

In [12]:
formatted_train.to_csv('../Data/transformers_formatted_train.csv', index=False)
formatted_dev.to_csv('../Data/transformers_formatted_dev.csv', index=False)
formatted_test.to_csv('../Data/transformers_formatted_test.csv', index=False)


In [2]:
formatted_train = pd.read_csv('../Data/transformers_formatted_train.csv')
formatted_dev = pd.read_csv('../Data/transformers_formatted_dev.csv')
formatted_test = pd.read_csv('../Data/transformers_formatted_test.csv')

In [3]:
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [4]:
def tokenize_and_align_labels(data):
    """
    Tokenizes the input examples and aligns argument labels and ids.

    Parameters:
    data: DataFrame containing tokens, sentence IDs, and argument labels/ids.

    Returns:
    list: A list of new examples with tokenized inputs and aligned labels.
    """
    sentence_lists = data['sentences'].tolist()
    sentence_ids = data['sentence_id'].tolist()
    
    # Tokenize sentences:
    tokenized_inputs = tokenizer(sentence_lists, truncation=True, is_split_into_words=True)
    
    aligned_examples = []
    
    for i, (is_arg, arg_label) in enumerate(zip(data['is_argument'], data['arg_labels'])):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        arg_ids = []
        labels = []
        for word_idx in word_ids:
            if word_idx is None:
                arg_ids.append(-100)
                labels.append(-100)
            elif word_idx != previous_word_idx:
                arg_ids.append(is_arg[word_idx])
                labels.append(arg_label[word_idx])
            else:
                arg_ids.append(is_arg[word_idx])
                labels.append(arg_label[word_idx])
            
            previous_word_idx = word_idx
            
        aligned_examples.append({
            'sentence_id': sentence_ids[i],
            'sentence': sentence_lists[i],
            'word_ids': word_ids,
            'is_argument': arg_ids,
            'argument_labels': labels,
        })
    return aligned_examples
    
    

In [5]:
tokenized_train = tokenize_and_align_labels(formatted_train)


IndexError: list index out of range

In [None]:
tokenized_train[0]

In [None]:
tokenized_dev = tokenize_and_align_labels(formatted_dev)


In [None]:
tokenized_test = tokenize_and_align_labels(formatted_test)

In [None]:
print(len(tokenized_train))
print(len(tokenized_dev))
print(len(tokenized_test))

In [None]:
tokenized_train[0]

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=2)

In [None]:
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-negation-scope",
    evaluation_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    save_strategy="epoch"
)