In [1]:
import transformers
import pandas as pd

from data_utils import *
from feature_utils import *

In [2]:
# Global parameters:
BASELINE = True # True for baseline model, false for advanced
MODEL_NAME = "distilbert-base-uncased"
MULTILABEL = True

if MULTILABEL:
    task = 'argument-classification' # multilabel
else:
    task = 'argument-identification' # binary
    
if BASELINE:
    model_type = 'baseline'
else:
    model_type = 'advanced'
    

# Preprocessing data
Importing data and applying transformation for conll format:

In [3]:
train_path = '../Data/en_ewt-up-train.conllu'
dev_path = '../Data/en_ewt-up-dev.conllu'
test_path = '../Data/en_ewt-up-test.conllu'

In [4]:
train_data = conll_transform(read_conll(train_path))
dev_data = conll_transform(read_conll(dev_path))
test_data = conll_transform(read_conll(test_path))

In [5]:
train_data = extract_predicate_argument_feats(train_data)
dev_data = extract_predicate_argument_feats(dev_data)
test_data = extract_predicate_argument_feats(test_data)

In [6]:
# get rid of unnecessary columns
train_data.drop(['lemma', 'POS','morph_type','distance_head','dep_label','dep_rel'], axis=1, inplace=True)
dev_data.drop(['lemma', 'POS','morph_type','distance_head','dep_label','dep_rel'], axis=1, inplace=True)
test_data.drop(['lemma', 'POS','morph_type','distance_head','dep_label','dep_rel'], axis=1, inplace=True)

In [7]:
train_data.head()

Unnamed: 0,sent_id,token_id,token,Universal_POS,space,predicate,argument_type,is_token_predicate,is_token_argument,argument_label
0,weblog-juancole.com_juancole_20051126063000_EN...,1,Al,NNP,SpaceAfter=No,_,_,0,0,O
1,weblog-juancole.com_juancole_20051126063000_EN...,2,-,HYPH,SpaceAfter=No,_,_,0,0,O
2,weblog-juancole.com_juancole_20051126063000_EN...,3,Zaman,NNP,_,_,_,0,0,O
3,weblog-juancole.com_juancole_20051126063000_EN...,4,:,:,_,_,_,0,0,O
4,weblog-juancole.com_juancole_20051126063000_EN...,5,American,JJ,_,_,_,0,0,O


In [8]:
if task == 'argument-identification':
    label_list = set(train_data['is_token_argument'].tolist())
    print(label_list)
elif task == 'argument-classification':
    label_list = set(train_data['argument_label'].tolist())
    # for mapping str labels to int:
    label_mapping = {}
    for e, label in enumerate(label_list):
        label_mapping.update({label: int(e)})
    print("argument classification labels:",label_list)

argument classification labels: {'ARGM-LVB', 'ARGM-PRR', 'ARG5', 'ARGM-NEG', 'ARGM-COM', 'ARGM-CAU', 'ARGM-PRD', 'ARGM-ADJ', 'ARGM-REC', 'ARGM-CXN', 'ARGM-GOL', 'ARGM-PRP', 'ARGA', 'ARGM-MNR', 'ARGM-ADV', 'ARG1-DSP', 'ARG4', 'ARG1', 'ARG3', 'ARG0', 'ARGM-MOD', 'ARGM-DIR', 'ARG2', 'O', 'ARGM-DIS', 'ARGM-LOC', 'ARGM-EXT', 'ARGM-TMP'}


Following function represents each sentence data (tokens, predicate/argument labels, etc) to lists where each element is a list of tokens/labels/ids of a given sentence.

In [9]:
def extract_sentences(df: pd.DataFrame, baseline, multilabel):
    """
    Extracts sentences, argument ids and labels from conll format
    and puts it into lists, so that each element of list is list of sentence tokens/labels/ids.
    Also applying either baseline or advanced model transformations for a sentence.
    
    params:
    df: DataFrame of transformed conll with predicate argument features.
    Baseline: True for baseline model else advanced model.
    
    Returns: 
    sentences: list of lists where each element is a token of given sentence. 
    arguments: list of lists of argument ids (binary).
    arg_label: list of lists of argument labels (multilabel). 
    sentence_ids: list of sentence ids
    
    """
    model_type = 'baseline' if baseline else 'advanced'  
    
    sentences = []
    arguments = [] # for argument ids (binary)
    arg_label = [] # for argument labels (multilabel)
    sentence_ids = []
    
    current_sent = []
    current_sent_arguments = []
    current_sent_arg_label = []
    
    
    for _, row in df.iterrows():
        if row['token_id'] == '1' and current_sent:
            if model_type == 'baseline':
                # add everything for baseline predicate mark at the end of sentence
                current_sent.append('[SEP]')
                current_sent.append(predicate_token)
                current_sent_arguments.append(-100)
                current_sent_arguments.append(-100)
                current_sent_arg_label.append(-100)
                current_sent_arg_label.append(-100)

            
            sentences.append(current_sent)
            arguments.append(current_sent_arguments)
            arg_label.append(current_sent_arg_label)
            sentence_ids.append(current_id)
            
            current_sent = []
            current_sent_arguments = []
            current_sent_arg_label = []
        
        if model_type == 'baseline': 
            if row['is_token_predicate'] == 1:
                predicate_token = row['token'] # saving predicate token for baseline model
        
            current_sent.append(row['token'])
            current_sent_arguments.append(row['is_token_argument'])
            
            if multilabel:
                current_sent_arg_label.append(label_mapping[row['argument_label']])
            else:
                current_sent_arg_label.append(row['argument_label'])                
            current_id = row['sent_id']
            
        elif model_type == 'advanced':
            
            if row['is_token_predicate'] == 1:
                # adding special token '[PREDICATE]' before predicate for advanced model
                current_sent.append('[PREDICATE]')
                current_sent.append(row['token'])
                current_sent_arguments.append(-100)
                current_sent_arguments.append(row['is_token_argument'])
                current_sent_arg_label.append(-100)
                if multilabel:
                    current_sent_arg_label.append(label_mapping[row['argument_label']])
                else:
                    current_sent_arg_label.append(row['argument_label'])
                
            else:
                current_sent.append(row['token'])
                current_sent_arguments.append(row['is_token_argument'])
                if multilabel:
                    current_sent_arg_label.append(label_mapping[row['argument_label']])
                else:
                    current_sent_arg_label.append(row['argument_label'])
                current_id = row['sent_id']
           
    return sentences, arguments, arg_label, sentence_ids  

In [10]:
sents,  arguments, arg_label, sentence_ids = extract_sentences(train_data, baseline=BASELINE, multilabel=MULTILABEL)

# Create a new DataFrame with the grouped data
formatted_train = pd.DataFrame({
    'sentence_id': sentence_ids,
    'sentences': sents,
    'is_argument': arguments, # binary - is_argument
    'arg_labels': arg_label # multilabel
})

In [11]:
sents,  arguments, arg_label, sentence_ids = extract_sentences(dev_data, baseline=BASELINE, multilabel=MULTILABEL)

# Create a new DataFrame with the grouped data
formatted_dev = pd.DataFrame({
    'sentence_id': sentence_ids,
    'sentences': sents,
    'is_argument': arguments, # binary - is_argument
    'arg_labels': arg_label # multilabel
})

In [12]:
sents,  arguments, arg_label, sentence_ids = extract_sentences(test_data, baseline=BASELINE, multilabel=MULTILABEL)

# Create a new DataFrame with the grouped data
formatted_test = pd.DataFrame({
    'sentence_id': sentence_ids,
    'sentences': sents,
    'is_argument': arguments, # binary - is_argument
    'arg_labels': arg_label # multilabel
})

In [13]:
formatted_train.head()

Unnamed: 0,sentence_id,sentences,is_argument,arg_labels
0,weblog-juancole.com_juancole_20051126063000_EN...,"[Al, -, Zaman, :, American, forces, killed, Sh...","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[23, 23, 23, 23, 23, 19, 23, 17, 23, 23, 23, 2..."
1,weblog-juancole.com_juancole_20051126063000_EN...,"[[, This, killing, of, a, respected, cleric, w...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[23, 23, 23, 23, 23, 23, 17, 23, 23, 23, 23, 2..."
2,weblog-juancole.com_juancole_20051126063000_EN...,"[DPA, :, Iraqi, authorities, announced, that, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[23, 23, 23, 19, 23, 23, 23, 23, 17, 23, 23, 2..."
3,weblog-juancole.com_juancole_20051126063000_EN...,"[Two, of, them, were, being, run, by, 2, offic...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 2..."
4,weblog-juancole.com_juancole_20051126063000_EN...,"[The, MoI, in, Iraq, is, equivalent, to, the, ...","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[23, 17, 23, 23, 23, 22, 23, 23, 23, 23, 23, 2..."


In [14]:
formatted_train.to_csv(f'../Data/transformers_formatted_train_{model_type}_{task}.csv', index=False)
formatted_dev.to_csv(f'../Data/transformers_formatted_dev_{model_type}_{task}.csv', index=False)
formatted_test.to_csv(f'../Data/transformers_formatted_test_{model_type}_{task}.csv', index=False)

In [15]:
formatted_train = pd.read_csv(f'../Data/transformers_formatted_train_{model_type}_{task}.csv')
formatted_dev = pd.read_csv(f'../Data/transformers_formatted_dev_{model_type}_{task}.csv')
formatted_test = pd.read_csv(f'../Data/transformers_formatted_test_{model_type}_{task}.csv')


In [16]:
# the list columns are read as strings by pd.read_csv, thus converting it back to lists
formatted_train = fix_lists(formatted_train)
formatted_dev = fix_lists(formatted_dev)
formatted_test = fix_lists(formatted_test)

In [17]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# adding special token for advanced model:
if BASELINE == False:   
    tokenizer.add_special_tokens({'additional_special_tokens': ['[PREDICATE]']})

In [18]:
def tokenize_and_align_labels(data, multilabel, label_all_tokens = True):
    """
    Tokenizes the input examples and aligns argument labels and ids.

    Parameters:
    data: DataFrame containing tokens, sentence IDs, and argument labels/ids.
    multilabel: True for argument classifcation else argument identification (binary).
    label_all_tokens: bool for labeling all tokens.

    Returns:
    list: A list of new examples with tokenized inputs and aligned labels.
    """
    sentence_lists = data['sentences'].tolist()
    sentence_ids = data['sentence_id'].tolist()
    
    # Tokenize sentences:
    tokenized_inputs = tokenizer(sentence_lists, truncation=True, is_split_into_words=True)
    
    aligned_examples = []
    
    for i, (is_arg, arg_label) in enumerate(zip(data['is_argument'], data['arg_labels'])):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        arg_ids = []
        labels = []
        for word_idx in word_ids:
            if word_idx is None: # set arg id and label to -100 for first and last special tokens
                arg_ids.append(-100)
                labels.append(-100)
            elif word_idx != previous_word_idx:
                arg_ids.append(is_arg[word_idx])
                labels.append(arg_label[word_idx])
            else:
                arg_ids.append(is_arg[word_idx] if label_all_tokens else -100)
                labels.append(arg_label[word_idx] if label_all_tokens else -100)
            
            previous_word_idx = word_idx
        
        if multilabel:
            aligned_examples.append({
                'sentence_id': sentence_ids[i],
                'sentence': sentence_lists[i],
                'word_ids': word_ids,
                'input_ids': tokenized_inputs['input_ids'][i],
                'attention_mask': tokenized_inputs['attention_mask'][i],
                'labels': labels,
            })
        else:
            aligned_examples.append({
                'sentence_id': sentence_ids[i],
                'sentence': sentence_lists[i],
                'word_ids': word_ids,
                'input_ids': tokenized_inputs['input_ids'][i],
                'attention_mask': tokenized_inputs['attention_mask'][i],
                'labels': arg_ids,
            })
            
    return aligned_examples

In [19]:
tokenized_train = tokenize_and_align_labels(formatted_train, MULTILABEL)

In [20]:
tokenized_dev = tokenize_and_align_labels(formatted_dev, MULTILABEL)

In [21]:
tokenized_test = tokenize_and_align_labels(formatted_test, MULTILABEL)

number of examples in each set:

In [22]:
print(len(tokenized_train))
print(len(tokenized_dev))
print(len(tokenized_test))

41474
5307
5210


In [23]:
tokenized_train[0].keys()

dict_keys(['sentence_id', 'sentence', 'word_ids', 'input_ids', 'attention_mask', 'labels'])

In [24]:
tokenized_train[0]

{'sentence_id': 'weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0001',
 'sentence': ['Al',
  '-',
  'Zaman',
  ':',
  'American',
  'forces',
  'killed',
  'Shaikh',
  'Abdullah',
  'al',
  '-',
  'Ani',
  ',',
  'the',
  'preacher',
  'at',
  'the',
  'mosque',
  'in',
  'the',
  'town',
  'of',
  'Qaim',
  ',',
  'near',
  'the',
  'Syrian',
  'border',
  '.',
  '[SEP]',
  'killed'],
 'word_ids': [None,
  0,
  1,
  2,
  2,
  3,
  4,
  5,
  6,
  7,
  7,
  8,
  9,
  10,
  11,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  22,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  None],
 'input_ids': [101,
  2632,
  1011,
  23564,
  2386,
  1024,
  2137,
  2749,
  2730,
  21146,
  28209,
  14093,
  2632,
  1011,
  2019,
  2072,
  1010,
  1996,
  14512,
  2012,
  1996,
  8806,
  1999,
  1996,
  2237,
  1997,
  1053,
  4886,
  2213,
  1010,
  2379,
  1996,
  9042,
  3675,
  1012,
  102,
  2730,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1

# Model setup and training

In [25]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback

model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(label_list))
# making sure that special token is added:
model.resize_token_embeddings(len(tokenizer))

2024-03-03 22:45:35.026430: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-03 22:45:35.873605: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2024-03-03 22:45:35.873738: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

Embedding(30522, 768, padding_idx=0)

In [26]:
args = TrainingArguments(
    f"{MODEL_NAME}-finetuned-{model_type}-{task}",
    evaluation_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    save_strategy="epoch"
)

In [27]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [28]:
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]
)

In [29]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.2649,0.269063
2,0.2334,0.252631
3,0.2181,0.24778
4,0.2101,0.245847
5,0.2015,0.248094
6,0.1975,0.248908
7,0.193,0.250099


Checkpoint destination directory distilbert-base-uncased-finetuned-baseline-argument-classification/checkpoint-1297 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilbert-base-uncased-finetuned-baseline-argument-classification/checkpoint-2594 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilbert-base-uncased-finetuned-baseline-argument-classification/checkpoint-3891 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilbert-base-uncased-finetuned-baseline-argument-classification/checkpoint-5188 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilbert-base-uncased-finetuned-baseline-argument-classification/checkpoint-6485 already exists and is non-empty. Saving will proceed but saved results m

TrainOutput(global_step=9079, training_loss=0.2221136002803718, metrics={'train_runtime': 2162.6311, 'train_samples_per_second': 191.776, 'train_steps_per_second': 5.997, 'total_flos': 6076192541468400.0, 'train_loss': 0.2221136002803718, 'epoch': 7.0})

# Testing model
getting predictions on a test set

In [30]:
test_predictions = trainer.predict(tokenized_test)
test_preds = test_predictions.predictions

Aggregating subtoken level logtis with tokens in the data:

In [31]:
aggregatted_test_preds = aggregate_subtoken_logits(tokenized_test, test_preds)
aggregatted_test_preds = [np.argmax(pred, axis=1) for pred in aggregatted_test_preds]

Aligns original labels/ids with their corresponding word-level predictions in tokenized data:

In [32]:
aligned_true_test_labels = align_labels_with_predictions(tokenized_test)

In [33]:
aligned_true_test_labels[0]

[23, 23, 17, 23, 23, 22, 23, -100, -100]

remove special token indexes from predictions and gold labels

In [34]:
if MULTILABEL:
    preds, true_labels = remove_special_token_indexes(aggregatted_test_preds, aligned_true_test_labels, 
                                                 label_list=list(label_mapping.values()))
else:
    preds, true_labels = remove_special_token_indexes(aggregatted_test_preds, aligned_true_test_labels, 
                                                 label_list=list(label_list))

computing classification metrics on test set predictions:

In [35]:
def calculate_classification_metrics(preds, true_labels, multilabel):
    """
    Calculate precision, recall, f1 score, and macro average metrics for classification results.
    
    Parameters:
    preds: List of list of predictions from token classification
    true_labels: List of list of true labels from token classification
    multilabel: true for argument classification, else argument identification.
    
    return: 
    Dictionary with precision, recall, f1 score for each class and macro averages
    """
    # Flatten the predictions and true labels lists
    preds_flat = [p for sublist in preds for p in sublist]
    true_flat = [t for sublist in true_labels for t in sublist]
    
    # Extract unique classes
    if multilabel:
        classes = list(label_mapping.values())
    else:
        classes = sorted(set(true_flat))
    
    # Calculate precision, recall, and F1 score for each class
    precision, recall, f1, _ = precision_recall_fscore_support(true_flat, preds_flat, labels=classes)
    
    # Calculate macro averages
    precision_macro = np.mean(precision)
    recall_macro = np.mean(recall)
    f1_macro = np.mean(f1)
    
    # Create a dictionary to store the metrics
    metrics = {
        'classes': list(label_list) if multilabel else list(classes),
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'macro': {
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'f1_macro': f1_macro
        }
    }
    
    return metrics

In [36]:
results = calculate_classification_metrics(preds, true_labels, MULTILABEL)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
results

{'classes': ['ARGM-LVB',
  'ARGM-PRR',
  'ARG5',
  'ARGM-NEG',
  'ARGM-COM',
  'ARGM-CAU',
  'ARGM-PRD',
  'ARGM-ADJ',
  'ARGM-REC',
  'ARGM-CXN',
  'ARGM-GOL',
  'ARGM-PRP',
  'ARGA',
  'ARGM-MNR',
  'ARGM-ADV',
  'ARG1-DSP',
  'ARG4',
  'ARG1',
  'ARG3',
  'ARG0',
  'ARGM-MOD',
  'ARGM-DIR',
  'ARG2',
  'O',
  'ARGM-DIS',
  'ARGM-LOC',
  'ARGM-EXT',
  'ARGM-TMP'],
 'precision': array([0.        , 0.40909091, 0.        , 0.69565217, 0.        ,
        0.35714286, 0.28571429, 0.68181818, 0.        , 0.        ,
        0.        , 0.33333333, 0.        , 0.46153846, 0.60759494,
        0.        , 0.5       , 0.65936473, 0.52173913, 0.61389961,
        0.63362069, 0.33333333, 0.55279503, 0.92759418, 0.6       ,
        0.69444444, 0.73913043, 0.67164179]),
 'recall': array([0.        , 0.26470588, 0.        , 0.15238095, 0.        ,
        0.11363636, 0.04545455, 0.20737327, 0.        , 0.        ,
        0.        , 0.05333333, 0.        , 0.04225352, 0.10041841,
        0.        

In [38]:
save_dict_to_json(results, f'../Results/{MODEL_NAME}-{model_type}-{task}results-final.json')