In [1]:
import transformers
import pandas as pd

from data_utils import *

In [18]:
# Global parameters:
BASELINE = True # True for baseline model, false for advanced
MODEL_NAME = "distilbert-base-uncased"
MULTILABEL = True

if MULTILABEL:
    task = 'argument-classification' # multilabel
else:
    task = 'argument-identification' # binary
    
if BASELINE:
    model_type = 'baseline'
else:
    model_type = 'advanced'
    

Importing data and applying transformation for conll format:

In [3]:
train_path = '../Data/en_ewt-up-train.conllu'
dev_path = '../Data/en_ewt-up-dev.conllu'
test_path = '../Data/en_ewt-up-test.conllu'

In [4]:
train_data = conll_transform(read_conll(train_path))
dev_data = conll_transform(read_conll(dev_path))
test_data = conll_transform(read_conll(test_path))

In [5]:
def extract_predicate_argument_feats(df):
    """
    Fuction to extract argument and predicate features from transformed
    conll format data.
    
    params:
    df: Dataframe of transformed conll data
    """
    # feature to indicate if the token is a predicate; maybe redundant
    df['is_token_predicate'] = (df['predicate'] != '_').astype(int)
    # feature for classification task 1: argument identification
    df['is_token_argument'] = (df['argument_type'].str.startswith('ARG')).astype(int)
    # feature for classification task 2: argument classification
    df['argument_label'] = df['argument_type'].apply(lambda x: x if x.startswith('ARG') else 'O')
    
    return df

In [6]:
train_data = extract_predicate_argument_feats(train_data)
dev_data = extract_predicate_argument_feats(dev_data)
test_data = extract_predicate_argument_feats(test_data)

In [7]:
# get rid of unnecessary columns
train_data.drop(['lemma', 'POS','morph_type','distance_head','dep_label','dep_rel'], axis=1, inplace=True)
dev_data.drop(['lemma', 'POS','morph_type','distance_head','dep_label','dep_rel'], axis=1, inplace=True)
test_data.drop(['lemma', 'POS','morph_type','distance_head','dep_label','dep_rel'], axis=1, inplace=True)

In [13]:
task

'argument-classification'

In [11]:
if task == 'argument-identification':
    label_list = set(train_data['is_token_argument'].tolist())
    print(label_list)
elif task == 'argument-classification':
    label_list = set(train_data['argument_label'].tolist())
    # for mapping str labels to int:
    label_mapping = {}
    for e, label in enumerate(label_list):
        label_mapping.update({label: int(e)})
    print(label_list)

{'ARGM-LVB', 'ARGM-ADV', 'ARG2', 'ARGA', 'ARGM-DIR', 'ARG5', 'O', 'ARGM-REC', 'ARGM-DIS', 'ARGM-LOC', 'ARG4', 'ARGM-CXN', 'ARGM-PRR', 'ARGM-ADJ', 'ARGM-MNR', 'ARGM-GOL', 'ARG1', 'ARGM-COM', 'ARGM-CAU', 'ARGM-PRD', 'ARGM-TMP', 'ARG3', 'ARGM-MOD', 'ARGM-EXT', 'ARGM-PRP', 'ARG0', 'ARG1-DSP', 'ARGM-NEG'}


Following function represents each sentence data (tokens, predicate/argument labels, etc) to lists, therefore,
info of each distinct sentence will be stored in designated 

In [53]:
# represent setence in a list:
def extract_sentences(df: pd.DataFrame, baseline=True):
    """
    extracts sentences from 
    """
    model_type = 'baseline' if baseline else 'advanced'  
    
    sentences = []
    arguments = []
    arg_label = []
    sentence_ids = []
    
    current_sent = []
    current_sent_arguments = []
    current_sent_arg_label = []
    
    
    for _, row in df.iterrows():
        if row['token_id'] == '1' and current_sent:
            if model_type == 'baseline':
                # add everything for baseline predicate mark at the end of sentence
                current_sent.append('[SEP]')
                current_sent.append(predicate_token)
                current_sent_arguments.append(-100)
                current_sent_arguments.append(-100)
                current_sent_arg_label.append(-100)
                current_sent_arg_label.append(-100)

            
            sentences.append(current_sent)
            arguments.append(current_sent_arguments)
            arg_label.append(current_sent_arg_label)
            sentence_ids.append(current_id)
            
            current_sent = []
            current_sent_arguments = []
            current_sent_arg_label = []
        
        if model_type == 'baseline': 
            if row['is_token_predicate'] == 1:
                predicate_token = row['token']
        
            current_sent.append(row['token'])
            current_sent_arguments.append(row['is_token_argument'])
            current_sent_arg_label.append(label_mapping[row['argument_label']])
            current_id = row['sent_id']
            
        elif model_type == 'advanced':
            
            if row['is_token_predicate'] == 1:
                # adding special token '[PREDICATE]' before predicate for advanced model
                current_sent.append('[PREDICATE]')
                current_sent.append(row['token'])
                current_sent_arguments.append(-100)
                current_sent_arguments.append(row['is_token_argument'])
                current_sent_arg_label.append(-100)
                current_sent_arg_label.append(label_mapping[row['argument_label']])
                
            else:
                current_sent.append(row['token'])
                current_sent_arguments.append(row['is_token_argument'])
                current_sent_arg_label.append(label_mapping[row['argument_label']])
                current_id = row['sent_id']
           
    return sentences, arguments, arg_label, sentence_ids  

In [54]:
sents,  arguments, arg_label, sentence_ids = extract_sentences(train_data, baseline=BASELINE)

# Create a new DataFrame with the grouped data
formatted_train = pd.DataFrame({
    'sentence_id': sentence_ids,
    'sentences': sents,
    'is_argument': arguments, # binary - is_argument
    'arg_labels': arg_label # multilabel
})

In [55]:
sents,  arguments, arg_label, sentence_ids = extract_sentences(dev_data, baseline=BASELINE)

# Create a new DataFrame with the grouped data
formatted_dev = pd.DataFrame({
    'sentence_id': sentence_ids,
    'sentences': sents,
    'is_argument': arguments, # binary - is_argument
    'arg_labels': arg_label # multilabel
})

In [56]:
sents,  arguments, arg_label, sentence_ids = extract_sentences(test_data, baseline=BASELINE)

# Create a new DataFrame with the grouped data
formatted_test = pd.DataFrame({
    'sentence_id': sentence_ids,
    'sentences': sents,
    'is_argument': arguments, # binary - is_argument
    'arg_labels': arg_label # multilabel
})

In [57]:
formatted_train.head()

Unnamed: 0,sentence_id,sentences,is_argument,arg_labels
0,weblog-juancole.com_juancole_20051126063000_EN...,"[Al, -, Zaman, :, American, forces, killed, Sh...","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[6, 6, 6, 6, 6, 25, 6, 16, 6, 6, 6, 6, 6, 6, 6..."
1,weblog-juancole.com_juancole_20051126063000_EN...,"[[, This, killing, of, a, respected, cleric, w...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 6, 6, 6, 6, 6, 16, 6, 6, 6, 6, 6, 6, 6, 6,..."
2,weblog-juancole.com_juancole_20051126063000_EN...,"[DPA, :, Iraqi, authorities, announced, that, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[6, 6, 6, 25, 6, 6, 6, 6, 16, 6, 6, 6, 6, 6, 6..."
3,weblog-juancole.com_juancole_20051126063000_EN...,"[Two, of, them, were, being, run, by, 2, offic...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ..."
4,weblog-juancole.com_juancole_20051126063000_EN...,"[The, MoI, in, Iraq, is, equivalent, to, the, ...","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 16, 6, 6, 6, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6,..."


Saving processed to csv because running all cells sequentially exhausts all of the system ram

In [58]:
formatted_train.to_csv(f'../Data/transformers_formatted_train_{model_type}.csv', index=False)
formatted_dev.to_csv(f'../Data/transformers_formatted_dev_{model_type}.csv', index=False)
formatted_test.to_csv(f'../Data/transformers_formatted_test_{model_type}.csv', index=False)

In [59]:
formatted_train = pd.read_csv(f'../Data/transformers_formatted_train_{model_type}.csv')
formatted_dev = pd.read_csv(f'../Data/transformers_formatted_dev_{model_type}.csv')
formatted_test = pd.read_csv(f'../Data/transformers_formatted_test_{model_type}.csv')


In [60]:
# the list columns are read as strings by pd.read_csv, thus converting it back to lists
formatted_train = fix_lists(formatted_train)
formatted_dev = fix_lists(formatted_dev)
formatted_test = fix_lists(formatted_test)

In [61]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# adding special token for advanced model:
if BASELINE == False:   
    tokenizer.add_special_tokens({'additional_special_tokens': ['[PREDICATE]']})

In [62]:
def tokenize_and_align_labels(data, multilabel, label_all_tokens = True):
    """
    Tokenizes the input examples and aligns argument labels and ids.

    Parameters:
    data: DataFrame containing tokens, sentence IDs, and argument labels/ids.

    Returns:
    list: A list of new examples with tokenized inputs and aligned labels.
    """
    sentence_lists = data['sentences'].tolist()
    sentence_ids = data['sentence_id'].tolist()
    
    # Tokenize sentences:
    tokenized_inputs = tokenizer(sentence_lists, truncation=True, is_split_into_words=True)
    
    aligned_examples = []
    
    for i, (is_arg, arg_label) in enumerate(zip(data['is_argument'], data['arg_labels'])):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        arg_ids = []
        labels = []
        for word_idx in word_ids:
            if word_idx is None:
                arg_ids.append(-100)
                labels.append(-100)
            elif word_idx != previous_word_idx:
                arg_ids.append(is_arg[word_idx])
                labels.append(arg_label[word_idx])
            else:
                arg_ids.append(is_arg[word_idx] if label_all_tokens else -100)
                labels.append(arg_label[word_idx] if label_all_tokens else -100)
            
            previous_word_idx = word_idx
        
        if multilabel:
            aligned_examples.append({
                'sentence_id': sentence_ids[i],
                'sentence': sentence_lists[i],
                'word_ids': word_ids,
                'input_ids': tokenized_inputs['input_ids'][i],
                'attention_mask': tokenized_inputs['attention_mask'][i],
                'labels': labels,
            })
        else:
            aligned_examples.append({
                'sentence_id': sentence_ids[i],
                'sentence': sentence_lists[i],
                'word_ids': word_ids,
                'input_ids': tokenized_inputs['input_ids'][i],
                'attention_mask': tokenized_inputs['attention_mask'][i],
                'labels': arg_ids,
            })
            
    return aligned_examples

In [63]:
tokenized_train = tokenize_and_align_labels(formatted_train, MULTILABEL)


In [64]:
tokenized_dev = tokenize_and_align_labels(formatted_dev, MULTILABEL)

In [65]:
tokenized_test = tokenize_and_align_labels(formatted_test, MULTILABEL)

In [66]:
print(len(tokenized_train))
print(len(tokenized_dev))
print(len(tokenized_test))

41474
5307
5210


In [67]:
tokenized_train[0].keys()

dict_keys(['sentence_id', 'sentence', 'word_ids', 'input_ids', 'attention_mask', 'labels'])

argument identifcation:

In [68]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback

model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(label_list))
# making sure that special token is added:
model.resize_token_embeddings(len(tokenizer))

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(30522, 768, padding_idx=0)

In [69]:
args = TrainingArguments(
    f"{MODEL_NAME}-finetuned-{model_type}-{task}",
    evaluation_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    save_strategy="epoch"
)

In [70]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [71]:
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]
)

In [72]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.2551,0.275986


TrainOutput(global_step=2593, training_loss=0.28737434860917993, metrics={'train_runtime': 279.8188, 'train_samples_per_second': 148.217, 'train_steps_per_second': 9.267, 'total_flos': 740693836646400.0, 'train_loss': 0.28737434860917993, 'epoch': 1.0})

In [73]:
def aggregate_subtoken_logits(tokenized_data, predictions):
    """
    Aggregates subtoken logits to word level for each example in a tokenized dataset.

    Parameters:
    tokenized_data: A list of tokenized data, where each list is a dictionary containing
                               'sentence' and 'word_ids'.
    predictions: A list of subtoken-level predictions, corresponding to the tokenized examples.
                        Each element in the list is an array of logits for an example.

    Returns:
    list: A list of word-level logits for each example. Each element in the list is an array of aggregated logits,
          corresponding to the words in the example.
    """
    word_level_logits = []

    for index, data in enumerate(tokenized_data):
        tokens = data['sentence']
        word_ids = data['word_ids']
        subtoken_logits = np.array(predictions[index])
        current_word_id = None
        current_word_logits = None
        sentence_logits = []

        for subtoken_index, word_id in enumerate(word_ids):
            if word_id is not None and word_id != current_word_id:
                if current_word_logits is not None:
                    sentence_logits.append(current_word_logits)

                current_word_id = word_id
                current_word_logits = subtoken_logits[subtoken_index].copy()
            elif word_id is not None:
                current_word_logits += subtoken_logits[subtoken_index]

        if current_word_logits is not None:
            sentence_logits.append(current_word_logits)

        word_level_logits.append(np.array(sentence_logits))

    return word_level_logits

In [74]:
def align_labels_with_predictions(tokenized_data):
    """
    Aligns original labels with their corresponding word-level predictions in tokenized data.

    Parameters:
    tokenized_data: A list of tokenized examples, where each example is a dictionary containing
                           'word_ids' and 'labels'. 'word_ids' should be a list of word IDs for each subtoken,
                           and 'labels' should be a list of labels for each subtoken.

    Returns:
    list: A list where each element is a list of aligned labels for the words in the corresponding tokenized example.
    """
    aligned_labels = []

    for item in tokenized_data:
        # Extract word IDs and labels, ignoring special tokens at the start and end
        word_ids = item['word_ids'][1:-1]
        original_labels = item['labels'][1:-1]

        # Aggregate labels based on word IDs
        current_word_id = None
        word_labels = []

        for word_id, label in zip(word_ids, original_labels):
            if word_id is not None and word_id != current_word_id:
                # Start of a new word
                word_labels.append(label)
                current_word_id = word_id

        aligned_labels.append(word_labels)

    return aligned_labels

In [75]:
test_predictions = trainer.predict(tokenized_test)
test_preds = test_predictions.predictions

In [76]:
aggregatted_test_preds = aggregate_subtoken_logits(tokenized_test, test_preds)
aggregatted_test_preds = [np.argmax(pred, axis=1) for pred in aggregatted_test_preds]

In [77]:
aggregatted_test_preds[0]

array([ 6,  6, 16,  6,  6,  6,  6,  6,  6])

In [78]:
aligned_true_test_labels = align_labels_with_predictions(tokenized_test)

In [80]:
aligned_true_test_labels[0]

[6, 6, 16, 6, 6, 2, 6, -100, -100]

In [81]:
def remove_baseline_indexes(predictions, labels, label_list):    

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return true_predictions, true_labels

In [87]:
if MULTILABEL:
    preds, true_labels = remove_baseline_indexes(aggregatted_test_preds, aligned_true_test_labels, 
                                                 label_list=list(label_mapping.values()))
else:
    preds, true_labels = remove_baseline_indexes(aggregatted_test_preds, aligned_true_test_labels, 
                                                 label_list=label_list)

In [88]:
from sklearn.metrics import precision_recall_fscore_support

def calculate_classification_metrics(preds, true_labels, multilabel):
    """
    Calculate precision, recall, f1 score, and macro average metrics for classification results.
    
    Parameters:
    preds: List of list of predictions from token classification
    true_labels: List of list of true labels from token classification
    return: 
    Dictionary with precision, recall, f1 score for each class and macro averages
    """
    # Flatten the predictions and true labels lists
    preds_flat = [p for sublist in preds for p in sublist]
    true_flat = [t for sublist in true_labels for t in sublist]
    
    # Extract unique classes
    if multilabel:
        classes = list(label_mapping.values())
    else:
        classes = sorted(set(true_flat))
    
    # Calculate precision, recall, and F1 score for each class
    precision, recall, f1, _ = precision_recall_fscore_support(true_flat, preds_flat, labels=classes)
    
    # Calculate macro averages
    precision_macro = np.mean(precision)
    recall_macro = np.mean(recall)
    f1_macro = np.mean(f1)
    
    # Create a dictionary to store the metrics
    metrics = {
        'classes': label_list if multilabel else classes,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'macro': {
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'f1_macro': f1_macro
        }
    }
    
    return metrics

In [89]:
results = calculate_classification_metrics(preds, true_labels, MULTILABEL)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [90]:
results

{'classes': {'ARG0',
  'ARG1',
  'ARG1-DSP',
  'ARG2',
  'ARG3',
  'ARG4',
  'ARG5',
  'ARGA',
  'ARGM-ADJ',
  'ARGM-ADV',
  'ARGM-CAU',
  'ARGM-COM',
  'ARGM-CXN',
  'ARGM-DIR',
  'ARGM-DIS',
  'ARGM-EXT',
  'ARGM-GOL',
  'ARGM-LOC',
  'ARGM-LVB',
  'ARGM-MNR',
  'ARGM-MOD',
  'ARGM-NEG',
  'ARGM-PRD',
  'ARGM-PRP',
  'ARGM-PRR',
  'ARGM-REC',
  'ARGM-TMP',
  'O'},
 'precision': array([0.        , 0.55263158, 0.62303665, 0.        , 0.        ,
        0.        , 0.9243318 , 0.        , 0.61842105, 0.6875    ,
        0.        , 0.        , 0.        , 0.75      , 0.5       ,
        0.        , 0.65750286, 0.        , 0.        , 0.        ,
        0.68965517, 0.        , 0.67213115, 0.84615385, 0.33333333,
        0.62363239, 0.        , 0.5625    ]),
 'recall': array([0.        , 0.04393305, 0.10788758, 0.        , 0.        ,
        0.        , 0.9930507 , 0.        , 0.26256983, 0.05583756,
        0.        , 0.        , 0.        , 0.15207373, 0.02816901,
        0.        

In [182]:
save_dict_to_json(results, f'../Results/{MODEL_NAME}-{model_type}-results.json')