In [8]:
#! pip install datasets seqeval pandas transformers numpy torch ipywidgets
import pandas as pd
#!pip install datasets
from datasets import Dataset, DatasetDict

In [9]:
import torch
#!pip install transformers
#!pip install --upgrade pip
from ipywidgets import IntProgress
import ast
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, PreTrainedModel
import itertools

In [10]:
from transformers import DataCollatorForTokenClassification
from datasets import load_metric
import numpy as np
from sklearn.model_selection import train_test_split
import string
import csv
import collections

### Load Dataset

In [11]:
 def read_csv_to_df(csv_file):
    #dataframe = pd.read_csv(csv_file, sep = ",")
    dataframe = pd.read_csv(csv_file, sep = ",", converters={'tokens': eval, 'srl_tags': eval})
    return dataframe

In [13]:
import os
tagged_file = os.path.abspath("C:/Users/dimts/Desktop/GitHub/DSP-Norm_Extractor/src/data/train_data/final_filtered_annotations.csv")
dataframe = read_csv_to_df(tagged_file)
df_to_train = dataframe[['sentence_id', 'tokens', 'srl_tags']]
df_to_train

Unnamed: 0,sentence_id,tokens,srl_tags
0,7-Capital_Requirements,"[Significant, subsidiaries, of, EU, parent, fi...","[Actor, Actor, Actor, Actor, Actor, Actor, Act..."
1,19-Capital_Requirements,"[Power, is, delegated, to, the, Commission, to...","[Object, Action, Action, Recipient, Recipient,..."
2,42-Capital_Requirements,"[For, the, purposes, of, calculating, own, fun...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,55-Capital_Requirements,"[Where, an, institution, fails, to, meet, the,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,68-Capital_Requirements,"[Verification, of, market, prices, and, model,...","[Object, Object, Object, Object, Object, Objec..."
...,...,...,...
1534,91-Food_Safety,"[The, implementing, rules, for, the, applicati...","[Object, Object, Object, Object, Object, Objec..."
1535,96-GDPR,"[That, contract, or, other, legal, act, shall,...","[O, O, O, O, O, O, O, O, ,, O, O, ,, O, Actor,..."
1536,98-Digital_Service_Act,"[Providers, of, very, large, online, platforms...","[Actor, Actor, Actor, Actor, Actor, Actor, Act..."
1537,98-Food_Safety,"[The, members, of, the, Management, Board, ,, ...","[Actor, Actor, Actor, Actor, Actor, Actor, Act..."


### Pre-process

In [14]:
 # creating new column with numbers for srl tags instead of text

## 1. create a dict to store translation srl_tags --> numbers
srl_keys={"O": 0,
          "Action": 1,
          "Actor": 2,
          "Object": 3,
          "Recipient": 4}

## 2. rename the srl_tags column to srl_tags_name
df_to_train.rename(columns={'srl_tags': 'srl_tags_name'}, inplace=True)

## 3. map the srl tags to numbers 
srl_tags_numbers = []
for index, row in df_to_train.iterrows():
    srl_tags_numbers.append([srl_keys.get(item) for item in row['srl_tags_name']])

## 4. create a new column with the numbers  
df_to_train['srl_tags'] = srl_tags_numbers

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_train.rename(columns={'srl_tags': 'srl_tags_name'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_train['srl_tags'] = srl_tags_numbers


In [15]:
# print(df_to_train)

# count role distribution in training data
role_counts = dict()
occurence_count = 0
for sentence in df_to_train['srl_tags_name']:
    occurences = collections.Counter(sentence)
    occurence_count = occurence_count + 1
    if occurences['Actor'] > 0:
        role_counts['actor'] = role_counts.setdefault('actor', 0) + 1
    if occurences['Object'] > 0:
        role_counts['object'] = role_counts.setdefault('object', 0) + 1
    if occurences['Recipient'] > 0:
        role_counts['recipient'] = role_counts.setdefault('recipient', 0) + 1
    if occurences['Action'] > 0:
        role_counts['verb'] = role_counts.setdefault('verb', 0) + 1          

print(role_counts)

{'actor': 1370, 'object': 1406, 'verb': 1536, 'recipient': 457}


### Convert to Dataset

In [16]:
dataset = Dataset.from_pandas(df_to_train)
print(dataset)

Dataset({
    features: ['sentence_id', 'tokens', 'srl_tags_name', 'srl_tags'],
    num_rows: 1539
})


In [17]:
print(type(dataset))

<class 'datasets.arrow_dataset.Dataset'>


In [18]:
# 90% train, 10% test + validation
train_testvalid = dataset.train_test_split(test_size=0.1)

# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

# Collect the two into a single DatasetDict
datasets = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['sentence_id', 'tokens', 'srl_tags_name', 'srl_tags'],
        num_rows: 1385
    })
    test: Dataset({
        features: ['sentence_id', 'tokens', 'srl_tags_name', 'srl_tags'],
        num_rows: 77
    })
    validation: Dataset({
        features: ['sentence_id', 'tokens', 'srl_tags_name', 'srl_tags'],
        num_rows: 77
    })
})


### Tokenization

In [19]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [20]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [21]:
 # try tokenizing with a test sentence
tokenized_input = tokenizer(["I", ",", "love", "you", "."], is_split_into_words=True)
print(tokenized_input)

{'input_ids': [101, 1045, 1010, 2293, 2017, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [22]:
# print an example sentence from our dataset
example = datasets["train"][9]
print(example["tokens"])

['The', 'Chair', 'of', 'the', 'Board', 'shall', ',', 'without', 'undue', ',', 'delay', 'inform', 'by', 'electronic', 'means', 'the', 'supervisory', 'authority', 'referred', 'to', ',', 'as', 'the', 'case', 'may', 'be', ',', 'in', 'paragraphs', '1', 'and', '2', ',', 'and', 'the', 'Commission', 'of', 'the', 'opinion', 'and', 'make', 'it', 'public', '.']


In [23]:
#try tokenizing this sentence from our dataset
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', 'the', 'chair', 'of', 'the', 'board', 'shall', ',', 'without', 'und', '##ue', ',', 'delay', 'inform', 'by', 'electronic', 'means', 'the', 'supervisory', 'authority', 'referred', 'to', ',', 'as', 'the', 'case', 'may', 'be', ',', 'in', 'paragraph', '##s', '1', 'and', '2', ',', 'and', 'the', 'commission', 'of', 'the', 'opinion', 'and', 'make', 'it', 'public', '.', '[SEP]']


### The real job

In [24]:
def tokenize_and_align_labels(examples, label_all_tokens = True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["srl_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

preprocessed_input = tokenize_and_align_labels(datasets['train'][:5])

In [25]:
for thing in preprocessed_input:
    print(thing)

input_ids
attention_mask
labels


In [26]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1385 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

In [27]:
print(tokenized_datasets)
tokenized_datasets.save_to_disk("dataset")

DatasetDict({
    train: Dataset({
        features: ['sentence_id', 'tokens', 'srl_tags_name', 'srl_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1385
    })
    test: Dataset({
        features: ['sentence_id', 'tokens', 'srl_tags_name', 'srl_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
    validation: Dataset({
        features: ['sentence_id', 'tokens', 'srl_tags_name', 'srl_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/1385 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/77 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/77 [00:00<?, ? examples/s]

### Fine tuning the model

In [28]:
srl_tags_set = set(itertools.chain.from_iterable(tokenized_datasets['train']['srl_tags_name']))
print(srl_tags_set)
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(srl_tags_set))

{',', ':', 'Object', 'Action', 'Actor', '.', '!', 'O', 'Recipient'}


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
#!pip install transformers[torch]
#pip install accelerate -U
#pip install accelerate>=0.20.1
batch_size = 8

args = TrainingArguments(
    output_dir=".",
    evaluation_strategy = "epoch",
    learning_rate=5e-5,
    #per_device_train_batch_size=batch_size,
    #per_device_eval_batch_size=batch_size,
    num_train_epochs=4,
    weight_decay=0.01
)

Collator batches and pads the examples to the same length. Also, pads the resulting labels accordingly

In [30]:
data_collator = DataCollatorForTokenClassification(tokenizer)

Metric to use

In [31]:
#!pip install seqeval
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [32]:
#label_list = ["O", "Action", "Actor", "Object", "Recipient"]
label_list = list(srl_tags_set)
labels_str = tokenized_datasets['train']['srl_tags_name'][0]
labels_indices = [label_list.index(label) for label in labels_str]

#metric.compute(predictions=[labels_indices], references=[labels_indices])

Post-processing the predictions

In [33]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [34]:
#Replace other None values (commans, periods etc) to 0

tokenized_datasets['train'] = tokenized_datasets['train'].map(
    lambda example: {'labels': [0 if label is None else label for label in example['labels']]}
)

tokenized_datasets['validation'] = tokenized_datasets['validation'].map(
    lambda example: {'labels': [0 if label is None else label for label in example['labels']]}
)

tokenized_datasets['test'] = tokenized_datasets['test'].map(
    lambda example: {'labels': [0 if label is None else label for label in example['labels']]}
)

Map:   0%|          | 0/1385 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

In [35]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
#trainer.train()

Save and reload the model

In [36]:
trainer.save_model("./trained_model")
model = AutoModelForTokenClassification.from_pretrained("./trained_model")

### Predict for a training example

In [37]:
#seq = "The lead supervisory authority shall , without delay , communicate the relevant information on the matter to the other supervisory authorities concerned . "
seq = "It shall identify possible events or future changes in economic conditions that could have unfavourable effects on an institution's credit exposures and assess the institution's ability to withstand such changes .  "
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(seq)))
inputs = tokenizer.encode(seq, return_tensors="pt")

outputs = model(inputs)[0]
predictions = torch.argmax(outputs, dim=2)
# Get the positions of [CLS] and [SEP] in the tokenized sequence
cls_index = tokens.index('[CLS]')
sep_index = tokens.index('[SEP]')

# Set labels for [CLS] and [SEP] tokens to -100
predictions[0, cls_index] = -100
predictions[0, sep_index] = -100

In [38]:
print(tokens, " / ")
print(predictions)

['[CLS]', 'it', 'shall', 'identify', 'possible', 'events', 'or', 'future', 'changes', 'in', 'economic', 'conditions', 'that', 'could', 'have', 'un', '##fa', '##vo', '##urable', 'effects', 'on', 'an', 'institution', "'", 's', 'credit', 'exposure', '##s', 'and', 'assess', 'the', 'institution', "'", 's', 'ability', 'to', 'withstand', 'such', 'changes', '.', '[SEP]']  / 
tensor([[-100,    2,    1,    1,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    0,    1,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    0, -100]])


In [40]:
print(tokenized_datasets['train']['tokens'][2])
print(tokenized_datasets['train']['input_ids'][2])

['For', 'exposures', 'to', 'corporates', 'situated', 'in', 'the', 'Union', 'and', 'having', 'consolidated', 'sales', 'and', 'consolidated', 'assets', 'of', 'less', 'than', 'EUR', '500', 'million', ',', 'institutions', 'may', 'choose', 'to', 'consistently', 'set', 'M', 'as', 'set', 'out', 'in', 'paragraph', '1', 'instead', 'of', 'applying', 'paragraph', '2', '.']
[101, 2005, 7524, 2015, 2000, 5971, 2015, 4350, 1999, 1996, 2586, 1998, 2383, 10495, 4341, 1998, 10495, 7045, 1997, 2625, 2084, 7327, 2099, 3156, 2454, 1010, 4896, 2089, 5454, 2000, 10862, 2275, 1049, 2004, 2275, 2041, 1999, 20423, 1015, 2612, 1997, 11243, 20423, 1016, 1012, 102]


In [39]:
for sequence, labels in zip(tokenized_datasets['test']['tokens'], tokenized_datasets['test']['labels']):
    sequence = ' '.join(sequence)
    sequence = sequence.translate(str.maketrans('', '', string.punctuation))
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
    inputs = tokenizer.encode(sequence, return_tensors="pt")

    outputs = model(inputs)[0]
    predictions = torch.argmax(outputs, dim=2)

    cls_index = tokens.index('[CLS]')
    sep_index = tokens.index('[SEP]')
    
    # Set labels for [CLS] and [SEP] tokens to -100
    predictions[0, cls_index] = -100
    predictions[0, sep_index] = -100
    
    print(tokens)
    print(inputs) #, '\n', [(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].tolist())], '\n')
    print(predictions)
    print("\n\n")
    break

['[CLS]', 'where', 'requests', 'from', 'a', 'data', 'subject', 'are', 'manifest', '##ly', 'un', '##founded', 'or', 'excessive', 'in', 'particular', 'because', 'of', 'their', 'repetitive', 'character', 'the', 'controller', 'may', 'either', 'charge', 'a', 'reasonable', 'fee', 'taking', 'into', 'account', 'the', 'administrative', 'costs', 'of', 'providing', 'the', 'information', 'or', 'communication', 'or', 'taking', 'the', 'action', 'requested', 'or', '[SEP]']
tensor([[  101,  2073, 11186,  2013,  1037,  2951,  3395,  2024, 19676,  2135,
          4895, 21001,  2030, 11664,  1999,  3327,  2138,  1997,  2037, 23563,
          2839,  1996, 11486,  2089,  2593,  3715,  1037,  9608,  7408,  2635,
          2046,  4070,  1996,  3831,  5366,  1997,  4346,  1996,  2592,  2030,
          4807,  2030,  2635,  1996,  2895,  7303,  2030,   102]])
tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    2,    2