<a href="https://colab.research.google.com/github/shitkov/courses/blob/master/transformers/transformers_shitkov_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Information about the submission

## 1.1 Name and number of the assignment 

Semantic Role Labelling - #1

## 1.2 Student name

Konstantin Shitkov

## 1.3 Codalab user ID

shitkov

## 1.4 Additional comments

-

# 2. Technical Report

## 2.1 Methodology 

Baseline: LSTM

Experiments:
1.   BERT
1.   BERT + BiLSTM
1.   BERT + CRF
1.   BERT + BiLSTM + CRF
1.   RoBERTa + BiLSTM
1.   RoBERTa + BiLSTM + CRF

## 2.2 Discussion of results

Baseline: LSTM

Experiments:
1.   BERT: 0.67
1.   BERT + BiLSTM: 0.69
1.   BERT + CRF: 0.72
1.   BERT + BiLSTM + CRF: 0.71
1.   RoBERTa + BiLSTM: 0.74
1.   RoBERTa + BiLSTM + CRF: 0.73

Лучшее качество получалось при обучении батчами по 16, но с CRF результат оказался лучше на единичных предложениях без паддингов — возможно, в батчах я некорректно маскировал паддинги. Кроме того, RoBERTa оказалась на пару пунктов лучше BERT.

# 3. Preparation

## 3.1 Download the data

In [None]:
%%capture
!wget https://raw.githubusercontent.com/shitkov/courses/master/transformers/dev.tsv
!wget https://raw.githubusercontent.com/shitkov/courses/master/transformers/evaluate_f1_partial.py

!wget https://raw.githubusercontent.com/s-nlp/semantic-role-labelling/main/train.tsv
!wget https://raw.githubusercontent.com/s-nlp/semantic-role-labelling/main/test_no_answers.tsv
!wget https://raw.githubusercontent.com/s-nlp/semantic-role-labelling/main/evaluation/f1_score_partial.py

## 3.2 Requirements

In [None]:
!pip install transformers -q
!pip install pytorch-crf -q

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from tqdm import tqdm
from sklearn.metrics import f1_score

from evaluate_f1_partial import evaluate_f1

from transformers import AutoTokenizer, RobertaModel, BertModel

import matplotlib.pyplot as plt

from torchcrf import CRF

## 3.3 Preprocessing 

In [None]:
# Locations of the train / dev / test splits in the Colab working directory.
data_train_path = '/content/train.tsv'
data_valid_path = '/content/dev.tsv'
data_test_path = '/content/test_no_answers.tsv'

In [None]:
def read_dataset(filename, splitter="\t"):
    """Read a one-token-per-line file into a list of ``(words, tags)`` pairs.

    Every non-blank line is split on ``splitter`` into a word and a tag (the
    tag is stripped of surrounding whitespace).  A whitespace-only line closes
    the current sentence.  A sentence that is still open when the file ends
    is kept as well, so a missing trailing blank line no longer drops the
    last sentence.

    Args:
        filename: Path of the file to read.
        splitter: Separator between word and tag.  When a newline is passed,
            a line holding only a word yields an empty-string tag.

    Returns:
        A list with one ``(words, tags)`` tuple per sentence; both elements
        are lists of strings of equal length.

    Raises:
        ValueError: If a non-blank line does not split into exactly two parts.
    """
    data = []
    sentence = []
    tags = []
    with open(filename) as f:
        for line in f:
            if not line.isspace():
                word, tag = line.split(splitter)
                sentence.append(word)
                tags.append(tag.strip())
            else:
                data.append((sentence, tags))
                sentence = []
                tags = []
    # Flush the final sentence when the file does not end with a blank line.
    if sentence:
        data.append((sentence, tags))
    return data

In [None]:
# Each split becomes a list of (words, tags) sentences.
training_data = read_dataset(data_train_path)
valid_data = read_dataset(data_valid_path)
# Splitting on the newline leaves an empty tag for each line that holds only a
# word (presumably every line of the unlabeled test file — confirm); the
# tokenisation helpers check `tags[0] != ''` before building labels.
test_data = read_dataset(data_test_path, splitter="\n")

In [None]:
# Checkpoint name passed to both the tokenizer and the encoder.
MODEL_NAME = 'xlm-roberta-large'

# Training hyper-parameters.  BATCH_SIZE is also read by the tokenisation
# helpers to pick the padding strategy ('max_length' when > 1).
EPOCHS = 3
BATCH_SIZE = 16
LR=1e-5

# Scratch file that receives dev-set predictions before they are scored.
temp_path = '/content/ans.tsv'

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 4. BERT + BiLSTM

## 4.1 Data preparation

In [None]:
# Tag vocabulary: "O" plus B-/I- variants of Object, Aspect and Predicate,
# numbered in the order listed.
tag_to_ix = {
    tag: ix
    for ix, tag in enumerate(
        (
            "O",
            "B-Object",
            "I-Object",
            "B-Aspect",
            "I-Aspect",
            "B-Predicate",
            "I-Predicate",
        )
    )
}

# Inverse lookup: class index -> tag string.
idx_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}

# Number of classes the classifier head has to emit.
tagset_size = len(tag_to_ix)

In [None]:
# Tokenizer matching the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def tokenize_and_align_labels(words, tags, tokenizer, tag_to_ix, label_all_tokens=False):
    """Tokenise a pre-split sentence and align word-level tags to sub-tokens.

    Args:
        words: The words of one sentence.
        tags: One tag per word; when ``tags[0]`` is an empty string no
            ``"labels"`` entry is produced.
        tokenizer: Callable tokenizer whose result exposes ``word_ids()``.
        tag_to_ix: Mapping from tag string to class index.
        label_all_tokens: If true, continuation sub-tokens receive their
            word's tag; otherwise they receive ``-100``.

    Returns:
        The tokenizer output, extended with ``"word_ids"`` (``None`` replaced
        by ``-1``) and, for labelled input, ``"labels"`` as a ``LongTensor``
        in which special/padding positions hold ``-100``.
    """
    # The padding strategy follows the module-level BATCH_SIZE.
    padding = 'max_length' if BATCH_SIZE > 1 else 'longest'

    encoding = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors='pt',
        max_length=128,
        padding=padding
    )
    raw_word_ids = encoding.word_ids()

    def to_index(tag):
        # String tags go through the vocabulary; anything else is kept as is.
        return tag_to_ix[tag] if isinstance(tag, str) else tag

    if tags[0] != '':
        aligned = []
        last_seen = None
        for current in raw_word_ids:
            if current is None:
                # Special or padding position.
                aligned.append(-100)
            elif current != last_seen or label_all_tokens:
                aligned.append(to_index(tags[current]))
            else:
                # Continuation piece of a word that is already labelled.
                aligned.append(-100)
            last_seen = current
        encoding["labels"] = torch.LongTensor(aligned)

    encoding["word_ids"] = [-1 if w is None else w for w in raw_word_ids]
    return encoding

In [None]:
class NERDataset(Dataset):
    """Dataset of sentences that are tokenised lazily on access.

    ``data`` is an iterable of ``(words, tags)`` pairs.  With a truthy
    ``train`` an item is the encoding alone; otherwise it is the pair
    ``(words, encoding)`` so the caller can write the words back out.
    """

    def __init__(self, data, tokenizer, tag_to_ix, label_all_tokens, train):
        texts, labels = zip(*data)
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.tag_to_ix = tag_to_ix
        self.label_all_tokens = label_all_tokens
        self.train = train

    def __len__(self):
        """Return the number of sentences."""
        return len(self.labels)

    def __getitem__(self, index):
        """Tokenise sentence ``index`` and return it in the configured form."""
        words = self.texts[index]
        encoded = tokenize_and_align_labels(
            words,
            self.labels[index],
            self.tokenizer,
            self.tag_to_ix,
            self.label_all_tokens,
        )
        return encoded if self.train else (words, encoded)

In [None]:
# Training split: train=True makes each item the bare encoding.
train_dataset = NERDataset(
    data=training_data,
    tokenizer=tokenizer,
    tag_to_ix=tag_to_ix,
    label_all_tokens=False,
    train=True
)

In [None]:
# Dev split: train=False makes each item a (words, encoding) pair.
valid_dataset = NERDataset(
    data=valid_data,
    tokenizer=tokenizer,
    tag_to_ix=tag_to_ix,
    label_all_tokens=False,
    train=False
)

In [None]:
# Test split: train=False makes each item a (words, encoding) pair.
test_dataset = NERDataset(
    data=test_data,
    tokenizer=tokenizer,
    tag_to_ix=tag_to_ix,
    label_all_tokens=False,
    train=False
)

In [None]:
# Shuffled mini-batches of BATCH_SIZE training items.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

## 4.2 Model

In [None]:
class BERT_NER(nn.Module):
    """Pretrained encoder followed by a one-layer BiLSTM and a linear tag head.

    The LSTM and classifier widths are taken from the encoder's configuration
    instead of being fixed at 1024, so checkpoints with another hidden size
    work as well.

    Args:
        model_name: Checkpoint name or path handed to
            ``RobertaModel.from_pretrained``.
        tagset_size: Number of output classes per token.
    """

    def __init__(self, model_name, tagset_size):
        super().__init__()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.bert = RobertaModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        lstm_hidden = hidden_size // 2
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=lstm_hidden,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        # A bidirectional LSTM concatenates both directions.
        self.fc = nn.Linear(2 * lstm_hidden, tagset_size)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return per-token tag scores.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Accepted for call-site compatibility; not used.

        Returns:
            The output of the linear head applied to the BiLSTM output; its
            last dimension has ``tagset_size`` entries.
        """
        x = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
            ).last_hidden_state
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x

In [None]:
def get_token_idx(word_ids):
    """Return the position of the first sub-token of every word.

    Args:
        word_ids: For each token position, the index of the word it belongs
            to, with ``-1`` marking positions that belong to no word.

    Returns:
        List of positions, in increasing order, at which a word id other
        than ``-1`` occurs for the first time.
    """
    seen = set()  # O(1) membership instead of re-scanning the prefix
    first_positions = []
    for position, word_id in enumerate(word_ids):
        if word_id == -1 or word_id in seen:
            continue
        seen.add(word_id)
        first_positions.append(position)
    return first_positions

In [None]:
def predict_bert(model, dataset, idx_to_tag, filename):
    """Tag every sentence of ``dataset`` and write the result to ``filename``.

    The model is switched to evaluation mode for the duration of the call
    (so dropout does not perturb the predictions) and its previous
    training/eval mode is restored afterwards.

    Args:
        model: Module returning per-token tag scores for
            ``input_ids`` / ``attention_mask``.
        dataset: Iterable of ``(words, encoding)`` pairs; the encoding
            provides ``'input_ids'``, ``'attention_mask'`` and ``'word_ids'``.
        idx_to_tag: Mapping from class index to tag string.
        filename: Output path.  One ``word<TAB>tag`` line is written per
            word, with an empty line after each sentence.
    """
    was_training = model.training
    model.eval()
    try:
        with open(filename, "w") as w, torch.no_grad():
            for words, data in tqdm(dataset):
                input_ids = data['input_ids'].to(device)
                attention_mask = data['attention_mask'].to(device)
                word_ids = data['word_ids']
                tag_scores = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    )
                tag_scores = tag_scores.argmax(dim=-1)
                # Keep only the prediction made at the first sub-token of each word.
                tags_id = torch.squeeze(tag_scores)[get_token_idx(word_ids)]
                tags = [idx_to_tag[int(i)] for i in tags_id]
                for i, y in zip(words, tags):
                    w.write(f"{i}\t{y}\n")
                w.write("\n")
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

## 4.3 Train loop

In [None]:
# Build the tagger, put it in training mode and move it to the selected device.
model = BERT_NER(MODEL_NAME, tagset_size)
model.train();
model.to(device);

In [None]:
# Token-level cross-entropy; with its default ignore_index of -100 the
# positions labelled -100 by the tokenisation helper do not contribute.
loss_function = nn.CrossEntropyLoss().to(device)
optimizer = AdamW(model.parameters(), lr=LR)

In [None]:
import shutil

# Dev-set F1 after every epoch that could be scored.
f1_list = []
for epoch in range(EPOCHS):
    # Make sure dropout is active for training, whatever mode the
    # end-of-epoch prediction left the model in.
    model.train()
    for data in tqdm(train_dataloader):
        model.zero_grad()
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)
        # Batched tensors presumably arrive as (batch, 1, seq) because every
        # item keeps its own leading axis; drop that middle axis.
        dim0 = input_ids.shape[0]
        dim1 = input_ids.shape[2]
        output = model(
            input_ids=input_ids.reshape(dim0, dim1),
            attention_mask=attention_mask.reshape(dim0, dim1),
            labels=labels
            )
        # The loss expects the class axis in second position.
        loss = loss_function(torch.permute(output, (0, 2, 1)), labels)
        loss.backward()
        optimizer.step()
    predict_bert(model, valid_dataset, idx_to_tag, filename=temp_path)
    try:
        f1_avg_strict = evaluate_f1(data_valid_path, temp_path, 'results_ans.txt', True)
        filename_dev = 'dev_' + str(f1_avg_strict) + '.tsv'
        # The dev predictions that were just scored are already on disk;
        # copy them instead of running a second inference pass.
        shutil.copyfile(temp_path, filename_dev)
        filename_test = 'test_' + str(f1_avg_strict) + '.tsv'
        predict_bert(model, test_dataset, idx_to_tag, filename=filename_test)
        f1_list.append(f1_avg_strict)
    except Exception as exc:
        # Best effort: keep training, but report what went wrong.
        print(f"Epoch {epoch}: evaluation/export failed: {exc!r}")

In [None]:
plt.plot(f1_list)
plt.show()

In [None]:
import gc
# Collect unreachable Python objects first so that the CUDA blocks they held
# become unused and can then be released by empty_cache().
gc.collect()
torch.cuda.empty_cache()

# 5. BERT + BiLSTM + CRF

In [None]:
# Tag vocabulary for the CRF variant: the seven task tags plus START and STOP,
# numbered in the order listed.
tag_to_ix = {
    tag: ix
    for ix, tag in enumerate(
        (
            "O",
            "B-Object",
            "I-Object",
            "B-Aspect",
            "I-Aspect",
            "B-Predicate",
            "I-Predicate",
            "START",
            "STOP",
        )
    )
}

In [None]:
# Inverse lookup: class index -> tag string.
idx_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}

In [None]:
# Number of classes, now including START and STOP.
tagset_size = len(tag_to_ix)

In [None]:
def tokenize_and_align_labels_crf(words, tags, tokenizer, tag_to_ix):
    """Tokenise a pre-split sentence and build dense labels for the CRF model.

    Every sub-token receives the tag of its word, positions without a word
    (special and padding tokens) receive ``STOP``, and position 0 is then
    overwritten with ``START``.

    Args:
        words: The words of one sentence.
        tags: One tag per word; when ``tags[0]`` is an empty string no
            ``"labels"`` entry is produced.
        tokenizer: Callable tokenizer whose result exposes ``word_ids()``.
        tag_to_ix: Mapping from tag string to class index; must contain
            ``'START'`` and ``'STOP'``.

    Returns:
        The tokenizer output, extended with ``"word_ids"`` (``None`` replaced
        by ``-1``) and, for labelled input, ``"labels"`` as a ``LongTensor``.
    """
    # The padding strategy follows the module-level BATCH_SIZE.
    padding = 'max_length' if BATCH_SIZE > 1 else 'longest'

    encoding = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors='pt',
        max_length=128,
        padding=padding
    )
    raw_word_ids = encoding.word_ids()

    def to_index(tag):
        # String tags go through the vocabulary; anything else is kept as is.
        return tag_to_ix[tag] if isinstance(tag, str) else tag

    if tags[0] != '':
        dense = [
            tag_to_ix['STOP'] if w is None else to_index(tags[w])
            for w in raw_word_ids
        ]
        dense[0] = tag_to_ix['START']
        encoding["labels"] = torch.LongTensor(dense)

    encoding["word_ids"] = [-1 if w is None else w for w in raw_word_ids]
    return encoding

In [None]:
class NERDatasetCRF(Dataset):
    """Dataset of sentences tokenised lazily with CRF-style dense labels.

    ``data`` is an iterable of ``(words, tags)`` pairs.  With a truthy
    ``train`` an item is the encoding alone; otherwise it is the pair
    ``(words, encoding)``.
    """

    def __init__(self, data, tokenizer, tag_to_ix, train):
        texts, labels = zip(*data)
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.tag_to_ix = tag_to_ix
        self.train = train

    def __len__(self):
        """Return the number of sentences."""
        return len(self.labels)

    def __getitem__(self, index):
        """Tokenise sentence ``index`` and return it in the configured form."""
        words = self.texts[index]
        encoded = tokenize_and_align_labels_crf(
            words,
            self.labels[index],
            self.tokenizer,
            self.tag_to_ix,
        )
        return encoded if self.train else (words, encoded)

In [None]:
class BERT_NER_CRF(nn.Module):
    """Pretrained encoder, one-layer BiLSTM, linear emissions and a CRF layer.

    Layer widths are taken from the encoder's configuration instead of being
    fixed at 1024.

    Args:
        model_name: Checkpoint name or path handed to
            ``RobertaModel.from_pretrained``.
        tagset_size: Number of CRF tags.
    """

    def __init__(self, model_name, tagset_size):
        super().__init__()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.bert = RobertaModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        lstm_hidden = hidden_size // 2
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=lstm_hidden,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        # A bidirectional LSTM concatenates both directions.
        self.fc = nn.Linear(2 * lstm_hidden, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the CRF loss when ``labels`` is given, else the decoded tags.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask; also used as the CRF mask.
            labels: Optional gold tag indices.  A 1-D tensor is treated as a
                single sequence and gets a leading batch axis.

        Returns:
            With ``labels``: the negated value returned by the CRF layer.
            Without: the result of ``CRF.decode`` for the masked positions.
        """
        x = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
            ).last_hidden_state
        x, _ = self.lstm(x)
        emissions = self.fc(x)
        # Convert on the tensor's own device: no CPU round trip and no
        # dependency on a module-level `device`.
        mask = attention_mask.bool()

        if labels is None:
            # Decoding is only needed at inference time.
            return self.crf.decode(emissions, mask=mask)

        if labels.dim() == 1:
            labels = labels.unsqueeze(0)
        return -self.crf(emissions, labels, mask=mask)

In [None]:
def get_token_idx(word_ids):
    """Return the position of the first sub-token of every word.

    Args:
        word_ids: For each token position, the index of the word it belongs
            to, with ``-1`` marking positions that belong to no word.

    Returns:
        List of positions, in increasing order, at which a word id other
        than ``-1`` occurs for the first time.
    """
    seen = set()  # O(1) membership instead of re-scanning the prefix
    first_positions = []
    for position, word_id in enumerate(word_ids):
        if word_id == -1 or word_id in seen:
            continue
        seen.add(word_id)
        first_positions.append(position)
    return first_positions

In [None]:
def predict_bert_crf(model, dataset, idx_to_tag, filename):
    """Decode every sentence of ``dataset`` and write the tags to ``filename``.

    The model is switched to evaluation mode for the duration of the call
    (so dropout does not perturb the predictions) and its previous
    training/eval mode is restored afterwards.

    Args:
        model: Module that, called without labels, returns the decoded tag
            index sequences for the given ``input_ids`` / ``attention_mask``.
        dataset: Iterable of ``(words, encoding)`` pairs; the encoding
            provides ``'input_ids'``, ``'attention_mask'`` and ``'word_ids'``.
        idx_to_tag: Mapping from class index to tag string.
        filename: Output path.  One ``word<TAB>tag`` line is written per
            word, with an empty line after each sentence.
    """
    was_training = model.training
    model.eval()
    try:
        with open(filename, "w") as w, torch.no_grad():
            for words, data in tqdm(dataset):
                input_ids = data['input_ids'].to(device)
                attention_mask = data['attention_mask'].to(device)
                word_ids = data['word_ids']
                tag_scores = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    )
                # Keep only the tag decoded at the first sub-token of each word.
                tags_id = torch.squeeze(torch.LongTensor(tag_scores))[get_token_idx(word_ids)]
                tags = [idx_to_tag[int(i)] for i in tags_id]
                for i, y in zip(words, tags):
                    w.write(f"{i}\t{y}\n")
                w.write("\n")
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

In [None]:
# Tokenizer matching the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Training split: train=True makes each item the bare encoding.
train_dataset = NERDatasetCRF(
    data=training_data,
    tokenizer=tokenizer,
    tag_to_ix=tag_to_ix,
    train=True
)

In [None]:
# Dev split: train=False makes each item a (words, encoding) pair.
valid_dataset = NERDatasetCRF(
    data=valid_data,
    tokenizer=tokenizer,
    tag_to_ix=tag_to_ix,
    train=False
)

In [None]:
# Test split: train=False makes each item a (words, encoding) pair.
test_dataset = NERDatasetCRF(
    data=test_data,
    tokenizer=tokenizer,
    tag_to_ix=tag_to_ix,
    train=False
)

In [None]:
# Build the CRF tagger; tagset_size here counts START and STOP as well.
model = BERT_NER_CRF(MODEL_NAME, tagset_size)

In [None]:
# Training mode, then move the parameters to the selected device.
model.train();
model.to(device);

In [None]:
# Fresh optimizer bound to the CRF model's parameters.
optimizer = AdamW(model.parameters(), lr=LR)

In [None]:
import shutil

# Dev-set F1 after every epoch that could be scored.
f1_list = []
for epoch in range(EPOCHS):
    # Make sure dropout is active for training, whatever mode the
    # end-of-epoch prediction left the model in.
    model.train()
    # The dataset is iterated directly: one sentence per step, in file order.
    for data in tqdm(train_dataset):
        model.zero_grad()
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)
        # With labels the model returns the loss itself.
        loss = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
            )
        loss.backward()
        optimizer.step()
    predict_bert_crf(model, valid_dataset, idx_to_tag, filename=temp_path)
    try:
        f1_avg_strict = evaluate_f1(data_valid_path, temp_path, 'results_ans.txt', True)
        filename_dev = 'dev_' + str(f1_avg_strict) + '.tsv'
        # The dev predictions that were just scored are already on disk;
        # copy them instead of running a second inference pass.
        shutil.copyfile(temp_path, filename_dev)
        filename_test = 'test_' + str(f1_avg_strict) + '.tsv'
        predict_bert_crf(model, test_dataset, idx_to_tag, filename=filename_test)
        f1_list.append(f1_avg_strict)
    except Exception as exc:
        # Best effort: keep training, but report what went wrong.
        print(f"Epoch {epoch}: evaluation/export failed: {exc!r}")

In [None]:
# Manual check: take the first test item as (words, encoding).
words, data = test_dataset[0]

In [None]:
# Token ids of that item, moved to the selected device.
input_ids = data['input_ids'].to(device)

In [None]:
# Matching attention mask, moved to the selected device.
attention_mask = data['attention_mask'].to(device)

In [None]:
# Word index for every token position.
word_ids = data['word_ids']

In [None]:
# NOTE(review): the tokenisation helper already replaces None with -1, so this
# looks like a no-op here — confirm before relying on it.
word_ids = [i if i is not None else -1 for i in word_ids]

In [None]:
# Called without labels, the CRF model returns its decoded tag sequences.
# NOTE(review): this runs outside torch.no_grad() and the model was last put
# in train mode — confirm that is intended for a manual check.
tag_scores = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    )

In [None]:
# Keep the tag index decoded at the first sub-token of each word.
tags_id = torch.squeeze(torch.LongTensor(tag_scores))[get_token_idx(word_ids)]

In [None]:
# Map the selected class indices back to tag strings.
tags = [idx_to_tag[int(i)] for i in tags_id]