In [5]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [18]:
import json
import re
from os.path import join, exists

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

## Downloading Data

In [11]:
path_train = 'https://raw.githubusercontent.com/profii/srl_transformers/main/dataset/train.tsv'


In [12]:
# quoting=3 (csv.QUOTE_NONE) keeps quote characters as ordinary tokens, and
# skip_blank_lines=False keeps blank lines as NaN rows, which are then replaced
# by the '_nan' sentinel that marks sentence boundaries.
df = (
    pd.read_csv(
        path_train,
        sep='\t',
        header=None,
        names=['data', 'label'],
        quoting=3,
        skip_blank_lines=False,
    )
    .fillna('_nan')
)

In [13]:
df.head(22)

Unnamed: 0,data,label
0,also,O
1,",",O
2,i,O
3,have,O
4,recently,O
5,discovered,O
6,advil,B-Object
7,liquigels,O
8,work,O
9,much,O


In [14]:
df.shape

(63408, 2)

In [15]:
df.data[592]

'"'

## Preprocessing

    - Split the token stream into sentences at empty lines (loaded as NaN).
    - Replace every punctuation character with a dot and collapse runs of dots into a single dot.
    - Map the string labels to the integer ids [0, 1, 2, 3, 4, 5, 6].

In [16]:
# Separating data into sentences with empty lines (NaN)

def separate_text(df):
    """Split a token-per-row frame into sentences and normalise punctuation.

    Each row of ``df`` is read as a ``(word, tag)`` pair. A row whose word is
    the sentinel ``'_nan'`` closes the current sentence. In every other word,
    each punctuation character is replaced by a dot and runs of dots are
    collapsed into a single dot.

    Args:
        df: two-column DataFrame; the first column holds tokens, the second
            holds their labels.

    Returns:
        ``(sents, tags)``: two parallel lists of lists, one inner list of
        cleaned tokens / labels per sentence.
    """
    # Compiled once instead of once per token.
    punctuation = re.compile(r"[\"\—\#\$\%\&\'\(\)\*\+\,\–\-\/\:\;\<\=\>\?\@\[\\\]\^\?\!\_\`\{\|\}\~\«\»ѣ\№]")
    dot_run = re.compile(r"[.]+")

    sents = []
    tags = []
    sentence = []
    label = []

    for word, tag in df.values:
        if word == '_nan':
            sents.append(sentence)
            tags.append(label)
            sentence = []
            label = []
        else:
            word = punctuation.sub(".", word)
            word = dot_run.sub(".", word)
            sentence.append(word)
            label.append(tag)

    # Without this, the last sentence is lost whenever the data does not end
    # with a separator row.
    if sentence:
        sents.append(sentence)
        tags.append(label)

    return sents, tags

In [21]:
# Label string -> integer id, in the order the ids are assigned.
labels_to_ids = {
    'O': 0,
    'B-Object': 1,
    'I-Object': 2,
    'B-Aspect': 3,
    'I-Aspect': 4,
    'B-Predicate': 5,
    'I-Predicate': 6,
}
# Inverse lookup: integer id -> label string.
ids_to_labels = {idx: name for name, idx in labels_to_ids.items()}

# Apply the sentence splitting and punctuation cleaning to df
sents, tags = separate_text(df)

In [22]:
labels_to_ids

{'O': 0,
 'B-Object': 1,
 'I-Object': 2,
 'B-Aspect': 3,
 'I-Aspect': 4,
 'B-Predicate': 5,
 'I-Predicate': 6}

In [2]:
# sents[0], tags[0]

In [24]:
def align_label(texts, labels, max_length=150, label_all_tokens=True):
    """Tokenize pre-split sentences and attach a label id to every token.

    Relies on the module-level ``tokenizer`` and ``labels_to_ids``.

    Args:
        texts: batch of sentences, each already split into words.
        labels: per-sentence lists of label strings, indexed by word position.
        max_length: length the tokenizer pads every sequence to.
        label_all_tokens: when True, every token of a word gets the word's
            label id; when False, only the first token of each word does and
            the remaining tokens get -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of label ids per sentence. Positions with no source word (word id
        ``None``) are labelled -100.
    """
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=max_length, is_split_into_words=True)

    batch_label_ids = []
    for batch_index, word_labels in enumerate(labels):
        sample_label_ids = []
        last_word = None
        for word in tokenized_inputs.word_ids(batch_index=batch_index):
            if word is None:
                # No source word behind this position.
                label_id = -100
            elif word == last_word and not label_all_tokens:
                # Continuation token of a word that has already been labelled.
                label_id = -100
            else:
                label_id = labels_to_ids[word_labels[word]]
            sample_label_ids.append(label_id)
            last_word = word
        batch_label_ids.append(sample_label_ids)

    tokenized_inputs["labels"] = batch_label_ids
    return tokenized_inputs

In [25]:
FILE_VOCAB = "vocab.json"
FILE_TAGS = "tags.json"
FILE_DATASET = "dataset.txt"
FILE_DATASET_CACHE = "dataset_cache_{}.npz"


class Preprocessor:
    """Load vocabulary/tag configs and turn a tagged corpus into index tensors.

    After construction:
        vocab / vocab_dict: token list and token -> index map. Index 0 is the
            padding token, loaded tokens occupy 1..N, and the out-of-vocabulary
            token takes the last index.
        tags / tags_dict: tag list and tag -> index map (0-based).
        PAD_IDX / OOV_IDX: indices of the padding and out-of-vocabulary tokens.

    Relies on ``PAD``, ``OOV``, ``load_json_file`` and ``save_json_file`` being
    defined elsewhere in the notebook.
    """

    def __init__(self, config_dir, save_config_dir=None, verbose=True):
        """Read the vocab and tag lists from ``config_dir``.

        Args:
            config_dir: directory containing ``vocab.json`` and ``tags.json``.
            save_config_dir: if given, the loaded lists are also written there
                (before the padding/OOV entries are added).
            verbose: print progress messages.

        Raises:
            ValueError: if one of the config files is missing.
        """
        self.config_dir = config_dir
        self.verbose = verbose

        self.vocab, self.vocab_dict = self.__load_list_file(FILE_VOCAB, offset=1, verbose=verbose)
        self.tags, self.tags_dict = self.__load_list_file(FILE_TAGS, verbose=verbose)
        if save_config_dir:
            self.__save_config(save_config_dir)

        self.PAD_IDX = 0
        self.__adjust_vocab()
        # Loaded tokens already occupy 1..N, so the OOV index must be read
        # after the vocabulary is extended; len(vocab) taken beforehand equals
        # the index of the last real token. Dataset caches built with the old
        # index should be rebuilt.
        self.OOV_IDX = self.vocab_dict[OOV]

    def __load_list_file(self, file_name, offset=0, verbose=False):
        """Load a JSON list from ``config_dir``; return it with an element -> index map.

        ``offset`` is added to every index in the map.
        """
        file_path = join(self.config_dir, file_name)
        if not exists(file_path):
            raise ValueError('"{}" file does not exist.'.format(file_path))

        elements = load_json_file(file_path)
        elements_dict = {w: idx + offset for idx, w in enumerate(elements)}
        if verbose:
            print("config {} loaded".format(file_path))
        return elements, elements_dict

    def __adjust_vocab(self):
        """Put PAD at index 0 and append OOV as the last vocabulary entry."""
        self.vocab.insert(0, PAD)
        self.vocab_dict[PAD] = 0

        self.vocab.append(OOV)
        self.vocab_dict[OOV] = len(self.vocab) - 1

    def __save_config(self, dst_dir):
        """Write the vocab and tag lists to ``dst_dir``."""
        char_file = join(dst_dir, FILE_VOCAB)
        save_json_file(self.vocab, char_file)

        tag_file = join(dst_dir, FILE_TAGS)
        save_json_file(self.tags, tag_file)

        if self.verbose:
            print("tag dict file => {}".format(tag_file))
            print("vocab dict file => {}".format(char_file))

    @staticmethod
    def __cache_file_path(corpus_dir, max_seq_len):
        """Path of the cached dataset for the given sequence length."""
        return join(corpus_dir, FILE_DATASET_CACHE.format(max_seq_len))

    def load_dataset(self, corpus_dir, val_split, test_split, max_seq_len):
        """Load the corpus (from cache when available) and split it.

        The split is sequential: the first rows are the train set, followed by
        the validation rows and then the test rows.

        Args:
            corpus_dir: directory holding ``dataset.txt`` and/or its cache.
            val_split: fraction of samples used for validation.
            test_split: fraction of samples used for testing.
            max_seq_len: length every sequence is truncated/padded to.

        Returns:
            ``[(xs, ys), (xs, ys), (xs, ys)]`` for train, val and test, where
            ``xs`` and ``ys`` are tensors of shape ``[B, L]``.

        Raises:
            AssertionError: if the train or validation split would be empty.
        """
        ds_path = self.__cache_file_path(corpus_dir, max_seq_len)
        if not exists(ds_path):
            xs, ys = self.__build_corpus(corpus_dir, max_seq_len)
        else:
            print("loading dataset {} ...".format(ds_path))
            # The archive is read lazily; pull both arrays out before closing it.
            with np.load(ds_path) as dataset:
                xs, ys = dataset["xs"], dataset["ys"]

        xs, ys = map(torch.tensor, (xs, ys))

        # split the dataset
        total_count = len(xs)
        assert total_count == len(ys)
        val_count = int(total_count * val_split)
        test_count = int(total_count * test_split)
        train_count = total_count - val_count - test_count
        assert train_count > 0 and val_count > 0

        indices = np.cumsum([0, train_count, val_count, test_count])
        datasets = [(xs[s:e], ys[s:e]) for s, e in zip(indices[:-1], indices[1:])]
        print("datasets loaded:")
        for (xs_, ys_), name in zip(datasets, ["train", "val", "test"]):
            print("\t{}: {}, {}".format(name, xs_.shape, ys_.shape))
        return datasets

    def decode_tags(self, batch_tags):
        """Map a batch of tag-index sequences back to tag strings."""
        return [[self.tags[t] for t in tags] for tags in batch_tags]

    def sent_to_vector(self, sentence, max_seq_len=0):
        """Convert a sentence to vocabulary indices.

        Unknown tokens map to ``OOV_IDX``. With ``max_seq_len > 0`` the result
        is truncated or right-padded with ``PAD_IDX`` to exactly that length;
        otherwise it has the sentence's own length.
        """
        max_seq_len = max_seq_len if max_seq_len > 0 else len(sentence)
        vec = [self.vocab_dict.get(c, self.OOV_IDX) for c in sentence[:max_seq_len]]
        return vec + [self.PAD_IDX] * (max_seq_len - len(vec))

    def tags_to_vector(self, tags, max_seq_len=0):
        """Convert tag strings to indices, truncated/right-padded with 0.

        Raises:
            KeyError: if a tag is not in the loaded tag list.
        """
        max_seq_len = max_seq_len if max_seq_len > 0 else len(tags)
        vec = [self.tags_dict[c] for c in tags[:max_seq_len]]
        return vec + [0] * (max_seq_len - len(vec))

    def __build_corpus(self, corpus_dir, max_seq_len):
        """Parse ``dataset.txt`` into index arrays and cache them as ``.npz``.

        Each line must hold exactly two tab-separated fields: the sentence
        (decoded as JSON when it starts with ``[``, otherwise used as-is) and a
        JSON list of tags of the same length.

        Returns:
            ``(xs, ys)`` numpy arrays.

        Raises:
            ValueError: on a malformed line or a sentence/tags length mismatch.
        """
        file_path = join(corpus_dir, FILE_DATASET)
        xs, ys = [], []
        with open(file_path, encoding="utf8") as f:
            for idx, line in tqdm(enumerate(f), desc="parsing {}".format(file_path)):
                fields = line.strip().split("\t")
                if len(fields) != 2:
                    raise ValueError("format error in line {}, tabs count: {}".format(idx + 1, len(fields) - 1))

                sentence, tags = fields
                try:
                    if sentence[0] == "[":
                        sentence = json.loads(sentence)
                    tags = json.loads(tags)
                    # Validate before vectorising so a bad line is rejected
                    # before anything is appended.
                    if len(sentence) != len(tags):
                        raise ValueError('"sentence length({})" != "tags length({})" in line {}"'.format(
                            len(sentence), len(tags), idx + 1))
                    xs.append(self.sent_to_vector(sentence, max_seq_len=max_seq_len))
                    ys.append(self.tags_to_vector(tags, max_seq_len=max_seq_len))
                except Exception as e:
                    raise ValueError(
                        "exception raised when parsing line {}\n\t{}\n\t{}".format(idx + 1, line, e)) from e

        xs, ys = np.asarray(xs), np.asarray(ys)

        # save train set
        cache_file = self.__cache_file_path(corpus_dir, max_seq_len)
        np.savez(cache_file, xs=xs, ys=ys)
        print("dataset cache({}, {}) => {}".format(xs.shape, ys.shape, cache_file))
        return xs, ys

In [None]:
class BiLSTM(nn.Module):
    """Embedding -> bidirectional RNN -> CRF sequence tagger.

    Relies on a ``CRF`` class defined elsewhere in the notebook.

    Args:
        vocab_size: number of rows in the embedding table.
        tagset_size: number of tags passed to the CRF layer.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the concatenated forward/backward RNN output; each
            direction gets ``hidden_dim // 2`` units.
        num_rnn_layers: number of stacked RNN layers.
        rnn: ``"lstm"`` selects ``nn.LSTM``; any other value selects ``nn.GRU``.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_rnn_layers=1, rnn="lstm"):
        # The previous call named a non-existent class (BiRnnCrf) and raised
        # NameError on construction.
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tagset_size = tagset_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        RNN = nn.LSTM if rnn == "lstm" else nn.GRU
        self.rnn = RNN(embedding_dim, hidden_dim // 2, num_layers=num_rnn_layers,
                       bidirectional=True, batch_first=True)
        self.crf = CRF(hidden_dim, self.tagset_size)

    def __build_features(self, sentences):
        """Run the embedding and RNN over a batch of index sequences.

        Index 0 is treated as padding: ``masks`` is True wherever the index is
        greater than 0, and each sequence's length is its count of such
        positions (so padding is assumed to be trailing).

        Returns:
            ``(lstm_out, masks)``. ``lstm_out`` is batch-first and only as long
            as the longest sequence in the batch, while ``masks`` keeps the
            input's full width.
        """
        masks = sentences.gt(0)
        embeds = self.embedding(sentences.long())

        seq_length = masks.sum(1)
        # Lengths must live on the CPU even when the batch is on another
        # device. enforce_sorted=False makes packing handle the sort by length
        # and restores the original row order on unpacking.
        pack_sequence = pack_padded_sequence(
            embeds, lengths=seq_length.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.rnn(pack_sequence)
        lstm_out, _ = pad_packed_sequence(packed_output, batch_first=True)

        return lstm_out, masks

    def loss(self, xs, tags):
        """Return the CRF loss of ``tags`` given the input indices ``xs``."""
        features, masks = self.__build_features(xs)
        return self.crf.loss(features, tags, masks=masks)

    def forward(self, xs):
        """Return ``(scores, tag_seq)`` as produced by the CRF layer."""
        # Get the emission scores from the BiLSTM
        features, masks = self.__build_features(xs)
        scores, tag_seq = self.crf(features, masks)
        return scores, tag_seq