In [None]:
# Prepare packages on remote machine
# !pip install datasets
# !pip install seqeval
!pip install transformers
!pip install bertviz

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 2.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 17.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 22.6MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25

In [None]:
# Import preprocessed data
from pathlib import Path
import re
import random
from random import randint

random.seed(1217)

def read_data(file_path):
    """Read a tab-separated token/tag file into per-document lists.

    Each non-separator line holds one ``token<TAB>tag`` pair. Documents are
    separated by a blank line, which may contain a single tab character.

    Args:
        file_path: Location of the file (``str`` or ``Path``).

    Returns:
        A ``(token_docs, tag_docs)`` tuple of parallel lists: ``token_docs[i]``
        holds the tokens of document ``i`` and ``tag_docs[i]`` the tags on the
        same lines. Both lists are empty when the file has no content.

    Raises:
        ValueError: If a line does not split into exactly two tab-separated
            fields.
    """
    # Decode explicitly instead of relying on the platform default encoding,
    # which is not UTF-8 everywhere (assumes the corpus is stored as UTF-8).
    raw_text = Path(file_path).read_text(encoding='utf-8').strip()

    # re.split on an empty string yields [''], which would fail the
    # token/tag unpacking below, so an empty file is handled up front.
    if not raw_text:
        return [], []

    token_docs = []
    tag_docs = []
    for doc in re.split(r'\n\t?\n', raw_text):
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

sent_texts, sent_tags = read_data('drive/MyDrive/master-thesis/data/batched.txt')

# Concatenate runs of 3-7 consecutive sentences into one document each. The run
# length is drawn at random so that the sentence-final PERIOD is not given away
# "for free" by a fixed document size.
nb_sentences = len(sent_texts)
texts, tags = [], []
i = 0
while i < nb_sentences:
    nb_rows = randint(3, 7)
    doc_tokens = []
    for sent_tokens in sent_texts[i:i + nb_rows]:
        doc_tokens.extend(sent_tokens)
    doc_tags = []
    for sent_labels in sent_tags[i:i + nb_rows]:
        doc_tags.extend(sent_labels)
    texts.append(doc_tokens)
    tags.append(doc_tags)
    i += nb_rows

# Alternative: evaluate on single sentences instead of grouped documents.
# texts = sent_texts
# tags = sent_tags

# Debugging: work on a small subset.
# texts = texts[:100]
# tags = tags[:100]

print("Number of docs:", len(texts), "(tags:", len(tags), ")")
# Peek at a few aligned tokens and tags from the second document.
sample = slice(10, 17)
print(texts[1][sample], tags[1][sample])

Number of docs: 3968 (tags: 3968 )
['tillbaka', 'konkurrerande', 'snustillverkares', 'produkter', 'genom', 'att', 'hindra'] ['EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY']


In [None]:
# Split data
from sklearn.model_selection import train_test_split
# Only the held-out 20% is used in this notebook; shuffle=False keeps the
# documents in their original order.
_train_texts, val_texts, _train_tags, val_tags = train_test_split(
    texts, tags, test_size=.2, shuffle=False
)

# The tag inventory is fixed by hand so that ids stay stable between runs
# (deriving it from a set, as below, would not guarantee an order).
# unique_tags = set(tag for doc in tags for tag in doc)
unique_tags = ['PERIOD', 'EMPTY', 'COMMA', 'QUESTION']
print(val_texts[0])

# Lookup tables in both directions between tag names and integer ids.
tag2id = dict(zip(unique_tags, range(len(unique_tags))))
id2tag = dict(enumerate(unique_tags))
print(id2tag)

['vår', 'ambition', 'är', 'att', 'ännu', 'starkare', 'knyta', 'samman', 'utbildning', 'forskning', 'och', 'samverkan', 'i', 'kreativa', 'kunskapsmiljöer', 'på', 'så', 'sätt', 'kan', 'vi', 'på', 'basis', 'av', 'starka', 'akademiska', 'miljöer', 'ta', 'oss', 'an', 'och', 'finna', 'lösningar', 'på', 'de', 'samhällsutmaningar', 'vi', 'står', 'inför', 'detta', 'är', 'ett', 'dynamiskt', 'arbete', 'där', 'alla', 'delar', 'inom', 'universitetet', 'samverkar']
{0: 'PERIOD', 1: 'EMPTY', 2: 'COMMA', 3: 'QUESTION'}


In [None]:
# Tokenize data
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')

# First pass: encode each document on its own (special tokens included) just
# to measure its length in word pieces; documents of 512+ tokens are left out.
encoded_texts = []
for sent in val_texts:  # sent = one pre-split document
    ids = tokenizer.encode(sent, add_special_tokens=True, is_split_into_words=True)
    if len(ids) < 512:
        encoded_texts.append(ids)

# Longest remaining document determines the padded width.
max_len = max(map(len, encoded_texts))
print('Max length: ', max_len)

# Second pass: encode all documents as one padded batch, keeping the offset
# mapping that is later used to align word-level tags with word pieces.
# NOTE(review): documents that were left out above are still encoded here and
# get truncated to max_len rather than dropped - confirm this is intended.
val_encodings = tokenizer(
    val_texts,
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
    max_length=max_len,
)

print(val_encodings.input_ids[0])
print(len(val_encodings.input_ids))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=491.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=399162.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=48.0, style=ProgressStyle(description_w…


Max length:  286
[2, 671, 13276, 54, 48, 1306, 7146, 20095, 1355, 1861, 4574, 36, 9311, 31, 23427, 35044, 36804, 68, 181, 692, 178, 186, 68, 30231, 65, 4296, 19095, 14029, 557, 760, 142, 36, 5104, 8004, 68, 102, 14012, 145, 399, 1318, 186, 1195, 1148, 654, 54, 137, 46676, 1497, 256, 440, 2288, 653, 11867, 40938, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
import numpy as np

def encode_tags(tags, encodings, tag_to_id=None):
    """Align word-level tags with the tokenizer's word-piece positions.

    A position receives a real label only when its offset pair starts at 0 and
    ends at a non-zero value; every other position is set to -100.

    Args:
        tags: One list of tag names per document.
        encodings: Object exposing ``offset_mapping``: for each document, a
            sequence of ``(start, end)`` pairs, one per encoded position.
        tag_to_id: Optional mapping from tag name to integer id. Defaults to
            the notebook-level ``tag2id`` table.

    Returns:
        One list of integer labels per document, each as long as that
        document's offset mapping.

    Raises:
        KeyError: If a tag is missing from the mapping.
        ValueError: If a document has fewer tags than labelled positions.
    """
    if tag_to_id is None:
        tag_to_id = tag2id

    encoded_labels = []
    for doc_tags, doc_offset in zip(tags, encodings.offset_mapping):
        doc_labels = [tag_to_id[tag] for tag in doc_tags]

        # reshape keeps the column indexing below valid for an empty mapping
        arr_offset = np.array(doc_offset).reshape(-1, 2)

        # start from an array filled with -100
        doc_enc_labels = np.full(len(doc_offset), -100, dtype=int)

        # positions whose first offset is 0 and whose second offset is not 0
        is_word_start = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)

        # If the encoding holds fewer such positions than there are tags (e.g.
        # the document was truncated), keep only the leading tags instead of
        # failing on a shape mismatch.
        nb_kept = int(is_word_start.sum())
        doc_enc_labels[is_word_start] = doc_labels[:nb_kept]
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

val_labels = encode_tags(val_tags, val_encodings)

In [None]:
%%time
from transformers import AutoModel
import torch.nn as nn
from transformers.modeling_outputs import TokenClassifierOutput

class prestoBERT(nn.Module):
    """Token classifier: pretrained BERT encoder, dropout, then a linear layer.

    The encoder is loaded from the 'KB/bert-base-swedish-cased' checkpoint and
    the linear layer maps each position's hidden state to ``num_labels`` scores.
    """

    def __init__(self, num_labels):
        """Build the encoder and the classification head.

        Args:
            num_labels: Number of output classes per token.
        """
        super().__init__()
        self.num_labels = num_labels

        # Pass the constructor argument (not a notebook global) so the class
        # works on its own and the config agrees with the classifier size.
        self.bert = AutoModel.from_pretrained('KB/bert-base-swedish-cased', num_labels=num_labels)
        self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.bert.config.hidden_size, self.num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the encoder and classify every position.

        All arguments except ``labels`` are forwarded unchanged to the encoder.

        Args:
            labels: Optional tensor of target class ids; when given, a
                cross-entropy loss is computed. It is flattened alongside the
                logits, so it is assumed to match the (batch, sequence) layout
                of the input - TODO confirm against the training code.
            return_dict: When None, falls back to the encoder config's
                ``use_return_dict``.

        Returns:
            A ``TokenClassifierOutput`` (loss, logits, hidden_states,
            attentions) when ``return_dict`` is truthy; otherwise a tuple of
            ``(logits, *extra_encoder_outputs)``, prefixed with the loss when
            ``labels`` was given.
        """
        return_dict = return_dict if return_dict is not None else self.bert.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First encoder output: one hidden state per position.
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            if attention_mask is not None:
                # Positions outside the attention mask get the loss function's
                # ignore_index so they do not contribute to the loss. Using a
                # tensor method keeps this cell free of a dependency on the
                # `torch` module name, which it does not import.
                is_inactive = attention_mask.view(-1) != 1
                flat_labels = flat_labels.masked_fill(is_inactive, loss_fct.ignore_index)
            loss = loss_fct(flat_logits, flat_labels)

        if not return_dict:
            # Skip the first two encoder outputs; keep any extras that follow.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

CPU times: user 156 ms, sys: 16 ms, total: 172 ms
Wall time: 186 ms


In [None]:
# Load fine-tuned model
from bertviz import head_view, model_view
from transformers import AutoTokenizer
import torch

unique_tags = ['PERIOD', 'EMPTY', 'COMMA', 'QUESTION']
# Size the classification head from the tag list rather than a literal 4.
model = prestoBERT(len(unique_tags))
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only runtime; load_state_dict then copies the values into the model.
state_dict = torch.load('drive/MyDrive/master-thesis/pytorch_model.bin', map_location='cpu')
model.load_state_dict(state_dict)
# A freshly constructed nn.Module is in training mode; switch to evaluation
# mode so the dropout layer in front of the classifier is disabled.
model.eval()
tokenizer = AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501379977.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at KB/bert-base-swedish-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Encode every validation document on its own (a batch of one, so no padding
# is added). With truncation=True and max_length=None the tokenizer is left to
# apply the model's own maximum input length.
print(val_texts[0])
encoded_texts = [
    tokenizer.encode_plus(
        sent,  # sent = one pre-split document
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=None,
        is_split_into_words=True,
    )
    for sent in val_texts
]

# TODO: try fewer documents when debugging
# encoded_texts = encoded_texts[:2]
print("nb sentences:", len(encoded_texts))

# Per-document flag: 1 = include in the evaluation, 0 = ignore because the
# encoded document has 512 or more tokens.
ignore_onehot = np.ones(len(encoded_texts))
for i, encoded_text in enumerate(encoded_texts):
    if len(encoded_text['input_ids'][0]) >= 512:
        ignore_onehot[i] = 0

# Evaluation mode switches off dropout (otherwise the predictions are noisy),
# and no_grad avoids building an autograd graph for every document.
model.eval()
predictions_per_sent = []
with torch.no_grad():
    for encoded_text in encoded_texts:
        input_ids = encoded_text['input_ids']
        logits = model(input_ids).logits

        # Highest-scoring class per position. The slice drops the first and
        # last position (CLS and SEP) exactly once.
        tag_ids = logits.argmax(dim=2).ravel()[1:-1]
        predictions_per_sent.append([unique_tags[int(tag_id)] for tag_id in tag_ids])

# print(predictions[:3])
# print(input_ids)
# print(val_labels[0][:3])
# print(len(sent_tags))

['vår', 'ambition', 'är', 'att', 'ännu', 'starkare', 'knyta', 'samman', 'utbildning', 'forskning', 'och', 'samverkan', 'i', 'kreativa', 'kunskapsmiljöer', 'på', 'så', 'sätt', 'kan', 'vi', 'på', 'basis', 'av', 'starka', 'akademiska', 'miljöer', 'ta', 'oss', 'an', 'och', 'finna', 'lösningar', 'på', 'de', 'samhällsutmaningar', 'vi', 'står', 'inför', 'detta', 'är', 'ett', 'dynamiskt', 'arbete', 'där', 'alla', 'delar', 'inom', 'universitetet', 'samverkar']
nb sentences: 794


In [None]:
# Counts are indexed as confusion_matrix[predicted_tag][true_tag]; the key
# order below fixes the row and column order of the printed matrix.
matrix_tags = ["PERIOD", "COMMA", "QUESTION", "EMPTY"]
confusion_matrix = {
    pred_name: {true_name: 0 for true_name in matrix_tags}
    for pred_name in matrix_tags
}

# Sanity check on the first document: the number of words to predict should
# equal the number of labels that are not -100.
input_id_list = encoded_texts[0]["input_ids"][0].tolist()
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
nb_spliced = len([tok for tok in tokens if "##" in tok])
print(len(tokens))
print("nb spliced:", nb_spliced)
# Tokens minus the "##" pieces counted above, minus the two special tokens
# (CLS and SEP). Computed rather than hard-coded so it holds for any document.
print("nb to predict:", len(tokens)-nb_spliced-2)
print("non-100 labels:", len([lbls for lbls in val_labels[0] if lbls != -100]))
print((val_tags[0]))
print((val_labels[0]))
print(len(predictions_per_sent[0]))

for i, predictions in enumerate(predictions_per_sent):
    if ignore_onehot[i] == 0:
        # document was flagged as too long
        continue
    last_word = len(val_tags[i]) - 1
    k = 0  # index of the current word in val_tags[i]
    for j, pred_tag in enumerate(predictions):
        # predictions has the leading CLS position removed, so position j
        # corresponds to label position j+1; -100 marks positions without a tag
        if val_labels[i][j+1] == -100:
            continue
        true_tag = val_tags[i][k]
        # Try to exclude the last period since it's free
        if k != last_word:
            confusion_matrix[pred_tag][true_tag] += 1
        k += 1

# One line per predicted tag, formatted as "{a, b, c, d},"
for row in confusion_matrix.values():
    print("{" + ", ".join(str(count) for count in row.values()) + "},")

55
nb spliced: 4
nb to predict: 49
non-100 labels: 49
['EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'COMMA', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'PERIOD', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'PERIOD', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'PERIOD']
[-100, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0, -100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -100, -100, -100, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,

In [None]:
# model_view(attention, full_tokens)

In [None]:
# head_view(attention, full_tokens)