In [None]:
# Prepare packages on remote machine
!pip install datasets
!pip install seqeval
!pip install transformers
!pip install wandb

Collecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/54/90/43b396481a8298c6010afb93b3c1e71d4ba6f8c10797a7da8eb005e45081/datasets-1.5.0-py3-none-any.whl (192kB)
[K     |█▊                              | 10kB 24.1MB/s eta 0:00:01[K     |███▍                            | 20kB 18.9MB/s eta 0:00:01[K     |█████                           | 30kB 11.2MB/s eta 0:00:01[K     |██████▉                         | 40kB 9.4MB/s eta 0:00:01[K     |████████▌                       | 51kB 7.4MB/s eta 0:00:01[K     |██████████▏                     | 61kB 7.3MB/s eta 0:00:01[K     |████████████                    | 71kB 8.3MB/s eta 0:00:01[K     |█████████████▋                  | 81kB 8.8MB/s eta 0:00:01[K     |███████████████▎                | 92kB 9.1MB/s eta 0:00:01[K     |█████████████████               | 102kB 8.5MB/s eta 0:00:01[K     |██████████████████▊             | 112kB 8.5MB/s eta 0:00:01[K     |████████████████████▍           | 122kB 8.5MB/s et

In [None]:
# Setup environment

from tqdm import tqdm
import matplotlib.pyplot as plt
import wandb

# Authenticate against Weights & Biases (interactive prompt on first use).
wandb.login()

# Render matplotlib figures inline and register the TensorBoard magic.
%matplotlib inline
%load_ext tensorboard

# Export WANDB_LOG_MODEL for this kernel.
# NOTE(review): presumably read by the transformers W&B integration to upload
# the trained model as an artifact -- confirm against the installed version.
%env WANDB_LOG_MODEL=true

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


env: WANDB_LOG_MODEL=true


In [None]:
# Import preprocessed data
from pathlib import Path
import re
import random
from random import randint

# Seed Python's global RNG so the randint-based sentence grouping below is repeatable.
random.seed(1217)

def read_data(file_path, encoding="utf-8"):
    """Read a tab-separated token/tag file into parallel per-document lists.

    Every content line has the form ``token<TAB>tag``. Documents are separated
    by a blank line (which may contain a single tab). Blank lines inside a
    document and documents without any content line are skipped.

    Args:
        file_path: Location of the file (``str`` or ``Path``).
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII tokens are read the same way on every platform.

    Returns:
        A tuple ``(token_docs, tag_docs)`` of two equally long lists; element
        ``i`` of each holds the tokens / tags of document ``i``. An empty file
        yields ``([], [])``.

    Raises:
        ValueError: If a content line does not consist of exactly two
            tab-separated fields.
    """
    raw_text = Path(file_path).read_text(encoding=encoding).strip()

    token_docs = []
    tag_docs = []
    for doc in re.split(r'\n\t?\n', raw_text):
        tokens = []
        tags = []
        for line in doc.split('\n'):
            # Repeated separators leave empty lines behind; they carry no data.
            if not line.strip():
                continue
            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    f"Expected 'token<TAB>tag' but got {line!r} in {file_path}"
                )
            token, tag = fields
            tokens.append(token)
            tags.append(tag)
        # Do not emit empty documents (e.g. for an empty input file).
        if tokens:
            token_docs.append(tokens)
            tag_docs.append(tags)

    return token_docs, tag_docs

# Load the sentence-level tokens and tags produced by the preprocessing step.
sent_texts, sent_tags = read_data('drive/MyDrive/master-thesis/data/batched.txt')

# Merge a random number (3 to 7) of consecutive sentences into each example,
# so that the position of PERIOD is not given away "for free".
texts, tags = [], []
nb_sentences = len(sent_texts)
i = 0
while i < nb_sentences:
    nb_rows = randint(3, 7)
    window = slice(i, i + nb_rows)

    merged_tokens = []
    for sentence in sent_texts[window]:
        merged_tokens.extend(sentence)
    texts.append(merged_tokens)

    merged_tags = []
    for sentence in sent_tags[window]:
        merged_tags.extend(sentence)
    tags.append(merged_tags)

    i += nb_rows

# Debugging aids: use the sentences ungrouped (texts = sent_texts; tags = sent_tags)
# or truncate the corpus (texts = texts[:100]; tags = tags[:100]).

print("Number of docs:", len(texts), "(tags:",len(tags),")")
print(texts[1][10:17], tags[1][10:17])

Number of docs: 3968 (tags: 3968 )
['tillbaka', 'konkurrerande', 'snustillverkares', 'produkter', 'genom', 'att', 'hindra'] ['EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY', 'EMPTY']


In [None]:
# Split data
from sklearn.model_selection import train_test_split

# shuffle=False keeps the documents in file order; the last 20% form the validation set.
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2, shuffle=False)

# Fixed, ordered label inventory. A set of strings has no stable iteration
# order across interpreter sessions (hash randomisation), so deriving the ids
# from a set makes the label ids of a saved model irreproducible. This order
# reproduces the mapping of the recorded run (PERIOD=0, EMPTY=1, COMMA=2, QUESTION=3).
unique_tags = ['PERIOD', 'EMPTY', 'COMMA', 'QUESTION']
print(unique_tags)
print(val_texts[0])

# Bidirectional lookup tables between tag names and integer class ids.
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

{'PERIOD', 'EMPTY', 'COMMA', 'QUESTION'}
['vår', 'ambition', 'är', 'att', 'ännu', 'starkare', 'knyta', 'samman', 'utbildning', 'forskning', 'och', 'samverkan', 'i', 'kreativa', 'kunskapsmiljöer', 'på', 'så', 'sätt', 'kan', 'vi', 'på', 'basis', 'av', 'starka', 'akademiska', 'miljöer', 'ta', 'oss', 'an', 'och', 'finna', 'lösningar', 'på', 'de', 'samhällsutmaningar', 'vi', 'står', 'inför', 'detta', 'är', 'ett', 'dynamiskt', 'arbete', 'där', 'alla', 'delar', 'inom', 'universitetet', 'samverkar']


In [None]:
# Export the validation split for the human baseline evaluation: one file with
# all tokens and one with all tags, each flattened and space-separated.
for export_name, export_docs in (("val_text_file.txt", val_texts), ("val_tags_file.txt", val_tags)):
    flattened = []
    for sublist in export_docs:
        flattened.extend(sublist)
    with open(export_name, 'w', encoding="utf-8") as output_file:
        output_file.write(' '.join(flattened))

# Trigger a browser download of both files (Colab only).
from google.colab import files
files.download('val_text_file.txt')
files.download('val_tags_file.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Tokenize data
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')

import numpy as np

# Concatenate train data and test data.
# Plain list concatenation is used on purpose: the documents have different
# lengths, and building a NumPy array from such ragged nested lists is
# deprecated and raises on current NumPy versions.
all_texts = list(train_texts) + list(val_texts)

# Encode our concatenated data
encoded_texts = [tokenizer.encode(sent, add_special_tokens=True, is_split_into_words=True) for sent in all_texts] # sent = one document
# Ignore documents that are too long for the model when determining the padding length
encoded_texts = [sent for sent in encoded_texts if len(sent) < 512]

# Find the maximum length
max_len = max(len(sent) for sent in encoded_texts)
print('Max length: ', max_len)

# Pad to the longest document of each split; truncate to keep within max_len.
# Offsets are requested because they are needed to align word-level tags with sub-tokens.
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True, max_length=max_len)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True, max_length=max_len)

print(len(train_encodings.input_ids[0]))
print(train_encodings.input_ids[0])
print(len(val_encodings.input_ids[0]))
print(val_encodings.input_ids[0])
print(len(train_encodings.input_ids),"+",len(val_encodings.input_ids))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=491.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=399162.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=48.0, style=ProgressStyle(description_w…






Max length:  501
501
[2, 8213, 326, 8545, 76, 48, 1448, 3689, 8424, 48, 690, 146, 137, 29465, 49795, 36, 4005, 17691, 18070, 76, 48, 248, 59, 27048, 758, 413, 68, 59, 2347, 66, 15098, 2556, 358, 1500, 1061, 54, 82, 67, 48, 9251, 137, 43184, 9657, 126, 275, 82, 48, 22392, 23806, 91, 68, 20997, 181, 16165, 82, 59, 522, 3616, 1248, 35332, 16106, 5250, 244, 2610, 1094, 6519, 2222, 137, 27855, 12565, 76, 4273, 327, 7463, 21444, 284, 36557, 168, 49871, 100, 49851, 365, 8380, 18, 243, 13064, 3724, 230, 102, 10146, 3337, 59, 281, 19072, 351, 10852, 76, 59, 5107, 5432, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
import numpy as np

def encode_tags(tags, encodings, tag_to_id=None):
    """Align word-level tags with the sub-token positions of tokenized documents.

    A position receives the label of its word when its offset pair is
    ``(0, n)`` with ``n != 0``; every other position receives ``-100``.
    With the tokenizer calls used in this notebook (``is_split_into_words=True``)
    that is presumably the first sub-token of each word, while special tokens,
    padding and continuation pieces are skipped -- TODO confirm for other tokenizers.

    Args:
        tags: One list of tag names per document.
        encodings: Object exposing ``offset_mapping``: one sequence of
            ``(start, end)`` pairs per document.
        tag_to_id: Optional mapping from tag name to integer id. Defaults to
            the notebook-level ``tag2id``.

    Returns:
        One list of integer labels per document, each as long as that
        document's offset mapping.

    Raises:
        KeyError: If a tag is missing from the mapping.
        ValueError: If a document has fewer tags than labelled positions.
    """
    if tag_to_id is None:
        tag_to_id = tag2id

    encoded_labels = []
    for doc_tags, doc_offset in zip(tags, encodings.offset_mapping):
        doc_labels = [tag_to_id[tag] for tag in doc_tags]

        # Start with every position ignored (-100).
        doc_enc_labels = np.full(len(doc_offset), -100, dtype=int)
        if len(doc_offset) == 0:
            encoded_labels.append(doc_enc_labels.tolist())
            continue

        arr_offset = np.array(doc_offset)
        # Labelled positions: first offset is 0 and the second is not 0.
        word_starts = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)

        # Truncation may have cut words off the end of the document; keep only
        # as many labels as there are surviving word starts instead of failing
        # with a shape mismatch.
        n_words = int(word_starts.sum())
        doc_enc_labels[word_starts] = doc_labels[:n_words]
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

# Build the sub-token aligned label ids for both splits (train first, then validation).
train_labels, val_labels = (
    encode_tags(train_tags, train_encodings),
    encode_tags(val_tags, val_encodings),
)

In [None]:
import torch

# Implement custom dataset class
class prestoDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-position label ids.

    Args:
        encodings: Mapping from field name to a per-example indexable
            collection (anything offering ``items()``).
        labels: Per-example label sequences; also defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its labels under ``'labels'``."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap encodings and aligned labels as datasets.
# NOTE(review): at this point the encodings still contain "offset_mapping"; the
# same two datasets are built again further down, after that key has been popped.
train_dataset = prestoDataset(train_encodings, train_labels)
val_dataset = prestoDataset(val_encodings, val_labels)
# test_dataset = prestoDataset(test_encodings, test_labels)

In [None]:
# Network class
from transformers import AutoModel
import torch.nn as nn
#from transformers.modeling_outputs import MultipleChoiceModelOutput

class prestoBERT(nn.Module):
    """Placeholder module that mirrors the dataset interface of ``prestoDataset``.

    NOTE(review): this definition holds encodings and labels only (no network
    layers) and is replaced by the later ``prestoBERT`` class defined further
    down in this notebook; it looks like a leftover and could be removed.

    Args:
        encodings: Mapping from field name to a per-example indexable collection.
        labels: Per-example label sequences; also defines the length.
    """

    def __init__(self, encodings, labels):
        # nn.Module must be initialised before use, otherwise module methods
        # such as parameters() or to() fail on the missing internal registries.
        super().__init__()
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its labels under ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

# offset_mapping was only needed to align the labels; we don't want to pass it to the model.
# A default is given to pop() so that re-running this cell does not raise KeyError.
train_encodings.pop("offset_mapping", None)
val_encodings.pop("offset_mapping", None)
train_dataset = prestoDataset(train_encodings, train_labels)
val_dataset = prestoDataset(val_encodings, val_labels)

In [None]:
%%time
from transformers import AutoModel
import torch.nn as nn
from transformers.modeling_outputs import TokenClassifierOutput

class prestoBERT(nn.Module):
    """Pretrained BERT encoder with a dropout + linear token-classification head.

    Args:
        num_labels: Number of output classes per token.
        model_name: Name or path of the pretrained encoder passed to
            ``AutoModel.from_pretrained``. Defaults to the Swedish BERT used
            throughout this notebook.
    """

    def __init__(self, num_labels, model_name='KB/bert-base-swedish-cased'):
        super().__init__()
        self.num_labels = num_labels

        # Use the constructor argument (not a notebook global) so that the
        # encoder config and the classification head always agree.
        self.bert = AutoModel.from_pretrained(model_name, num_labels=num_labels)
        self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.bert.config.hidden_size, self.num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the encoder and classify every token position.

        All encoder-related arguments are forwarded unchanged to the wrapped
        encoder. When ``labels`` is given, a cross-entropy loss is computed;
        positions whose ``attention_mask`` is not 1 are excluded from the loss
        (in addition to positions labelled with the loss' ignore index).

        Returns:
            A ``TokenClassifierOutput`` when ``return_dict`` is truthy (or,
            if ``None``, when the encoder config says so); otherwise a tuple
            ``(loss, logits, *extra)`` or ``(logits, *extra)`` if no labels
            were given, where ``extra`` is ``outputs[2:]`` of the encoder.
        """
        return_dict = return_dict if return_dict is not None else self.bert.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First encoder output: the per-token hidden states.
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            if attention_mask is not None:
                # Replace the labels of masked-out positions by the ignore
                # index so they do not contribute to the loss.
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


CPU times: user 49 µs, sys: 5 µs, total: 54 µs
Wall time: 66.5 µs


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import ClassLabel, load_dataset, load_metric

metric = load_metric("seqeval")

def _to_seqeval_tag(tag, empty_tag='EMPTY'):
    """Convert a punctuation tag to the IOB2 form seqeval expects ('O' = no punctuation)."""
    return 'O' if tag == empty_tag else 'B-' + tag


def compute_metrics(p, empty_tag='EMPTY'):
    """Compute seqeval precision/recall/F1 and token accuracy for an evaluation pass.

    seqeval interprets the leading characters of every tag as an IOB-style
    prefix. Feeding it the raw tag names therefore strips their first letter
    and counts the no-punctuation class as an entity. The tags are converted
    first: ``empty_tag`` becomes ``'O'`` and every punctuation tag becomes
    ``'B-<TAG>'``, so precision, recall and F1 describe the punctuation marks
    only, while accuracy remains token-level over all labelled positions.

    Args:
        p: Pair ``(predictions, labels)``; predictions are per-position class
            scores with the classes on axis 2, labels are class ids with
            ``-100`` marking positions to ignore.
        empty_tag: Name of the no-punctuation tag.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        # Drop ignored positions (special tokens, padding, continuation sub-tokens).
        kept = [(pred_id, label_id) for pred_id, label_id in zip(prediction, label) if label_id != -100]
        true_predictions.append([_to_seqeval_tag(id2tag[pred_id], empty_tag) for pred_id, _ in kept])
        true_labels.append([_to_seqeval_tag(id2tag[label_id], empty_tag) for _, label_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

def model_init(num_labels):
    """Create a fresh prestoBERT model with ``num_labels`` output classes."""
    model = prestoBERT(num_labels=num_labels)
    return model

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961.0, style=ProgressStyle(description…




In [None]:
# Release unused cached GPU memory held by PyTorch before the model is created.
torch.cuda.empty_cache()

In [None]:
# Training
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,              # total number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training (16 was tried)
    per_device_eval_batch_size=4,    # batch size for evaluation (64 was tried)
    warmup_steps=0,                  # number of warmup steps for learning rate scheduler
    learning_rate=5e-5,
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    # load_best_model_at_end needs evaluations (and matching checkpoints) during
    # training to have a "best" model to restore; without an evaluation strategy
    # no evaluation is run and the option has no effect.
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` -- adjust when upgrading.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    run_name='new run'
)

# One output class per tag instead of a hard-coded count.
model = model_init(len(tag2id))

trainer = Trainer(
    model=model,
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset (the validation split)
    compute_metrics=compute_metrics      # evaluation metrics
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501379977.0, style=ProgressStyle(descri…




In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mjohnnil[0m (use `wandb login --relogin` to force relogin)




Step,Training Loss
100,0.2071
200,0.0931
300,0.0825
400,0.0742
500,0.068
600,0.0651
700,0.0637
800,0.0625
900,0.0396
1000,0.0377


TrainOutput(global_step=3176, training_loss=0.040414234552935926, metrics={'train_runtime': 1629.363, 'train_samples_per_second': 1.949, 'total_flos': 0.0, 'epoch': 4.0, 'init_mem_cpu_alloc_delta': 2289852416, 'init_mem_gpu_alloc_delta': 499895808, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 515235840, 'train_mem_gpu_alloc_delta': 1505656832, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 0})

In [None]:
# Evaluate on the Trainer's eval_dataset (the validation split); reports the loss and the compute_metrics values.
trainer.evaluate()



{'epoch': 4.0,
 'eval_accuracy': 0.9846513727582386,
 'eval_f1': 0.9844381748663138,
 'eval_loss': 0.06586877256631851,
 'eval_mem_cpu_alloc_delta': 4448256,
 'eval_mem_cpu_peaked_delta': 3387392,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 89449472,
 'eval_precision': 0.9844854808814444,
 'eval_recall': 0.9843908733972159,
 'eval_runtime': 19.8694,
 'eval_samples_per_second': 39.961}

In [None]:
# Show the tag order and the id -> tag mapping used for the model outputs.
print(list(unique_tags))
print(id2tag)

['PERIOD', 'EMPTY', 'COMMA', 'QUESTION']
{0: 'PERIOD', 1: 'EMPTY', 2: 'COMMA', 3: 'QUESTION'}


In [None]:
# Per-class evaluation of the punctuation marks on the validation set.
predictions, labels, _ = trainer.predict(val_dataset)
predictions = np.argmax(predictions, axis=2)

# seqeval reads the leading characters of each tag as an IOB-style prefix, so
# raw tag names lose their first letter in the report. Map the no-punctuation
# class to 'O' and every punctuation mark to 'B-<TAG>'. The ids are resolved
# through id2tag (the mapping used to encode the labels) instead of a
# hard-coded class id.
label_list = {
    label_id: ('O' if tag == 'EMPTY' else 'B-' + tag)
    for label_id, tag in id2tag.items()
}

# Remove ignored index (special tokens). Positions whose gold label is EMPTY
# are kept on purpose: dropping them would hide punctuation predicted where
# none belongs and thereby inflate the per-class precision.
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results


  _warn_prf(average, modifier, msg_start, len(result))


{'ERIOD': {'f1': 0.8974431019949425,
  'number': 1788,
  'precision': 0.9017504234895539,
  'recall': 0.8931767337807607},
 'MPTY': {'f1': 0.0, 'number': 0, 'precision': 0.0, 'recall': 0.0},
 'OMMA': {'f1': 0.7089518668466037,
  'number': 1228,
  'precision': 0.7919597989949749,
  'recall': 0.6416938110749185},
 'UESTION': {'f1': 0.2857142857142857,
  'number': 6,
  'precision': 1.0,
  'recall': 0.16666666666666666},
 'overall_accuracy': 0.8925969845841097,
 'overall_f1': 0.7554218774734842,
 'overall_precision': 0.7241274658573597,
 'overall_recall': 0.7895433487756452}

In [None]:
!sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 31 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (1,432 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package git-lfs.
(Reading database ... 160983 files and directories c

In [None]:
# Log in to the Hugging Face Hub (prompts for username and password).
# NOTE(review): the command prints the access token into the cell output --
# clear this output before sharing or committing the notebook.
!transformers-cli login

2021-04-13 19:37:51.591990: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0

        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: Johnnil
Password: 
Login successful
Your token: <redacted>

Your token has been saved to /root/.huggingface/token


In [None]:
# Enable Git LFS for the current user.
!git lfs install

# Clone the model repository from the Hugging Face Hub.
# "password" in the URL is a placeholder for the account password.
# NOTE(review): credentials embedded in a clone URL are written to the clone's
# git config; prefer a credential helper or an access token.
!git clone https://Johnnil:password@huggingface.co/Johnnil/prestoBERT

Error: Failed to call git rev-parse --git-dir --show-toplevel: "fatal: not a git repository (or any of the parent directories): .git\n"
Git LFS initialized.
Cloning into 'prestoBERT'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 20 (delta 1), reused 0 (delta 0)[K
Unpacking objects: 100% (20/20), done.


In [None]:
# List the working directory, then change into the cloned model repository.
!ls
# %cd Johnnil
%cd prestoBERT
# %cd model_name

# Identity used for the commits made from this notebook.
!git config --global user.email "john.otis.nilsson@gmail.com"
# Tip: using the same email as for your huggingface.co account will link your commits to your profile
!git config --global user.name "Johnnil"

drive  prestoBERT  sample_data	      val_text_file.txt
logs   results	   val_tags_file.txt  wandb
/content/prestoBERT


In [None]:
# Show the contents of the current directory (the cloned repository after the %cd above).
!ls

results  special_tokens_map.json  tokenizer_config.json  vocab.txt


In [None]:
# %cd prestoBERT/

/content/prestoBERT


In [None]:
# %cd ..

/content


In [None]:
# !pwd

/content


In [None]:
# %rm prestoBERT -r

In [None]:
# Creates a file "result" in current directory
tokenizer.save_pretrained("./")
trainer.save_model()
# model.save_pretrained()
# model_to_save = model.module if hasattr(model, 'module') else model
# model_to_save.config.to_json()

In [None]:
# Stage everything in the repository, commit, and push to the Hugging Face Hub remote.
!git add .
!git commit -m "New run"
!git push

[main 3311e20] New run
 1 file changed, 1 insertion(+), 1 deletion(-)
Git LFS: (1 of 1 files) 475.76 MB / 475.76 MB
Counting objects: 4, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 464 bytes | 464.00 KiB/s, done.
Total 4 (delta 1), reused 0 (delta 0)
To https://huggingface.co/Johnnil/prestoBERT
   81e7497..3311e20  main -> main
