In [1]:
# Reload the IPython autoreload extension and switch it to mode 2, so that
# imported modules are re-imported before each cell is executed.
%reload_ext autoreload
%autoreload 2

In [2]:
# Add the parent of the current working directory to sys.path (once), so that
# packages living there can be imported -- presumably this is where `src` lives.
import sys
from pathlib import Path
root_path = str(Path.cwd().parent)
if sys.path.count(root_path) == 0:
    sys.path.append(root_path)

# Import external libraries
from scandeval import load_dataset
import pandas as pd

# Import local scripts
from src import get_ner_trainer

## Load datasets and model

In [3]:
dataset_names = ['dane', 'norne-nb', 'norne-nn', 'suc3', 'wikiann-is', 'wikiann-fo']


def _load_joined_frame(name):
    """Load dataset `name` once and join two of its parts column-wise.

    Elements 0 and 2 of the value returned by `load_dataset` are concatenated
    along the column axis. They are presumably the training features and the
    training labels -- TODO confirm against the ScandEval documentation.

    Both parts are taken from a single `load_dataset` call, so every dataset
    is only loaded once and the two parts are guaranteed to originate from
    the same load.
    """
    splits = load_dataset(name)
    return pd.concat((splits[0], splits[2]), axis=1)


# Maps each dataset name to its joined dataframe
all_datasets = {name: _load_joined_frame(name) for name in dataset_names}

## Concatenating all the datasets

In [4]:
# Stack the per-dataset dataframes on top of each other and renumber the rows
# from zero, discarding the original per-dataset indices.
stacked = pd.concat(all_datasets.values(), axis=0)
fully_concatenated = stacked.reset_index(drop=True)

num_documents = len(fully_concatenated)
print(f'There are {num_documents:,} documents in the dataset.')
fully_concatenated.head()

There are 100,151 documents in the dataset.


Unnamed: 0,doc,tokens,ner_tags
0,På fredag har SID inviteret til reception i SI...,"[På, fredag, har, SID, inviteret, til, recepti...","[O, O, O, B-ORG, O, O, O, O, B-LOC, O, O, O, O..."
1,Eller slet og ret tykke og fede i mere eller m...,"[Eller, slet, og, ret, tykke, og, fede, i, mer...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Under rejser og ophold i udlandet følger sygep...,"[Under, rejser, og, ophold, i, udlandet, følge...","[O, O, O, O, O, O, O, O, O, O, O, O]"
3,Hele det smukt restaurerede hus vrimler med bø...,"[Hele, det, smukt, restaurerede, hus, vrimler,...","[O, O, O, O, O, O, O, O, O, B-PER, I-PER, O, O..."
4,Hun er selv på femte år lykkeligt gift med sin...,"[Hun, er, selv, på, femte, år, lykkeligt, gift...","[O, O, O, O, O, O, O, O, O, O, B-MISC, O, O, B..."


In [7]:
# Build a trainer for the full concatenated dataset. `val` is passed to
# `trainer.evaluate` further down, so it is presumably the held-out
# validation data -- TODO confirm in `src`.
trainer, val = get_ner_trainer(
    model_id='NbAiLab/nb-bert-base',
    df=fully_concatenated,
)

[32m2021-09-17 20:02:11,010 [INFO] <src.trainer>
↳ [0m[33mLoading model[0m
[32m2021-09-17 20:02:18,297 [INFO] <src.trainer>
↳ [0m[33mConverting dataframe to HuggingFace dataset[0m
[32m2021-09-17 20:02:18,518 [INFO] <src.trainer>
↳ [0m[33mTokenising and aligning dataset[0m
[32m2021-09-17 20:02:46,606 [INFO] <src.trainer>
↳ [0m[33mInitialising Trainer object[0m


In [None]:
# Run training with the trainer that was built from `fully_concatenated`
trainer.train()

In [None]:
# Evaluate on `val`, the second value returned by `get_ner_trainer`
trainer.evaluate(val)

## Ensuring equal language contribution

In [None]:
# Downsample every dataset to the size of the smallest one, so that each
# dataset contributes the same number of documents to the training data.
# The per-dataset dataframes live in `all_datasets`, which is the dict built
# when the datasets were loaded.
# A fixed seed keeps the subsample identical across kernel restarts, so the
# results of this experiment can be reproduced.
SUBSAMPLE_SEED = 4242
min_length = min(len(df) for df in all_datasets.values())
subsampled = (pd.concat([df.sample(min_length, random_state=SUBSAMPLE_SEED)
                         for df in all_datasets.values()], axis=0)
                .reset_index(drop=True))
print(f'There are {len(subsampled):,} documents in the dataset.')
subsampled.head()

In [None]:
# Build a fresh trainer for the equal-sized subsample; this rebinds both
# `trainer` and `val` from the previous experiment.
trainer, val = get_ner_trainer(
    model_id='NbAiLab/nb-bert-base',
    df=subsampled,
)

In [None]:
# Run training with the trainer that was built from `subsampled`
trainer.train()

In [None]:
# Evaluate on `val`, the second value returned by `get_ner_trainer`
trainer.evaluate(val)