# Step 1: Build ASR pipeline
## Data preprocessing

In [1]:
from collections import defaultdict
import numpy as np
from pipelines.evaluation import get_or_filter_from_list, setup_database
import json

error_database_size = 0.5
filters = {
    "train": ["train"],
    "dev": ["train", "dev"],
    "test": ["train", "dev", "test"]
}
seed = 42
error_document_id_filter = defaultdict(list)
rng = np.random.default_rng(seed=seed)

error_document_store, error_search = setup_database(
    "ann_10_gazetteers_error_dataset_868514705", "ann", 10)

with open("/home/loebbert/projects/thesis/evaluations/false_positives.json", "r", encoding="utf-8") as file:
    false_positives = json.load(file)

for part in filters:
    current_filter = get_or_filter_from_list("dataset", filters[part])
    current_filter["$not"] = {  # type: ignore
        "doc_id": [doc_id for doc_id in false_positives[part]]
    }
    doc_count = error_document_store.get_document_count(
        current_filter)  # type: ignore
    filter_mask = rng.choice(doc_count,
                                int(doc_count * error_database_size),
                                replace=False).tolist()
    docs = error_document_store.get_all_documents(
        filters=current_filter)  # type: ignore
    for search_mask, doc in enumerate(docs):
        if search_mask in filter_mask and doc.id not in error_document_id_filter[
                part]:
            error_document_id_filter[part].append(doc.id)


  return self.fget.__get__(instance, owner)()


In [3]:
for key, values in error_document_id_filter.items():
    print(key, len(values))

train 706
dev 1040
test 1498


In [5]:
doc = {
    "tokens": [
        "@paulwalk", "It", "'s", "the", "view", "from", "where", "I", "'m",
        "living", "for", "two", "weeks", ".", "Empire", "State", "Building",
        "=", "ESB", ".", "Pretty", "bad", "storm", "here", "last", "evening",
        "."
    ],
    "extended": [
        "@paulwalk", "It", "'s", "the", "view", "from", "where", "I", "'m",
        "living", "for", "two", "weeks", ".", "Empire", "State", "Building",
        "=", "ESB", ".", "Pretty", "bad", "storm", "here", "last", "evening",
        "."
    ],
    "entities": [{
        "type": "location",
        "start": 14,
        "end": 17
    }, {
        "type": "location",
        "start": 18,
        "end": 19
    }],
    "doc_id":
    "wnut_train_0"
}
sentence = " ".join(doc["tokens"])

In [6]:
results = error_search.run(query=sentence,
                           params={
                               "filters": {
                                   "$and": {
                                       "$not": {
                                           "doc_id": [doc["doc_id"]],
                                           "content": sentence,
                                           "_id":
                                           error_document_id_filter["train"]
                                       },
                                       **{
                                           "$or": [{
                                               "dataset": ["train"]
                                           }]
                                       }
                                   }
                               }
                           })


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.63it/s]


In [21]:
docs = error_document_store.get_all_documents(
    filters={
        "$and": {
            "$not": {
                "doc_id": [doc["doc_id"]],
                "content": sentence,
                "_id": error_document_id_filter["dev"]
            },
            **get_or_filter_from_list("dataset", filters["dev"])
        }
    })


In [12]:
error_document_store.get_document_count()

3190

In [22]:
len(docs)

1233

In [8]:
results["documents"][0]

<Document: {'content': 'NYC', 'content_type': 'text', 'score': 0.5011589979241876, 'meta': {'data_type': 'gazetteers', 'type': 'location', 'doc_id': ['wnut_train_442', 'wnut_train_473', 'wnut_train_1830', 'wnut_train_2098', 'wnut_train_2248', 'wnut_train_2478'], 'dataset': ['train']}, 'id_hash_keys': ['content'], 'embedding': None, 'id': '3e40989e0035845aed63ad3b9989e9ac'}>

In [1]:
from data_preparation.wnut import wnut_to_json

wnut_to_json("datasets/wnut/wnut17train.conll", 
                  "datasets/wnut/emerging.dev.conll",
                  "datasets/wnut/emerging.test.conll")

Annotation error:  datasets/wnut/emerging.test.conll 18828 ['Advertise', 'I-creative-work']
Annotation error:  datasets/wnut/emerging.test.conll 18829 ['Anything', 'I-creative-work']


In [14]:
from data_preparation.conll03 import conll03_to_json

conll03_to_json("data/conll03/mine/train.txt",
                "data/conll03/mine/dev.txt",
                "data/conll03/mine/test.txt")


In [2]:
types['entities']

{'location': {'short': 'location'},
 'group': {'short': 'group'},
 'corporation': {'short': 'corporation'},
 'person': {'short': 'person'},
 'creative-work': {'short': 'creative-work'},
 'product': {'short': 'product'}}

In [7]:
test[204]

{'tokens': ['For',
  'more',
  'info',
  'about',
  'this',
  'and',
  'local',
  'views',
  'on',
  'the',
  'matter',
  'check',
  'out',
  'where',
  'OP',
  'took',
  'this',
  'from',
  '.'],
 'extended': ['For',
  'more',
  'info',
  'about',
  'this',
  'and',
  'local',
  'views',
  'on',
  'the',
  'matter',
  'check',
  'out',
  'where',
  'OP',
  'took',
  'this',
  'from',
  '.'],
 'entities': [Entity(type='creative-work', start=6, end=8)]}

In [4]:
from data_preprocessing.tokenize import tokenize_json
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small", model_max_length=4096)

tokenize_json(tokenizer,
              "datasets/wnut/wnut17train.json",
              "datasets/wnut/emerging.dev.json",
              "datasets/wnut/emerging.test.json",
              "datasets/wnut/wnut_types.json")

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
from data_preprocessing.tokenize import tokenize_json
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small", model_max_length=4096)

tokenize_json(tokenizer, "datasets/conll03/conll03_train.json",
              "datasets/conll03/conll03_dev.json",
              "datasets/conll03/conll03_test.json",
              "datasets/conll03/conll03_types.json")

  from .autonotebook import tqdm as notebook_tqdm


conll03_train: dropped 5 inst_ids: 8884 10279 13068 13234 13672
conll03_dev: dropped 8 inst_ids: 1055 2184 2185 2594 2595 2616 2617 2903
conll03_test: dropped 8 inst_ids: 33 46 47 202 203 1841 3198 3199


In [None]:
# labels can have multiple values concatenated via commas!

In [1]:
from data_preprocessing.tensorize import NERDataProcessor
from data_preprocessing.tokenize import MENTION_START, MENTION_END
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small", model_max_length=4096)
tokenizer.add_tokens(MENTION_START)
tokenizer.add_tokens(MENTION_END)

config = {
    "mention_start_token": MENTION_START,
    "mention_end_token": MENTION_END
}

processor = NERDataProcessor(config, tokenizer,
                             "datasets/wnut/wnut17train.t5-small.jsonlines",
                             "datasets/wnut/emerging.dev.t5-small.jsonlines",
                             "datasets/wnut/emerging.test.t5-small.jsonlines",
                             "datasets/wnut/wnut_types.json")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
processor.get_cache_path()

'datasets/wnut/cached.tensors.t5-small.bin'

In [2]:
train, dev, test = processor.get_tensor_samples()

In [3]:
processor.stored_info["example"]["emerging.test_22"]

{'doc_id': 'emerging.test_22',
 'sentence': ['▁Rep',
  'ly',
  'ing',
  '▁to',
  '▁another',
  '▁question',
  ',',
  '▁B',
  'ham',
  're',
  '▁said',
  '▁the',
  '▁jaw',
  'ans',
  '▁deployed',
  '▁at',
  '▁places',
  '▁such',
  '▁as',
  '▁Si',
  'a',
  'chen',
  '▁Gla',
  'cier',
  '▁are',
  '▁provided',
  '▁with',
  '▁the',
  '▁best',
  '-',
  'quality',
  '▁winter',
  '▁clothing',
  '.',
  '</s>'],
 'input_sentence': ['▁named',
  '▁entity',
  '▁recognition',
  ':',
  '▁Rep',
  'ly',
  'ing',
  '▁to',
  '▁another',
  '▁question',
  ',',
  '▁B',
  'ham',
  're',
  '▁said',
  '▁the',
  '▁jaw',
  'ans',
  '▁deployed',
  '▁at',
  '▁places',
  '▁such',
  '▁as',
  '▁Si',
  'a',
  'chen',
  '▁Gla',
  'cier',
  '▁are',
  '▁provided',
  '▁with',
  '▁the',
  '▁best',
  '-',
  'quality',
  '▁winter',
  '▁clothing',
  '.',
  '</s>'],
 'target_sentence': ['▁Rep',
  'ly',
  'ing',
  '▁to',
  '▁another',
  '▁question',
  ',',
  '<m>',
  '<m>',
  '▁B',
  'ham',
  're',
  '</m>',
  '</m>',
  '▁said'

In [4]:
import pickle

test_multiple = None
for (doc_key, subtoken_map, sample) in test.data:
    if doc_key == "emerging.test_22":
        test_multiple = (doc_key, subtoken_map, sample)
        break
with open("tests/data/wnut_nested_batch_1.pkl", "wb") as file:
    pickle.dump(test_multiple, file)

with open("tests/data/wnut_nested_batch_10.pkl", "wb") as file:
    res = [test_multiple, *test.data[:9]]
    print(len(res))
    pickle.dump(res, file)

with open("tests/data/wnut_batch_1.pkl", "wb") as file:
    pickle.dump(train.data[0], file)

with open("tests/data/wnut_batch_10.pkl", "wb") as file:
    pickle.dump(train.data[:10], file)

10


In [2]:
5/3, 5//3

(1.6666666666666667, 1)

In [5]:
from data_preprocessing.tensorize import ner_collate_fn
import pickle

with open("tests/data/wnut_batch_10.pkl", "rb") as file:
            data_point = pickle.load(file)
(doc_keys, subtoken_maps, batch) = ner_collate_fn(data_point)
assert len(subtoken_maps) == 10

In [6]:
subtoken_maps

([0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  2,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  4,
  5,
  6,
  6,
  6,
  6,
  6,
  6,
  7,
  8,
  8,
  9,
  9,
  9,
  10,
  11,
  12,
  13,
  13,
  13,
  13,
  13,
  14,
  14,
  15,
  16,
  17,
  18,
  18,
  18,
  18,
  18,
  19,
  19,
  19,
  19,
  19,
  19,
  20,
  20,
  20,
  20,
  21,
  22,
  22,
  22,
  22,
  23,
  23,
  23,
  23,
  23,
  24,
  24,
  24,
  24,
  25,
  26,
  26,
  27,
  28,
  29,
  30],
 [0,
  0,
  1,
  2,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  4,
  5,
  6,
  6,
  7,
  7,
  7,
  8,
  9,
  9,
  9,
  9,
  9,
  9,
  9,
  10,
  11,
  12,
  12,
  13,
  13,
  14,
  14,
  14,
  15,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  16,
  17,
  18,
  19,
  19,
  20,
  21,
  22,
  22,
  22,
  23,
  24,
  24,
  25,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  31,
  32,
  32,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  34,
  34,
  34,
  34,
  34,
  35],
 [0,
  0,
  1,
  1,
  2,
  2,
  2,
  3,
  3,
  3,
  4,
  5,

In [6]:
from transformers import T5Tokenizer, T5Model

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5Model.from_pretrained("t5-small")

input_ids = tokenizer(
    "Studies have been shown that owning a dog is good for you", return_tensors="pt"
).input_ids  # Batch size 1
decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

# forward pass
outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
last_hidden_states = outputs.last_hidden_state

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [1]:
import torch

torch.__version__

'2.0.0'

In [9]:
import os
"/" + os.path.join(*os.getcwd().split(os.path.sep)[:-1])

'/home/loebbert/projects'

In [1]:
from models.metrics import F1ASP

metric = F1ASP()

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
metric.update([(2, 3, 5)], [])
metric.update([], [])

In [28]:
metric.compute()

0.6530612707138062

In [1]:
from data.statistics_json import create_dataset_stats

asp_inputs, asp_entities = create_dataset_stats("data/conll03/asp/conll03_train.json",
                     "data/conll03/asp/conll03_dev.json",
                     "data/conll03/asp/conll03_test.json")


In [2]:
asp_inputs, asp_entities

(       Sentences  Tokens  Entities
 train       8564  203621     23484
 dev         2176   51362      5938
 test        1948   46435      5640,
        Total     LOC    MISC     ORG     PER
 train  23484  7132.0  3438.0  6319.0  6595.0
 dev     5938  1834.0   922.0  1341.0  1841.0
 test    5640  1663.0   702.0  1659.0  1616.0)

In [8]:
entitiy = asp_entities["Total"].copy()
entitiy.name = "asdf"
entitiy

train    23484
dev       5938
test      5640
Name: asdf, dtype: int64

In [9]:
entitiy2 = asp_entities["Total"].copy()
entitiy2.name = "asdf2"
entitiy2

train    23484
dev       5938
test      5640
Name: asdf2, dtype: int64

In [10]:
entitiy

train    23484
dev       5938
test      5640
Name: asdf, dtype: int64

In [11]:
import pandas as pd

pd.concat([entitiy, entitiy2], axis=1)

Unnamed: 0,asdf,asdf2
train,23484,23484
dev,5938,5938
test,5640,5640


In [25]:
stats_asp["train_entities"].loc["sum"].to_dict()

{'ORG': 6319.0,
 'entity_len': 34018.0,
 'MISC': 3438.0,
 'PER': 6595.0,
 'LOC': 7132.0}

In [17]:
from data.statistics_json import create_dataset_stats

stats_mine = create_dataset_stats("data/conll03/mine/conll03_train.json",
                                 "data/conll03/mine/conll03_dev.json",
                                 "data/conll03/mine/conll03_test.json")
stats_mine["test_inputs"]

Unnamed: 0,tokens_len,tokens_sent_count,extended_len,extended_sent_count,entities_count
sum,51362.0,3472.0,51362.0,3472.0,5938.0
count,3250.0,3250.0,3250.0,3250.0,3250.0
mean,15.803692,1.068308,15.803692,1.068308,1.827077
std,12.603389,0.252312,12.603389,0.252312,1.77781
min,1.0,1.0,1.0,1.0,0.0
25%,7.0,1.0,7.0,1.0,1.0
50%,11.0,1.0,11.0,1.0,1.0
75%,24.0,1.0,24.0,1.0,2.0
90%,34.0,1.0,34.0,1.0,4.0
max,109.0,2.0,109.0,2.0,20.0


In [28]:
stats_mine["test_entities"]

Unnamed: 0,ORG,entity_len,MISC,PER,LOC
sum,1656.0,8087.0,693.0,1617.0,1662.0
count,1656.0,5628.0,693.0,1617.0,1662.0
mean,1.0,1.436923,1.0,1.0,1.0
std,0.0,0.655608,0.0,0.0,0.0
min,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,1.0
50%,1.0,1.0,1.0,1.0,1.0
75%,1.0,2.0,1.0,1.0,1.0
90%,1.0,2.0,1.0,1.0,1.0
max,1.0,6.0,1.0,1.0,1.0


In [7]:
from data.statistics_json import create_dataset_stats

stats_hugginface = create_dataset_stats("data/conll03/huggingface/conll03_train.json",
                                  "data/conll03/huggingface/conll03_dev.json",
                                  "data/conll03/huggingface/conll03_test.json")
stats_hugginface["train_inputs"]

Unnamed: 0,tokens_len,tokens_sent_count,extended_len,extended_sent_count,entities_count
sum,203621.0,14833.0,203621.0,14833.0,23429.0
count,14041.0,14041.0,14041.0,14041.0,14041.0
mean,14.501887,1.056406,14.501887,1.056406,1.668613
std,11.602756,0.232251,11.602756,0.232251,1.527363
min,1.0,1.0,1.0,1.0,0.0
25%,6.0,1.0,6.0,1.0,1.0
50%,10.0,1.0,10.0,1.0,1.0
75%,22.0,1.0,22.0,1.0,2.0
90%,32.0,1.0,32.0,1.0,4.0
max,113.0,3.0,113.0,3.0,20.0


In [8]:
stats_hugginface["train_entities"]

Unnamed: 0,ORG,entity_len,MISC,PER,LOC
sum,6297.0,33954.0,3403.0,6600.0,7129.0
count,6297.0,23429.0,3403.0,6600.0,7129.0
mean,1.0,1.44923,1.0,1.0,1.0
std,0.0,0.698081,0.0,0.0,0.0
min,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,1.0
50%,1.0,1.0,1.0,1.0,1.0
75%,1.0,2.0,1.0,1.0,1.0
90%,1.0,2.0,1.0,1.0,1.0
max,1.0,10.0,1.0,1.0,1.0


In [12]:
from datasets import load_dataset

dataset = load_dataset("conll2003")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset conll2003 (/home/loebbert/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)
100%|██████████| 3/3 [00:00<00:00, 684.26it/s]


In [6]:
ner_tags = {
    'O': 0,
    'B-PER': 1,
    'I-PER': 2,
    'B-ORG': 3,
    'I-ORG': 4,
    'B-LOC': 5,
    'I-LOC': 6,
    'B-MISC': 7,
    'I-MISC': 8
}
reversed_ner_tags = {v: k for k, v in ner_tags.items()}

In [10]:
for part in dataset:
    with open("datasets/conll03/huggingface/"+part+".txt", "w", encoding="utf-8") as file:
        for item in dataset[part]:
            for line in ["\t".join([token, "x", "x", reversed_ner_tags[ner_tag_id]]) for token, ner_tag_id in zip(item["tokens"], item["ner_tags"])]:
                file.write(line + "\n")
            file.write("\n")

In [13]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
tokenizer

In [5]:

import torch

input_ids = torch.ones(80)
indices = [15, 20, 45, 60]

tensors = torch.tensor_split(input_ids, indices)
for t in tensors:
    print(t.size())

torch.Size([15])
torch.Size([5])
torch.Size([25])
torch.Size([15])
torch.Size([20])


In [21]:
torch.cat(tensors).size()

torch.Size([80])

In [20]:
import numpy as np

dropout = 0.3
np.random.choice([0, 1], size=4, p=[dropout, 1-dropout])

array([1, 0, 0, 1])

In [9]:
import pickle as pkl

with open("/home/loebbert/projects/thesis/finetuning/gazetteers_result.pkl", "rb") as file:
    results = pkl.load(file)

res_df = results.get_dataframe()

Couldn't read config from 1 paths


In [11]:
res_df.sort_values(by=["val_f1"], ascending=False)

Unnamed: 0,val_f1,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,trial_id,experiment_id,date,timestamp,...,config/prepend_search_results,config/search_algorithm,config/search_topk,config/task_learning_rate,config/train_search_dropout,config/train_search_shuffle,config/use_labels,config/use_mentions,config/warmup_ratio,logdir
2,0.6411201,73.167605,False,,,5,bd14bf74,f75ce781b7ee4a9eb145dbffc2ac683e,2023-04-23_10-28-36,1682238516,...,False,ann,10,0.0003,0.0,False,True,False,0.218601,/home/loebbert/projects/thesis/finetuning/tune...
0,0.6083592,77.08331,False,,,5,a1802828,f75ce781b7ee4a9eb145dbffc2ac683e,2023-04-23_10-13-29,1682237609,...,False,bm25,10,0.0003,0.0,False,True,True,0.05,/home/loebbert/projects/thesis/finetuning/tune...
27,0.5958363,96.932659,False,,,5,a0dd966a,9b94e0c2f8024df287907a65b577f50c,2023-04-23_13-50-53,1682250653,...,False,ann+reranking,34,0.0003,0.50576,True,True,False,0.405567,/home/loebbert/projects/thesis/finetuning/tune...
29,0.591528,83.487009,False,,,5,0ef10912,9b94e0c2f8024df287907a65b577f50c,2023-04-23_14-04-07,1682251447,...,True,ann,18,0.001493,0.457002,False,True,True,0.592869,/home/loebbert/projects/thesis/finetuning/tune...
21,0.5776488,65.914054,False,,,5,ab5b2f76,706b03c483704a15b20b872bc9ebb3db,2023-04-23_12-56-39,1682247399,...,False,bm25,10,0.0003,0.10422,False,False,True,0.502675,/home/loebbert/projects/thesis/finetuning/tune...
25,0.5691057,62.904102,False,,,5,f6015800,e86baeb8252845f3b780ad7267bf305d,2023-04-23_13-23-19,1682248999,...,False,ann,18,0.0003,0.675624,True,False,True,0.218601,/home/loebbert/projects/thesis/finetuning/tune...
26,0.5641769,84.026278,False,,,5,43293d02,9b94e0c2f8024df287907a65b577f50c,2023-04-23_13-39-00,1682249940,...,False,ann+reranking,22,0.001852,0.401718,False,True,False,0.659878,/home/loebbert/projects/thesis/finetuning/tune...
19,0.563463,54.393825,False,,,5,e4e9af0d,706b03c483704a15b20b872bc9ebb3db,2023-04-23_12-43-01,1682246581,...,False,ann,9,0.001899,0.660481,True,False,False,0.804497,/home/loebbert/projects/thesis/finetuning/tune...
8,0.5591078,70.054316,False,,,5,48bbe3a6,63cf6ef6501646e0ba3b4dd3093594d6,2023-04-23_11-10-35,1682241035,...,False,ann+reranking,22,0.001969,0.75192,True,True,True,0.506261,/home/loebbert/projects/thesis/finetuning/tune...
12,0.5560538,78.132409,False,,,5,cda767f5,63cf6ef6501646e0ba3b4dd3093594d6,2023-04-23_11-44-17,1682243057,...,True,bm25,10,0.002741,0.0,False,True,True,0.218601,/home/loebbert/projects/thesis/finetuning/tune...


In [14]:
res_df.sort_values(by=["val_f1"], ascending=False).index.values[:2].tolist()

[2, 0]

In [16]:
best_results = []
for i in res_df.sort_values(by=["val_f1"], ascending=False).index.values[:1].tolist():
    best_results.append(results[i].config)
best_results

[{'asp_hidden_dim': 629,
  'asp_dropout_rate': 0.3,
  'asp_init_std': 0.019999999999999976,
  'asp_activation': 'tanh',
  'plm_learning_rate': 4.999999999999997e-05,
  'task_learning_rate': 0.00029999999999999987,
  'adam_weight_decay': 0.1,
  'warmup_ratio': 0.21860103276831117,
  'use_labels': True,
  'use_mentions': False,
  'prepend_search_results': False,
  'filter_exact_match': False,
  'filter_same_document': False,
  'search_algorithm': 'ann',
  'search_topk': 10,
  'train_search_dropout': 0.0,
  'train_search_shuffle': False,
  'plm_pretrained_name_or_path': 't5-base',
  'plm_tokenizer_name': 't5-small',
  'model_max_length': 4096,
  'mention_start_token': '<m>',
  'mention_end_token': '</m>',
  'num_labels': 6,
  'max_nest_depth': 1,
  'beam_size': 1,
  'plm_scheduler': 'linear_with_warmup',
  'task_scheduler': 'linear_with_warmup',
  'adam_eps': 1e-08,
  'num_epochs': 20,
  'gradient_accumulation_steps': 1,
  'batch_size': 40,
  'train_len': 3394,
  'fused': True,
  'search_

In [14]:
results[5].config

{'asp_hidden_dim': 355,
 'asp_dropout_rate': 0.3,
 'asp_init_std': 0.019999999999999976,
 'asp_activation': 'gelu_fast',
 'plm_learning_rate': 4.999999999999997e-05,
 'task_learning_rate': 0.0013868381781415849,
 'adam_eps': 9.999999999999984e-09,
 'adam_weight_decay': 0.1,
 'warmup_ratio': 0.6483865273543568,
 'use_labels': True,
 'use_mentions': True,
 'prepend_search_results': False,
 'filter_exact_match': False,
 'filter_same_document': False,
 'search_algorithm': 'bm25',
 'search_topk': 5,
 'train_search_dropout': 0.6317431305015401,
 'train_search_shuffle': False,
 'plm_pretrained_name_or_path': 't5-base',
 'plm_tokenizer_name': 't5-small',
 'model_max_length': 4096,
 'mention_start_token': '<m>',
 'mention_end_token': '</m>',
 'num_labels': 6,
 'max_nest_depth': 1,
 'beam_size': 1,
 'plm_scheduler': 'linear_with_warmup',
 'task_scheduler': 'linear_with_warmup',
 'num_epochs': 20,
 'gradient_accumulation_steps': 1,
 'batch_size': 40,
 'train_len': 3394,
 'fused': True,
 'search_d

In [1]:
from data_preparation.lowner import lowner_to_json

lowner_to_json(
    "/home/loebbert/projects/thesis/data/mlowner/en/train_lower.txt",
    "/home/loebbert/projects/thesis/data/mlowner/en/dev_lower.txt",
    "/home/loebbert/projects/thesis/data/mlowner/en/test_sub_lower.txt")
