In [1]:
import datasets
import torch.cuda

import wm_tokenizer
import text_utilities as tu
from HanTa import HanoverTagger as ht
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments , AutoModelForMaskedLM  # , AutoTokenizer, BertForMaskedLM
from tokenizers import pre_tokenizers

import sklearn
%load_ext autoreload
%autoreload 2

In [2]:
# Build the verb dictionary from the two raw vocabulary files.
# NOTE(review): "fun"/"lex" presumably stand for functional vs. lexical vocabulary — confirm in text_utilities.
vd = tu.VerbDict("../new_tokenizer/fun_vocab_raw.txt", "../new_tokenizer/lex_vocab_raw.txt")

In [3]:
# Assemble the wordmap tokenizer from its four components: a BERT pre-tokenizer,
# the pretrained German BERT tokenizer, the HanTa tagger and the vocabulary
# taken from the verb dictionary.
wmt = wm_tokenizer.WordmapTokenizer(
    bert_pretokenizer=pre_tokenizers.BertPreTokenizer(),
    # from_pretrained may contact the Hugging Face hub when resolving the tokenizer files.
    bert_tokenizer=BertTokenizer.from_pretrained("bert-base-german-cased"),
    hantatagger=ht.HanoverTagger('morphmodel_ger.pgz'),
    # NOTE(review): the structure of `vd.lmfm` is not visible here — confirm against VerbDict.
    vocab=vd.lmfm
)

In [24]:
def wm_tokenize(data):
    """Run the wordmap tokenizer over the ``text`` field of a batch.

    Args:
        data: Mapping with a ``'text'`` entry; that entry is forwarded unchanged
            as the first argument of ``wmt.wordmap2tokenizer``.

    Returns:
        Whatever ``wmt.wordmap2tokenizer`` returns for that input.
    """
    # All components are taken from the shared ``wmt`` instance; only the
    # ``pos_tag`` argument is fixed here (to "V").
    components = {
        "vocab": wmt.vocab,
        "pt": wmt.bert_pretokenizer,
        "tk": wmt.bert_tokenizer,
        "tg": wmt.hantatagger,
    }
    return wmt.wordmap2tokenizer(data['text'], pos_tag="V", **components)

In [28]:

def main():
    """Tokenize a text sample with the wordmap tokenizer and save a German BERT classifier.

    Pipeline:
      1. Gather the files under ``../data/oscar/to_lines`` and load
         ``files[5:15]`` as a single line-based text dataset.
      2. Split off 1000 training rows and 150 test rows.
      3. Tokenize both splits in batches with ``wm_tokenize``.
      4. Load ``bert-base-german-cased`` with a sequence-classification head
         and resize its token embeddings to the tokenizer's current size.
      5. Wrap the model in a ``Trainer``, move it to the GPU when one is
         available, and save it to ``../out/model/model_out_sequence.bin``.

    NOTE(review): ``trainer.train()`` is never called, so the checkpoint
    written at the end has not been fine-tuned — confirm this is intended.

    TODO: Fix UNK tokens in wmt.SequenceTokenizer
    """

    files = tu.files_from_path("../data/oscar/to_lines", full_path=True)
    dataset = datasets.load_dataset("text", data_files=files[5:15], split="train")
    dataset = dataset.train_test_split(train_size=1000, test_size=150, writer_batch_size=100)

    tokenized_dataset = dataset.map(wm_tokenize, batched=True, batch_size=1000)

    # recommendations: https://github.com/google-research/bert
    training_args = TrainingArguments(
        output_dir='./out/model_out',  # output directory
        num_train_epochs=4,  # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,  # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./out/model_logs',  # directory for storing logs
        logging_steps=10,
        learning_rate=3e-4
    )

    model = AutoModelForSequenceClassification.from_pretrained("bert-base-german-cased")

    # Tokenizing logs "Adding ... to the vocabulary", i.e. the tokenizer grows
    # beyond the pretrained vocabulary (presumably `wmt.bert_tokenizer`, which is
    # the tokenizer handed to wordmap2tokenizer). The embedding matrix has to
    # cover every id the tokenizer can emit; the call is a no-op if sizes match.
    # model: https://huggingface.co/transformers/v4.5.1/main_classes/model.html#transformers.PreTrainedModel.resize_token_embeddings
    model.resize_token_embeddings(len(wmt.bert_tokenizer))

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],  # training dataset
        eval_dataset=tokenized_dataset["test"]  # evaluation dataset
    )

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)
    model = trainer.model.to(device)
    # save_pretrained treats its argument as a directory, despite the ".bin" suffix.
    model.save_pretrained("../out/model/model_out_sequence.bin")

In [29]:
# Run the whole pipeline defined in main(); the tokenizer logs every token it adds to the vocabulary.
main()

Using custom data configuration default-ae2f30e0ad00aa0d
Reusing dataset text (/home/gnom/.cache/huggingface/datasets/text/default-ae2f30e0ad00aa0d/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad)
Loading cached split indices for dataset at /home/gnom/.cache/huggingface/datasets/text/default-ae2f30e0ad00aa0d/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad/cache-e6cd4c9b78814303.arrow and /home/gnom/.cache/huggingface/datasets/text/default-ae2f30e0ad00aa0d/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad/cache-c6fafa6a5c5ee5e0.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Adding Erfahre to the vocabulary
Adding Abheben to the vocabulary
Adding Überschlagen to the vocabulary
Adding Abtrocknen to the vocabulary
Adding wisch to the vocabulary
Adding gesell to the vocabulary
Adding netz to the vocabulary
Adding abschlepp to the vocabulary
Adding reproduzie to the vocabulary
Adding Abi to the vocabulary
Adding superhippen to the vocabulary
Adding implantie to the vocabulary
Adding Meinst to the vocabulary
Adding ##vorg to the vocabulary
Adding andau to the vocabulary
Adding Anmischen to the vocabulary
Adding wint to the vocabulary
Adding wipp to the vocabulary
Adding dct to the vocabulary
Adding anha to the vocabulary
Adding Gönn to the vocabulary
Adding Programmieren to the vocabulary
Adding volume to the vocabulary
Adding ##wimm to the vocabulary
Adding Vermehrt to the vocabulary
Adding komprimie to the vocabulary
Adding wett to the vocabulary
Adding ##zumach to the vocabulary
Adding Eintauchen to the vocabulary
Adding ##lanc to the vocabulary
Adding anru 

  0%|          | 0/1 [00:00<?, ?ba/s]

Adding Schleich to the vocabulary
Adding vielicht to the vocabulary
Adding anla to the vocabulary
Adding ##erlei to the vocabulary
Adding myessentielleoele to the vocabulary
Adding Lache to the vocabulary
Adding Fichten to the vocabulary
Adding Aufdecken to the vocabulary
Adding geheu to the vocabulary
Adding ##einstim to the vocabulary
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/bert-base-german-cased/resolve/main/config.json from cache at /home/gnom/.cache/huggingface/transformers/98877e98ee76b3977d326fe4f54bc29f10b486c317a70b6445ac19a0603b00f0.1f2afedb22f9784795ae3a26fe20713637c93f50e2c99101d952ea6476087e5e
Model config BertConfig {
  "_name_or_path": "bert-base-german-cased",
  "archit

cuda:0


Model weights saved in ../out/model/model_out_sequence.bin/pytorch_model.bin


MAIN FUNCTION BODY

In [21]:
# Collect the files under ../data/oscar/to_lines and load entries 5..14 of that
# list as one line-based text dataset.
files =  tu.files_from_path("../data/oscar/to_lines", full_path=True)
dataset = datasets.load_dataset("text", data_files=files[5:15], split="train")
# Replace the single dataset with fixed-size train (1000 rows) and test (150 rows) splits.
dataset = dataset.train_test_split(train_size=1000, test_size=150, writer_batch_size=100)
# NOTE(review): `metric` is not referenced by any cell visible here — confirm it is still needed.
metric = datasets.load_metric('glue', 'mrpc', keep_in_memory=True)

Using custom data configuration default-ae2f30e0ad00aa0d
Reusing dataset text (/home/gnom/.cache/huggingface/datasets/text/default-ae2f30e0ad00aa0d/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad)


In [22]:
def wm_tokenize(data):
    """Tokenize the ``text`` entry of ``data`` with the shared wordmap tokenizer.

    Args:
        data: Mapping that provides a ``'text'`` entry.

    Returns:
        The result of ``wmt.wordmap2tokenizer`` for ``data['text']``.
    """
    texts = data['text']
    # pos_tag is fixed to "V"; the remaining arguments are the components
    # stored on the ``wmt`` instance itself.
    return wmt.wordmap2tokenizer(
        texts,
        pos_tag="V",
        vocab=wmt.vocab,
        pt=wmt.bert_pretokenizer,
        tk=wmt.bert_tokenizer,
        tg=wmt.hantatagger,
    )


In [23]:
# Apply wm_tokenize to the train and test splits, passing up to 1000 rows per call.
tokenized_dataset = dataset.map(wm_tokenize, batched=True, batch_size=1000)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [32]:
# Call the tokenizer directly on two training rows (indices 5 and 6), bypassing
# Dataset.map — presumably to inspect the raw tokenizer output.
result = wmt.wordmap2tokenizer(dataset["train"]["text"][5:7], pos_tag="V", vocab=wmt.vocab, pt=wmt.bert_pretokenizer, tk=wmt.bert_tokenizer,
                                 tg=wmt.hantatagger)

In [1]:
#!/usr/bin/env python
__author__ = "Ricardo Jung"
__email__ = "s2458588@stud.uni-frankfurt.de"

# __copyright__ = ""
# __credits__ = [""]
# __license__ = ""
# __version__ = ""
# __maintainer__ = ""
# __status__ = ""

import datasets
import wm_tokenizer
import text_utilities as tu
from HanTa import HanoverTagger as ht
from transformers import BertTokenizer, Trainer, TrainingArguments, BertForMaskedLM, AutoModelForMaskedLM
from tokenizers import pre_tokenizers

# Build the verb dictionary from the two raw vocabulary files.
# NOTE(review): "fun"/"lex" presumably stand for functional vs. lexical vocabulary — confirm in text_utilities.
vd = tu.VerbDict("../new_tokenizer/fun_vocab_raw.txt", "../new_tokenizer/lex_vocab_raw.txt")

# Assemble the wordmap tokenizer from a BERT pre-tokenizer, the pretrained German
# BERT tokenizer, the HanTa tagger and the vocabulary from the verb dictionary.
wmt = wm_tokenizer.WordmapTokenizer(
    bert_pretokenizer=pre_tokenizers.BertPreTokenizer(),
    # from_pretrained may contact the Hugging Face hub; the traceback recorded for this cell was raised from this call.
    bert_tokenizer=BertTokenizer.from_pretrained("bert-base-german-cased"),
    hantatagger=ht.HanoverTagger('morphmodel_ger.pgz'),
    vocab=vd.lmfm
)


def wm_tokenize(data):
    """Feed ``data['text']`` through ``wmt.wordmap2tokenizer``.

    Args:
        data: Mapping that provides a ``'text'`` entry.

    Returns:
        The value returned by ``wmt.wordmap2tokenizer``.
    """
    # Keyword arguments are collected first so the call itself stays short;
    # everything except the fixed pos_tag comes from the ``wmt`` instance.
    call_kwargs = dict(
        pos_tag="V",
        vocab=wmt.vocab,
        pt=wmt.bert_pretokenizer,
        tk=wmt.bert_tokenizer,
        tg=wmt.hantatagger,
    )
    return wmt.wordmap2tokenizer(data['text'], **call_kwargs)


def main():
    """Tokenize a text sample with the wordmap tokenizer and save a German BERT classifier.

    Pipeline:
      1. Gather the files under ``../data/oscar/to_lines`` and load
         ``files[5:15]`` as a single line-based text dataset.
      2. Split off 1000 training rows and 150 test rows.
      3. Tokenize both splits in batches with ``wm_tokenize``.
      4. Load ``bert-base-german-cased`` with a sequence-classification head
         and resize its token embeddings to the tokenizer's current size.
      5. Wrap the model in a ``Trainer``, move it to the GPU when one is
         available, and save it to ``../out/model/test_model.model``.

    NOTE(review): ``trainer.train()`` is never called, so the checkpoint
    written at the end has not been fine-tuned — confirm this is intended.
    """
    # Neither name is provided by this cell's import block, so they are
    # imported here to keep the function runnable on a fresh kernel / as a script.
    import torch
    from transformers import AutoModelForSequenceClassification

    files = tu.files_from_path("../data/oscar/to_lines", full_path=True)
    dataset = datasets.load_dataset("text", data_files=files[5:15], split="train")
    dataset = dataset.train_test_split(train_size=1000, test_size=150, writer_batch_size=100)

    tokenized_dataset = dataset.map(wm_tokenize, batched=True, batch_size=1000)

    # recommendations: https://github.com/google-research/bert
    training_args = TrainingArguments(
        output_dir='./out/model_out',  # output directory
        num_train_epochs=4,  # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,  # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./out/model_logs',  # directory for storing logs
        logging_steps=10,
        learning_rate=3e-4
    )

    model = AutoModelForSequenceClassification.from_pretrained("bert-base-german-cased")

    # Tokenizing logs "Adding ... to the vocabulary", i.e. the tokenizer grows
    # beyond the pretrained vocabulary (presumably `wmt.bert_tokenizer`, which is
    # the tokenizer handed to wordmap2tokenizer). The embedding matrix has to
    # cover every id the tokenizer can emit; the call is a no-op if sizes match.
    # model: https://huggingface.co/transformers/v4.5.1/main_classes/model.html#transformers.PreTrainedModel.resize_token_embeddings
    model.resize_token_embeddings(len(wmt.bert_tokenizer))

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],  # training dataset
        eval_dataset=tokenized_dataset["test"]  # evaluation dataset
    )

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = trainer.model.to(device)
    # save_pretrained treats its argument as a directory.
    model.save_pretrained("../out/model/test_model.model")

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/gnom/builds/anaconda3/envs/tokenizer3.7/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3552, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_407648/3744538699.py", line 23, in <module>
    bert_tokenizer=BertTokenizer.from_pretrained("bert-base-german-cased"),
  File "/home/gnom/builds/anaconda3/envs/tokenizer3.7/lib/python3.7/site-packages/transformers/tokenization_utils_base.py", line 1752, in from_pretrained
    user_agent=user_agent,
  File "/home/gnom/builds/anaconda3/envs/tokenizer3.7/lib/python3.7/site-packages/transformers/utils/hub.py", line 292, in cached_path
    local_files_only=local_files_only,
  File "/home/gnom/builds/anaconda3/envs/tokenizer3.7/lib/python3.7/site-packages/transformers/utils/hub.py", line 501, in get_from_cache
    r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout)
  File "/home/gnom/builds/ana

TypeError: object of type 'NoneType' has no len()

In [46]:
# Run the whole pipeline defined in main(); the tokenizer logs every token it adds to the vocabulary.
main()

Using custom data configuration default-ae2f30e0ad00aa0d
Reusing dataset text (/home/gnom/.cache/huggingface/datasets/text/default-ae2f30e0ad00aa0d/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad)


  0%|          | 0/1 [00:00<?, ?ba/s]

Adding ##schlos to the vocabulary
Adding füh to the vocabulary
Adding wi to the vocabulary
Adding ##rd to the vocabulary
Adding ##schätzt to the vocabulary
Adding tret to the vocabulary
Adding biet to the vocabulary
Adding Erfahre to the vocabulary
Adding könn to the vocabulary
Adding ##eht to the vocabulary
Adding mach to the vocabulary
Adding steh to the vocabulary
Adding wol to the vocabulary
Adding dach to the vocabulary
Adding ##rb to the vocabulary
Adding ##steh to the vocabulary
Adding ##bind to the vocabulary
Adding Abheben to the vocabulary
Adding Überschlagen to the vocabulary
Adding vermei to the vocabulary
Adding Abtrocknen to the vocabulary
Adding wisch to the vocabulary
Adding möch to the vocabulary
Adding gesell to the vocabulary
Adding ##zuta to the vocabulary
Adding bege to the vocabulary
Adding ##rte to the vocabulary
Adding netz to the vocabulary
Adding offenba to the vocabulary
Adding Seh to the vocabulary
Adding seh to the vocabulary
Adding abschlepp to the vocabul

  0%|          | 0/1 [00:00<?, ?ba/s]

Adding Schleich to the vocabulary
Adding rutsch to the vocabulary
Adding fors to the vocabulary
Adding vielicht to the vocabulary
Adding Nachnahme to the vocabulary
Adding ##sandt to the vocabulary
Adding ##spi to the vocabulary
Adding geko to the vocabulary
Adding werf to the vocabulary
Adding gewonn to the vocabulary
Adding ##ekl to the vocabulary
Adding Tasten to the vocabulary
Adding red to the vocabulary
Adding entla to the vocabulary
Adding getrunk to the vocabulary
Adding leh to the vocabulary
Adding erhi to the vocabulary
Adding benut to the vocabulary
Adding anla to the vocabulary
Adding ##edi to the vocabulary
Adding ##gleit to the vocabulary
Adding ##erlei to the vocabulary
Adding Lasst to the vocabulary
Adding erfuh to the vocabulary
Adding myessentielleoele to the vocabulary
Adding Lache to the vocabulary
Adding Fichten to the vocabulary
Adding Aufdecken to the vocabulary
Adding ##tsch to the vocabulary
Adding bilde to the vocabulary
Adding herstel to the vocabulary
Adding