# Code for continuing training of a model

In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Locations of the previously trained checkpoint and of its tokenizer.
model_path = "./models-that-work/ice-gpt2"
tokenizer_path = "./models-that-work/ice-tokenizer"

# Restore the tokenizer and the language-model weights from those directories.
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Pad on the left-hand side, reusing the end-of-sequence id as the padding id.
tokenizer.padding_side = "left"
tokenizer.pad_token_id = tokenizer.eos_token_id


2023-08-17 11:52:14.339032: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-17 11:52:14.700653: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from datasets import load_dataset

# Load the Icelandic Common Crawl corpus by its Hugging Face dataset identifier.
huggingface_dataset = load_dataset("mideind/icelandic-common-crawl-corpus-IC3")

# Keep only examples whose text is longer than 512 characters; this drops empty
# strings as well as every other document of 512 characters or fewer.
# NOTE(review): the threshold counts characters, not tokens — confirm that is intended.
huggingface_dataset = huggingface_dataset.filter(lambda example: len(example["text"]) > 512)


Found cached dataset json (/home/haukur/.cache/huggingface/datasets/mideind___json/mideind--icelandic-common-crawl-corpus-IC3-15f813c6a91f241e/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/haukur/.cache/huggingface/datasets/mideind___json/mideind--icelandic-common-crawl-corpus-IC3-15f813c6a91f241e/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-6438ec45f6a1cab3.arrow
Loading cached processed dataset at /home/haukur/.cache/huggingface/datasets/mideind___json/mideind--icelandic-common-crawl-corpus-IC3-15f813c6a91f241e/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-91a30a0b68c06d2a.arrow
Loading cached processed dataset at /home/haukur/.cache/huggingface/datasets/mideind___json/mideind--icelandic-common-crawl-corpus-IC3-15f813c6a91f241e/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-8314f1c4b0df94ee.arrow


In [3]:
# Spot check: character count of the text of the fifth training example.
len(huggingface_dataset["train"][4]["text"])

595

In [4]:
from transformers import DataCollatorForLanguageModeling

# Upper bound on the number of tokens kept per example.
context_length = 512


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Sequences longer than ``context_length`` tokens are truncated, and the
    tokenizer is asked to report each sequence's length alongside its
    regular outputs.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the raw strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch.
    """
    return tokenizer(
        examples["text"],
        max_length=context_length,
        truncation=True,
        return_length=True,
    )


# Batch collator configured for causal language modelling (mlm=False), with no
# rounding of the padded length to a multiple.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=None)

# For a quick debugging run, work on a small slice instead of the full splits, e.g.:
#   limit = 10000
#   ds_train = huggingface_dataset["train"].select(range(int(limit)))
#   ds_test = huggingface_dataset["test"].select(range(int(limit / 10)))

# Full training and evaluation splits.
ds_train, ds_test = huggingface_dataset["train"], huggingface_dataset["test"]

# Tokenize each split in batches of 1000 examples across 4 worker processes,
# dropping the raw "text" column from the result.
tokanized_train_dataset = ds_train.map(
    tokenize_function,
    batched=True,
    batch_size=1000,
    num_proc=4,
    remove_columns=["text"],
)
tokanized_test_dataset = ds_test.map(
    tokenize_function,
    batched=True,
    batch_size=1000,
    num_proc=4,
    remove_columns=["text"],
)



Map (num_proc=4):   0%|          | 0/1378242 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/85931 [00:00<?, ? examples/s]

In [5]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters for the continued-training run, grouped by concern.
args = TrainingArguments(
    output_dir="ice-gpt2",
    # Batching: 16 sequences per device, gradients accumulated over 8 batches.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=8,
    # Optimisation: one epoch, cosine schedule with 1000 warm-up steps, fp16 enabled.
    num_train_epochs=1,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1_000,
    weight_decay=0.1,
    fp16=True,
    # Evaluate, log and checkpoint on the same 5000-step cadence.
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    save_steps=5_000,
)

# Wire the model, the tokenized splits and the collator into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokanized_train_dataset,
    eval_dataset=tokanized_test_dataset,
)

# Run training; kept as the final expression so the cell displays the returned summary.
trainer.train()



Step,Training Loss,Validation Loss
5000,1.0809,0.994996
10000,1.0162,0.948333


TrainOutput(global_step=10767, training_loss=1.044804362341965, metrics={'train_runtime': 16117.7994, 'train_samples_per_second': 85.511, 'train_steps_per_second': 0.668, 'total_flos': 4.507914432046694e+16, 'train_loss': 1.044804362341965, 'epoch': 1.0})

In [6]:
# Write the current model to the "ice-gpt2-common-crawl" directory.
# NOTE(review): only `model` is saved here; the tokenizer is not written to this directory.
model.save_pretrained("ice-gpt2-common-crawl")

In [8]:
from transformers import TextGenerationPipeline

# Build a text-generation pipeline around the model and tokenizer on device 0.
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer, device=0)

# Generate a single continuation (at most 512 tokens in total) of an Icelandic prompt.
# An alternative prompt to try: "Fréttir" (with repetition_penalty=1.2).
# NOTE(review): top_k / top_p / temperature presumably only take effect when sampling
# is enabled (do_sample=True) — confirm against the model's generation config.
generated_text = pipeline(
    "Það kviknaði í laugardalshöllini í gærmorgun",
    max_length=512,
    num_return_sequences=1,
    repetition_penalty=1.3,
    top_k=50,
    top_p=0.95,
    temperature=1.1,
)

# The pipeline returns one dict per returned sequence; print the text of the first.
print(generated_text[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Það kviknaði í laugardalshöllini í gærmorgun og var eftir tveggja daga brota. Hann var á leið frá Laugavegi 17, en þetta var óvenju stútfullur af þessum toga. Stútfullir af ýmiskonar verkum, sem voru á leið frá Laugavegi 18, en þar var þó ekki leyfilegur. En það var ekki lengur spurningum þeirra sem voru á leiðinni frá Laugavegi 16, en það var þó ekki á leið frá Laugavegi 18, en það var þó ekki lengur sérstaklega á leiðinni. Mynd : Kjartan Guðmundsson. Við þurfum að hafa áhyggjur af því að það verði ekki lengur s
