In [28]:
import os
from tokenizers import ByteLevelBPETokenizer

all_paths = []
for root, dirs, files in os.walk("./processed_data"):
    for file in files:
        if file.endswith(".txt"):
             all_paths.append(os.path.join(root, file))

# # initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# train it on the files
tokenizer.train_from_iterator(all_paths, vocab_size=30000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
    ])







In [29]:
# create a directory to save the tokenizer files
os.makedirs("./ice-tokenizer", exist_ok=True)

# save it
tokenizer.save_model("./ice-tokenizer/")

['./ice-tokenizer/vocab.json', './ice-tokenizer/merges.txt']

In [30]:
from tokenizers import ByteLevelBPETokenizer

# Load the tokenizer from the saved model directory
tokenizer = ByteLevelBPETokenizer.from_file("./ice-tokenizer/vocab.json", "./ice-tokenizer/merges.txt")

In [31]:
tokenizer.encode('Hvað er að frétta 😁 ?').tokens

['H',
 'v',
 'a',
 'Ã',
 '°',
 'Ġ',
 'e',
 'r',
 'Ġ',
 'a',
 'Ã',
 '°',
 'Ġ',
 'fr',
 'Ã',
 '©',
 't',
 'ta',
 'Ġ',
 'ð',
 'Ł',
 'ĺ',
 'ģ',
 'Ġ',
 '?']

In [32]:
import os

all_paths = []
for root, dirs, files in os.walk("./processed_data"):
    for file in files:
        if file.endswith(".txt"):
             all_paths.append(os.path.join(root, file))


from datasets import load_dataset

portion = 0.01
all_paths = all_paths[:int(len(all_paths) * portion)]
# dataset = load_dataset('text', data_files=all_paths)

# train and test split
dataset = load_dataset('text', data_files=all_paths, split=['train[:80%]', 'train[80%:]'])

train_dataset = dataset[0]
test_dataset = dataset[1]


Resolving data files:   0%|          | 0/16842 [00:00<?, ?it/s]

Found cached dataset text (/home/haukur/.cache/huggingface/datasets/text/default-86be84e4b6fda226/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/2 [00:00<?, ?it/s]

In [33]:

context_length = 1024

def tokenize_function(examples):
    tokenizer.enable_truncation(max_length=context_length)
    tokenizer.enable_padding(length=context_length)

    tokenized_examples = tokenizer.encode_batch(examples["text"])

    # result = {"input_ids": [tokenized_example.ids for tokenized_example in tokenized_examples], "attention_mask": [tokenized_example.attention_mask for tokenized_example in tokenized_examples]}

    # to save space, we only save the input_ids
    result = {"input_ids": [tokenized_example.ids for tokenized_example in tokenized_examples]}
    return result

tokanized_train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=1000, remove_columns=["text"])
tokanized_test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=1000, remove_columns=["text"])


Loading cached processed dataset at /home/haukur/.cache/huggingface/datasets/text/default-86be84e4b6fda226/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-4528f20a85756126.arrow
Loading cached processed dataset at /home/haukur/.cache/huggingface/datasets/text/default-86be84e4b6fda226/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-ecc09ff14ac58a69.arrow


In [34]:
tokanized_train_dataset, train_dataset

(Dataset({
     features: ['input_ids'],
     num_rows: 13474
 }),
 Dataset({
     features: ['text'],
     num_rows: 13474
 }))

In [35]:
tokanized_train_dataset[0]

{'input_ids': [42,
  86,
  132,
  107,
  88,
  366,
  225,
  45,
  82,
  387,
  88,
  225,
  38,
  132,
  104,
  306,
  86,
  17,
  225,
  291,
  225,
  87,
  90,
  73,
  77,
  261,
  86,
  74,
  132,
  107,
  80,
  132,
  119,
  75,
  225,
  225,
  225,
  225,
  58,
  73,
  75,
  87,
  80,
  132,
  116,
  132,
  113,
  77,
  225,
  90,
  73,
  86,
  132,
  113,
  89,
  86,
  225,
  80,
  69,
  75,
  132,
  113,
  89,
  86,
  225,
  89,
  84,
  84,
  225,
  47,
  89,
  70,
  70,
  69,
  16,
  225,
  715,
  80,
  80,
  225,
  132,
  260,
  225,
  70,
  83,
  88,
  82,
  77,
  225,
  55,
  401,
  88,
  89,
  80,
  87,
  715,
  86,
  132,
  113,
  326,
  16,
  225,
  83,
  74,
  427,
  225,
  44,
  83,
  80,
  261,
  76,
  90,
  73,
  86,
  438,
  225,
  132,
  99,
  225,
  132,
  240,
  87,
  69,
  547,
  132,
  113,
  77,
  18,
  225,
  38,
  132,
  104,
  306,
  86,
  342,
  78,
  132,
  116,
  86,
  82,
  225,
  132,
  240,
  87,
  69,
  715,
  86,
  132,
  113,
  326,
  70,
  132,
  

In [36]:
from transformers import GPT2Config, GPT2LMHeadModel

# Initializing a GPT2 configuration
configuration = GPT2Config(vocab_size=30000, 
                           n_ctx=context_length, 
                        #    bos_token_id=0, 
                        #    eos_token_id=0, 
                        #    pad_token_id=0, 
                           n_positions=context_length)

# Initializing a model from the configuration
model = GPT2LMHeadModel(configuration)

# Accessing the model configuration
configuration = model.config

In [37]:
model.config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.30.1",
  "use_cache": true,
  "vocab_size": 30000
}

In [38]:
model_size = sum(t.numel() for t in model.parameters())
print(f"ICE GPT-2 size: {model_size/1000**2:.1f}M parameters")

ICE GPT-2 size: 108.9M parameters


In [39]:
from transformers import DataCollatorForLanguageModeling

tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False)

In [40]:
from transformers import Trainer, TrainingArguments

from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="ice-gpt2",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokanized_train_dataset,
    eval_dataset=tokanized_test_dataset,
    data_collator=data_collator,
)

trainer.train()



AttributeError: 'ByteLevelBPETokenizer' object has no attribute 'pad'

In [45]:
import torch
from torch.utils.data import DataLoader

# Define your training parameters
epochs = 5
batch_size = 32
learning_rate = 5e-4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create DataLoader for your training dataset
train_dataloader = DataLoader(tokanized_train_dataset, batch_size=batch_size, shuffle=True)

# Set up your model and optimizer
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        inputs = {"input_ids": batch["input_ids"], "labels": batch["input_ids"]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}: Average Loss = {average_loss}")


AttributeError: 'list' object has no attribute 'to'