In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset

# Read the raw corpus. The encoding is pinned to UTF-8 so non-ASCII characters
# decode the same way on every machine instead of depending on the locale's
# default encoding.
with open('./jonas.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Documents in the file are separated by four consecutive newlines. Chunks that
# are empty or contain only whitespace are dropped, so no example ends up being
# nothing but the EOS marker.
ds = [d for d in text.split('\n\n\n\n') if d.strip()]

# Terminate every document with the EOS marker so document boundaries are
# visible in the training text.
ds = [d + '<|endoftext|>' for d in ds]

# 80/20 train/test split; the fixed random_state keeps the split reproducible.
train_ds, test_ds = train_test_split(ds, test_size=0.2, random_state=42)

# Wrap each split in a single-column ('text') Dataset.
train_dataset = Dataset.from_pandas(pd.DataFrame({'text': train_ds})).with_format("torch")
test_dataset = Dataset.from_pandas(pd.DataFrame({'text': test_ds})).with_format("torch")

In [None]:
# Sanity check: display the class of the training split built in the previous cell.
type(train_dataset)

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the model that fine-tuning starts from, read from a local checkpoint directory.
model_path = "../icegpt-xl/checkpoint-3000/"
model = GPT2LMHeadModel.from_pretrained(model_path)
# Switch to evaluation mode for now.
# NOTE(review): Trainer presumably puts the model back into train mode itself — confirm.
model.eval()

# Load the tokenizer from its own local directory (separate from the model checkpoint).
tokenizer = GPT2Tokenizer.from_pretrained("../ice-tokenizer-large")
# Reuse the EOS token id as the padding id so padded batches can be built.
tokenizer.pad_token_id = tokenizer.eos_token_id

# Left padding is deliberately left disabled in this cell; the line is kept for reference.
# tokenizer.padding_side = "left"

In [None]:
# Model size: add up the element counts of every weight tensor that will
# receive gradient updates (tensors with requires_grad=False are skipped).
parameters = 0
for weight in model.parameters():
    if weight.requires_grad:
        parameters += weight.numel()
print(f"Trainable parameters: {parameters}")

In [None]:
# Tokenize both splits with identical settings: every example is truncated or
# padded to exactly 512 tokens.
def _tokenize_batch(batch):
    """Tokenize the 'text' column of a batch, padding/truncating to 512 tokens."""
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)


# The variable names (including their spelling) are kept because later cells use them.
train_dataset_tokanized = train_dataset.map(_tokenize_batch, batched=True)
test_dataset_tokanized = test_dataset.map(_tokenize_batch, batched=True)

In [None]:
# Sanity check: number of token ids in one tokenized training example (index 1).
# The tokenization cell pads/truncates to max_length=512, so this is expected to be 512.
len(train_dataset_tokanized[1]['input_ids'])

In [None]:
from transformers import DataCollatorForLanguageModeling

# Make sure a padding token is defined before building the collator: EOS doubles as padding.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False turns off masked-language-modelling, i.e. batches are collated for
# causal (next-token) language modelling.
# NOTE(review): because pad == EOS, the collator presumably excludes EOS positions
# from the loss together with the padding — confirm against the collator docs.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False)

In [None]:
import os
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks CUDA to launch kernels synchronously,
# so a GPU error is reported at the call that caused it.
# NOTE(review): this slows training and presumably has to be set before CUDA is
# initialised to take effect — confirm, and drop it when not debugging.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


In [None]:
# Shell escape: print the current GPU status before training starts.
!nvidia-smi

In [None]:
from transformers import Trainer, TrainingArguments

# Fine-tuning hyper-parameters. Checkpoints and the final model are written
# under ./jonas.
# NOTE(review): the 5,000-step eval/logging/save intervals and the 1,000 warm-up
# steps only take effect if one epoch contains that many optimizer steps —
# confirm against the size of the training split.
args = TrainingArguments(
    output_dir="./jonas",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    # 10 accumulated micro-batches of size 1 -> effective batch size of 10 per device.
    gradient_accumulation_steps=10,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    # Mixed-precision training.
    fp16=True,
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset_tokanized,
    eval_dataset=test_dataset_tokanized,
    data_collator=data_collator,
)

# Keep the training summary so it is still the value displayed by this cell.
train_result = trainer.train()

# Persist the final weights and config to args.output_dir ("./jonas").
# Without this call only the periodic checkpoint-N directories (one every
# save_steps) are written, so a run shorter than save_steps leaves nothing on
# disk to reload.
trainer.save_model()

train_result

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model for inference.
# from_pretrained expects the model *directory* (the one holding the config and
# the weights file), not the path of the weights file itself: the config is
# resolved from the same path, so pointing at pytorch_model.bin fails.
# "./jonas" is the output_dir used for training.
# NOTE(review): assumes the final model was saved to that directory (e.g. with
# trainer.save_model()) — otherwise point this at a checkpoint-N sub-directory.
model_path = "./jonas"
model = GPT2LMHeadModel.from_pretrained(model_path)
# Inference only: switch to evaluation mode.
model.eval()

# Load the tokenizer and reuse EOS as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained("../ice-tokenizer-large")
tokenizer.pad_token_id = tokenizer.eos_token_id

# Pad on the left so that, in a padded batch, each prompt ends right where
# generation continues.
tokenizer.padding_side = "left"