## Import Libraries

In [24]:
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    set_seed,
    Trainer
)
from datasets import load_dataset
import wandb

## Setting Up

In [28]:
# wandb.login(anonymous="allow")

In [4]:
model_checkpoint = "roneneldan/TinyStories-33M"

## Preparaing Data

In [5]:
ds = load_dataset('MohamedRashad/characters_backstories')

In [6]:
ds["train"][400]

{'text': 'Generate Backstory based on following information\nCharacter Name: Dewin \nCharacter Race: Halfling\nCharacter Class: Sorcerer bard\n\nOutput:\n',
 'target': 'Dewin thought he was a wizard, but it turned out it was the draconic blood in his veins that brought him eldritch power.  Music classes in wizarding college taught him yet another use for his power, and when he was expelled he took up adventuring'}

In [7]:
ds = ds["train"].train_test_split(test_size=0.2, seed=42)

## Model Checkpoint

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)

In [10]:
tokenizer.pad_token = tokenizer.eos_token

In [11]:
def tokenize_function(example):
    merged = example["text"] + " " + example["target"]
    batch = tokenizer(merged, padding='max_length', truncation=True, max_length=128)
    batch["labels"] = batch["input_ids"].copy()
    return batch

In [12]:
tokenized_datasets = ds.map(tokenize_function, remove_columns=["text", "target"])

In [13]:
print(tokenizer.decode(tokenized_datasets["train"][900]['input_ids']))

Generate Backstory based on following information
Character Name: Mr. Gale
Character Race: Half-orc
Character Class: Cleric

Output:
 Growing up the only half-orc in a small rural town was rough. His mother didn't survive childbirth and so was raised in a church in a high mountain pass, his attention was always drawn by airships passing through, and dreams of an escape. Leaving to strike out on his own as early as he could he made a living for most of his life as an airship sailor, and occasionally a pirate. A single storm visits him throughout his life, marking every major


## Training

In [15]:
model = AutoModelForCausalLM.from_pretrained(model_checkpoint);

In [29]:
# run = wandb.init(project='llm_finetuning', job_type="training", anonymous="allow")

In [17]:
model_name = model_checkpoint.split("/")[-1]

In [20]:
training_args = TrainingArguments(
    f"{model_name}-finetuned-characters-backstories",
    report_to="wandb", # we need one line to track experiments in wandb
    num_train_epochs=1,
    logging_steps=1,
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    no_cuda=True, # force cpu use, will be renamed `use_cpu`
)

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,4.5971,3.353577


TrainOutput(global_step=233, training_loss=3.721863737433765, metrics={'train_runtime': 266.2794, 'train_samples_per_second': 6.974, 'train_steps_per_second': 0.875, 'total_flos': 40423258718208.0, 'train_loss': 3.721863737433765, 'epoch': 1.0})

In [25]:
transformers.logging.set_verbosity_error()

prefix = "Generate Backstory based on following information Character Name: "

prompts = [
    "Frogger Character Race: Aarakocra Character Class: Ranger Output: ",
    "Smarty Character Race: Aasimar Character Class: Cleric Output: ",
    "Volcano Character Race: Android Character Class: Paladin Output: ",
]

table = wandb.Table(columns=["prompt", "generation"])

for prompt in prompts:
    input_ids = tokenizer.encode(prefix + prompt, return_tensors="pt")
    output = model.generate(input_ids, do_sample=True, max_new_tokens=50, top_p=0.3)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    table.add_data(prefix + prompt, output_text)

In [30]:
# wandb.log({'tiny_generations': table})

In [31]:
# wandb.finish()