In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
import transformers
transformers.set_seed(42)
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# wandb.login(anonymous="allow")

In [3]:
model_checkpoint = "roneneldan/TinyStories-33M"

## Preparing Data

In [4]:
data = load_dataset('MohamedRashad/characters_backstories')

In [5]:
data["train"][400]

{'text': 'Generate Backstory based on following information\nCharacter Name: Dewin \nCharacter Race: Halfling\nCharacter Class: Sorcerer bard\n\nOutput:\n',
 'target': 'Dewin thought he was a wizard, but it turned out it was the draconic blood in his veins that brought him eldritch power.  Music classes in wizarding college taught him yet another use for his power, and when he was expelled he took up adventuring'}

In [6]:
data = data["train"].train_test_split(test_size=0.2, seed=42)

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    merged = example["text"] + " " + example["target"]
    batch = tokenizer(merged, padding='max_length', truncation=True, max_length=128)
    batch["labels"] = batch["input_ids"].copy()
    return batch

tokenized_datasets = data.map(tokenize_function, remove_columns=["text", "target"])

In [8]:
print(tokenizer.decode(tokenized_datasets["train"][900]['input_ids']))

Generate Backstory based on following information
Character Name: Mr. Gale
Character Race: Half-orc
Character Class: Cleric

Output:
 Growing up the only half-orc in a small rural town was rough. His mother didn't survive childbirth and so was raised in a church in a high mountain pass, his attention was always drawn by airships passing through, and dreams of an escape. Leaving to strike out on his own as early as he could he made a living for most of his life as an airship sailor, and occasionally a pirate. A single storm visits him throughout his life, marking every major


## Training

In [9]:
model = AutoModelForCausalLM.from_pretrained(model_checkpoint);

In [18]:
# run = wandb.init(project='language_model_finetuning', job_type="finetuning", anonymous="allow")

In [11]:
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-characters-backstories",
    report_to="wandb",
    num_train_epochs=1,
    logging_steps=1,
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    no_cuda=True,
)



In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.8127,3.356489


TrainOutput(global_step=233, training_loss=3.746427840940942, metrics={'train_runtime': 933.7233, 'train_samples_per_second': 1.989, 'train_steps_per_second': 0.25, 'total_flos': 40423258718208.0, 'train_loss': 3.746427840940942, 'epoch': 1.0})

In [14]:
transformers.logging.set_verbosity_error()

prefix = "Generate Backstory based on following information Character Name: "

prompts = [
    "Frogger Character Race: Aarakocra Character Class: Ranger Output: ",
    "Smarty Character Race: Aasimar Character Class: Cleric Output: ",
    "Volcano Character Race: Android Character Class: Paladin Output: ",
]

table = wandb.Table(columns=["prompt", "generation"])

for prompt in prompts:
    input_ids = tokenizer.encode(prefix + prompt, return_tensors="pt")
    output = model.generate(input_ids, do_sample=True, max_new_tokens=50, top_p=0.3)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    table.add_data(prefix + prompt, output_text)

In [19]:
# wandb.log({'tiny_generations': table})

In [20]:
# wandb.finish()