In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset

# Load only the 'train' split of the hakurei/open-instruct-v1 dataset.
dataset = load_dataset('hakurei/open-instruct-v1', split='train')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/104M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/22.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/109M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/498813 [00:00<?, ? examples/s]

GPT is a decoder-only model, i.e. it has no separate encoder input and only produces output. So we give it a prompt and let it continue from that prompt.

During training, we feed it the prompt and mask the ending tokens, then let the model complete it token by token.

In [None]:
def preprocess(example):
    """Build the single training text for one dataset row.

    Joins the row's 'instruction', 'input' and 'output' fields with single
    spaces and stores the result under the 'prompt' key. Fields that are
    empty or None are skipped, so a row without an 'input' does not end up
    with a double space (or the literal text "None") inside the prompt.

    Args:
        example: mapping with 'instruction', 'input' and 'output' keys.

    Returns:
        The same mapping, with the added 'prompt' key.
    """
    parts = (example['instruction'], example['input'], example['output'])
    example['prompt'] = ' '.join(str(part) for part in parts if part not in (None, ''))
    return example
​
def tokenize_dataset(dataset, max_length=128):
    """Tokenize the 'prompt' column of a dataset.

    Uses the module-level `tokenizer`, which must be defined before this
    function is called.

    Args:
        dataset: object exposing a `map` method and a 'prompt' column.
        max_length: maximum number of tokens kept per example; longer
            prompts are truncated. Defaults to 128.

    Returns:
        The result of `dataset.map`: the tokenizer's output with the
        'prompt' column removed.
    """
    def tokenize_batch(batch):
        # batched=True hands over a batch of prompts, tokenized in one call.
        return tokenizer(batch['prompt'], truncation=True, max_length=max_length)

    return dataset.map(tokenize_batch, batched=True, remove_columns=['prompt'])

In [None]:
# This instruction dataset format is used to train a lot of ChatGPT variants.
# It teaches the model to follow instructions and produce an output.
dataset

In [None]:
# Convert to pandas and display 20 randomly sampled rows of the loaded dataset.
dataset.to_pandas().sample(20)

In [None]:
# Add the combined 'prompt' column via preprocess() and drop the three source columns.
# NOTE(review): this rebinds `dataset`, so re-running the cell fails because the
# dropped columns no longer exist; re-run from the load cell instead.
dataset = dataset.map(preprocess, remove_columns=['instruction', 'input', 'output'])

In [None]:
# Display 20 randomly sampled rows again to check the new 'prompt' column.
dataset.to_pandas().sample(20)

In [None]:
# Shuffle with a fixed seed, keep the first 100,000 rows and hold out 10% for evaluation.
# The split is seeded as well: train_test_split shuffles again by default, so
# without a seed the train/test membership would differ on every run.
dataset = dataset.shuffle(seed=42).select(range(100000)).train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
test_dataset = dataset['test']
# This model is only half-trained and still bad at answering questions. We will train it to answer better.
MODEL_NAME = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# GPT was trained using sliding windows of text, so its tokenizer has no padding token.
# Our examples are separate sequences of different lengths that must be padded to be
# batched together, so we define a padding token by reusing the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# Replace the text splits with their tokenized versions.
train_dataset = tokenize_dataset(train_dataset)
test_dataset = tokenize_dataset(test_dataset)

In [None]:
# Display the tokenized train and test splits.
train_dataset, test_dataset

In [None]:
# Load the pretrained causal language model identified by MODEL_NAME.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

In [None]:
# Settings for a single fine-tuning epoch; output_dir is where the Trainer writes its results.
training_args = TrainingArguments(
    output_dir='./dialoGPT2-instruct',
    num_train_epochs=1,
    per_device_train_batch_size=8,  # normally set as large as possible without crashing
    per_device_eval_batch_size=16,
)

# The collator handles token masking, batching, etc.; it is given the tokenizer for padding.
# mlm=False because we train a causal LM, not a masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Save the fine-tuned model. No path is passed; presumably it goes to the
# output_dir set in TrainingArguments, which the next cell loads from.
trainer.save_model()

In [None]:
# Reload the saved model from './dialoGPT2-instruct' and move it to the GPU.
model = AutoModelForCausalLM.from_pretrained('./dialoGPT2-instruct').to('cuda')
# NOTE(review): this module-level `prompt` is not read by generate_text, whose
# parameter of the same name shadows it.
prompt = ''
def generate_text(prompt, max_length=64):
    """Generate a continuation of `prompt` with the fine-tuned model.

    Relies on the module-level `tokenizer` and `model`.

    Args:
        prompt: text the model should continue.
        max_length: value forwarded to `model.generate` as `max_length`
            (the length limit for the prompt plus generated tokens).
            Defaults to 64.

    Returns:
        The decoded text, cut after its last full stop. If the text contains
        no full stop it is returned unchanged rather than cut down to an
        empty string.
    """
    # Put the input on whatever device the model lives on instead of assuming CUDA.
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
    outputs = model.generate(inputs, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove any words after the last full stop. rfind() returns -1 when there
    # is none, which would slice down to ''; keep the whole text in that case.
    last_stop = generated_text.rfind('.')
    if last_stop == -1:
        return generated_text
    return generated_text[:last_stop + 1]

In [None]:
# Try the reloaded model on a sample question.
generate_text("How to get to Central from Hurstville using public transport in Sydney, Australia?")