In [3]:
! pip install transformers
! pip install torch
! pip install plotly
! pip install nbformat
! pip install accelerate
! pip install datasets
! pip install sentence_transformers

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets)
  Using cached fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.10.9-cp310-cp310-macosx_11_0_arm64.whl.metadata (7.6 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohap

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch only the "train" split of the instruction dataset.
dataset = load_dataset(
    "hakurei/open-instruct-v1",
    split="train",
)

In [3]:
# Convert to pandas purely for a quick look at the raw records.
df = dataset.to_pandas()
# Fixed random_state so the preview shows the same rows on every run.
df.sample(10, random_state=42)

Unnamed: 0,output,input,instruction
419279,The specific volume of a vapor can be used to ...,,How to know if a vapor is saturated or superhe...
125488,The movie was too confusing for me to understa...,,Give an example of something you did not like ...
428057,Once upon a time in the land of the rising sun...,,write an epic ballad about a ninja muffin
464499,"Overall, this essay effectively showcases the ...",,Is this a good supplemental essay for college ...
152998,An effective marketing strategy for an Ad camp...,,What marketing strategies should I use to crea...
410668,JavaScript,,Find out which language is used in each line o...
159345,The Coronavirus pandemic has had a devastating...,,How did the Coronavirus pandemic affect the to...
35608,Apple Cinnamon Muffins: Preheat the oven to 37...,,Give an example of a recipe involving an apple.
464049,| Keyword Cluster | Keyword | Search Intent | ...,,Landing Page
296376,Turkey is one of the countries that borders th...,,Name a country that borders the Mediterranean ...


In [3]:
def preprocess(example):
    """Build the single training string for one record.

    Joins the ``instruction``, ``input`` and ``output`` fields, in that order,
    with single spaces and stores the result under ``example['prompt']``.
    Fields that are ``None`` or empty are skipped, so a record without an
    ``input`` does not produce a double space (or the literal text "None") in
    the prompt. When all three fields are non-empty the result is the same as
    plain space-separated concatenation.

    Args:
        example: mapping with ``'instruction'``, ``'input'`` and ``'output'`` keys.

    Returns:
        The same ``example`` object, with a ``'prompt'`` key added.

    Raises:
        KeyError: if one of the three expected keys is absent.
    """
    fields = (example['instruction'], example['input'], example['output'])
    # Keep only fields that carry text; str() mirrors f-string formatting.
    parts = [str(field) for field in fields if field is not None and str(field) != ""]
    example['prompt'] = " ".join(parts)
    return example


def tokenize_dataset(dataset, tokenizer, max_length=128):
    """Tokenize the ``prompt`` column of ``dataset`` and drop the raw text.

    Args:
        dataset: object exposing ``map(fn, batched=..., remove_columns=...)``
            whose batches contain a ``"prompt"`` entry.
        tokenizer: callable invoked as
            ``tokenizer(texts, truncation=True, max_length=...)``.
        max_length: value forwarded to the tokenizer as ``max_length``
            (truncation is enabled). Defaults to 128, the value this notebook
            used before the limit became configurable.

    Returns:
        Whatever ``dataset.map`` returns for the batched tokenization, with
        the ``"prompt"`` column requested to be removed.
    """
    def _tokenize_batch(batch):
        # Called with batched=True, so batch["prompt"] is passed on as a whole.
        return tokenizer(batch["prompt"], truncation=True, max_length=max_length)

    return dataset.map(_tokenize_batch, batched=True, remove_columns=["prompt"])

In [4]:
# Collapse every record into a single 'prompt' string and drop the source columns.
# NOTE: this rebinds `dataset`, so the cell cannot be re-run without reloading
# the raw dataset first (the removed columns would no longer exist).
dataset = dataset.map(preprocess, remove_columns=["instruction", "input", "output"])
# Take a 500-example subset and hold out 10% for evaluation. Both random steps
# are seeded so the same 450/50 split is produced on every run.
dataset = dataset.shuffle(seed=42).select(range(500)).train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
test_dataset = dataset["test"]
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt'],
        num_rows: 450
    })
    test: Dataset({
        features: ['prompt'],
        num_rows: 50
    })
})

In [5]:
# Checkpoint that both the tokenizer and the model weights are loaded from.
model_name = "microsoft/DialoGPT-medium"

model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# mlm=False: the collator is not asked to apply masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [6]:
# Tokenize both splits with the same tokenizer (train first, then test).
train_dataset, test_dataset = [
    tokenize_dataset(split, tokenizer)
    for split in (train_dataset, test_dataset)
]

Map: 100%|██████████| 450/450 [00:00<00:00, 4357.85 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 3985.31 examples/s]


In [7]:
# Training configuration: one pass over the training split, with separate
# per-device batch sizes for training and evaluation. All other options are
# left at the library defaults.
training_args = TrainingArguments(
    output_dir="./stevengpt-instruct",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
)

In [8]:
# Wire the model, the training configuration, the tokenized splits and the
# batch collator into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [9]:
# Run fine-tuning and keep the summary so it is still displayed by this cell.
train_output = trainer.train()
# Explicitly write the final weights and config to output_dir
# ("./stevengpt-instruct"). This short run (57 steps) finishes before the
# Trainer's default periodic checkpoint would fire, and such checkpoints go to
# sub-directories anyway, so without this call there is nothing to reload
# from that directory afterwards.
trainer.save_model()
train_output

100%|██████████| 57/57 [5:13:03<00:00, 329.53s/it]  

{'train_runtime': 18783.2855, 'train_samples_per_second': 0.024, 'train_steps_per_second': 0.003, 'train_loss': 3.965287057976974, 'epoch': 1.0}





TrainOutput(global_step=57, training_loss=3.965287057976974, metrics={'train_runtime': 18783.2855, 'train_samples_per_second': 0.024, 'train_steps_per_second': 0.003, 'total_flos': 100829324623872.0, 'train_loss': 3.965287057976974, 'epoch': 1.0})

In [None]:
# Load a causal-LM from the local directory that was passed to
# TrainingArguments as output_dir.
tuned_model = AutoModelForCausalLM.from_pretrained(
    "./stevengpt-instruct"
)

In [None]:
prompt = "List 5 african countries"
# Tokenize with the attention mask and move the tensors to whatever device the
# fine-tuned model lives on, instead of hard-coding 'cuda' (which fails on
# machines without an NVIDIA GPU).
inputs = tokenizer(prompt, return_tensors="pt").to(tuned_model.device)
# Generate with the reloaded fine-tuned model rather than the in-memory
# training model, so this cell actually exercises the saved weights.
outputs = tuned_model.generate(**inputs, max_length=64, pad_token_id=tokenizer.eos_token_id)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Trim to the last complete sentence. If the text contains no period, show it
# in full instead of an empty string (rfind returns -1 in that case).
last_period = generated_text.rfind('.')
generated_text[:last_period + 1] if last_period != -1 else generated_text