In [1]:
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
datasets = load_dataset("gsm8k", "main")
datasets["train"][0]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

# Tokenize Dataset

In [3]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained("facebook/layerskip-llama2-7B")


In [4]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [5]:
tokenizer("Where are the plants?")

{'input_ids': [1, 6804, 526, 278, 18577, 29973], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [26]:
# find the maximum length of a question
max_len_q = 0
for example in datasets["train"]:
    max_len_q = max(len(example['question']), max_len_q)
for example in datasets["test"]:
    max_len_q = max(len(example['question']), max_len_q)
max_len_q

985

In [24]:
max_len_ans = 0
for example in datasets["train"]:
    max_len_ans = max(len(example['answer']), max_len_ans)
for example in datasets["test"]:
    max_len_ans = max(len(example['answer']), max_len_ans)
max_len_ans

1228

In [25]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [28]:
max_length = 1024 # The maximum length of a feature (question and context)


def prepare_train_features(example):
    text_column = "question"
    label_column = "answer"
    batch_size = len(example[text_column])
    inputs = f"{text_column} : {example[text_column]} Label : "
    targets = example[label_column]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets, add_special_tokens=False)  # don't add bos token because we concatenate with inputs

    sample_input_ids = model_inputs["input_ids"]
    label_input_ids = labels["input_ids"] + [tokenizer.eos_token_id]
    # print(i, sample_input_ids, label_input_ids)
    model_inputs["input_ids"] = sample_input_ids + label_input_ids
    labels["input_ids"] = [-100] * len(sample_input_ids) + label_input_ids
    model_inputs["attention_mask"] = [1] * len(model_inputs["input_ids"])
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
features = prepare_train_features(datasets['train'][0])
features.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [32]:
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup


tokenized_train_dataset = datasets["train"].map(prepare_train_features, remove_columns=datasets["train"].column_names)
train_dataloader = DataLoader(tokenized_train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=16)

In [33]:
for ex in train_dataloader:
    print(ex)
    break

ValueError: expected sequence of length 151 at dim 1 (got 121)

In [14]:
max_length = 1024 # The maximum length of a feature (question and context)
import torch

# def prepare_test_features(example):
#     text_column = "question"
#     batch_size = len(example[text_column])
#     inputs = f"{text_column} : {example[text_column]} Label : "
#     model_inputs = tokenizer(inputs)

#     sample_input_ids = model_inputs["input_ids"]
#     # print(i, sample_input_ids, label_input_ids)
#     model_inputs["input_ids"] = sample_input_ids
#     model_inputs["attention_mask"] = [1] * len(model_inputs["input_ids"])
    
#     model_inputs["input_ids"] = torch.tensor(model_inputs["input_ids"])
#     model_inputs["attention_mask"] = torch.tensor(model_inputs["attention_mask"])
#     model_inputs["labels"] = labels["input_ids"]

    
#     return model_inputs
# features = prepare_test_features(datasets['test'][0])



In [15]:
from torch.utils.data import DataLoader

tokenized_test_dataset = datasets["test"].map(prepare_train_features, remove_columns=datasets["test"].column_names)
test_dataloader = DataLoader(tokenized_test_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=1)

In [22]:
for ex in test_dataloader:
    # print(ex)
    q = tokenizer.batch_decode(ex['input_ids'].detach().cpu().numpy(), skip_special_tokens=True)
    print(q)
    ans = tokenizer.batch_decode(ex['labels'].detach().cpu().numpy(), skip_special_tokens=True)
    print(ans)
    break

["question : Brian's friend Bobby has 5 fewer than 3 times as many video games as Brian does.  If Brian has 20 video games but lost 5 right before the comparison was made, how many does Bobby have? Label :  If Brian previously had 20 video games but lost 5, that means he now has 20-5=<<20-5=15>>15 video games\nBrian has 15 video games, so if Bobby has 5 fewer than 3 times as many as Brian does we must first perform 15*3=45\nWe then subtract 5 from the previous total for 45-5=40 games in total.\n#### 40"]


OverflowError: out of range integral type conversion attempted

In [126]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

import torch
model = AutoModelForCausalLM.from_pretrained(
        "facebook/layerskip-llama2-7B",
        use_safetensors=True,
        device_map="auto",
        torch_dtype=torch.float16)

Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.71s/it]


In [127]:
from peft import get_peft_config, get_peft_model, PrefixTuningConfig, TaskType, PeftType
peft_config = PrefixTuningConfig(task_type=TaskType.CAUSAL_LM, num_virtual_tokens=30)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


trainable params: 7,864,320 || all params: 6,746,279,936 || trainable%: 0.1166


In [128]:
from transformers import default_data_collator, get_linear_schedule_with_warmup
lr = 3e-2
num_epochs = 5
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [129]:
len(train_dataloader)

7473

In [130]:
from tqdm import tqdm
# training and evaluation
device = "cuda"
model = model.to("cuda")
model = model.train()

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        #         print(batch["input_ids"].shape)
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(test_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(test_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 7473/7473 [05:46<00:00, 21.58it/s]
100%|██████████| 1319/1319 [00:24<00:00, 54.83it/s]


epoch=0: train_ppl=tensor(1.9895, device='cuda:0') train_epoch_loss=tensor(0.6879, device='cuda:0') eval_ppl=tensor(1.7078, device='cuda:0') eval_epoch_loss=tensor(0.5352, device='cuda:0')


100%|██████████| 7473/7473 [05:46<00:00, 21.55it/s]
100%|██████████| 1319/1319 [00:24<00:00, 54.90it/s]


epoch=1: train_ppl=tensor(1.6220, device='cuda:0') train_epoch_loss=tensor(0.4836, device='cuda:0') eval_ppl=tensor(1.5768, device='cuda:0') eval_epoch_loss=tensor(0.4554, device='cuda:0')


100%|██████████| 7473/7473 [05:46<00:00, 21.56it/s]
100%|██████████| 1319/1319 [00:24<00:00, 54.93it/s]


epoch=2: train_ppl=tensor(1.5395, device='cuda:0') train_epoch_loss=tensor(0.4315, device='cuda:0') eval_ppl=tensor(1.5404, device='cuda:0') eval_epoch_loss=tensor(0.4320, device='cuda:0')


100%|██████████| 7473/7473 [05:49<00:00, 21.39it/s]
100%|██████████| 1319/1319 [00:23<00:00, 54.96it/s]


epoch=3: train_ppl=tensor(1.4840, device='cuda:0') train_epoch_loss=tensor(0.3947, device='cuda:0') eval_ppl=tensor(1.5091, device='cuda:0') eval_epoch_loss=tensor(0.4115, device='cuda:0')


100%|██████████| 7473/7473 [05:46<00:00, 21.56it/s]
100%|██████████| 1319/1319 [00:24<00:00, 54.91it/s]


epoch=4: train_ppl=tensor(1.3991, device='cuda:0') train_epoch_loss=tensor(0.3359, device='cuda:0') eval_ppl=tensor(1.4945, device='cuda:0') eval_epoch_loss=tensor(0.4018, device='cuda:0')


In [131]:
peft_model_id = "llama_finetuned_gsm8k"
model.save_pretrained(peft_model_id)

# Load

In [1]:
from peft import PeftModel, PeftConfig

model_id = "llama_finetuned_gsm8k"
config = PeftConfig.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, model_id)

  from .autonotebook import tqdm as notebook_tqdm


In [140]:
device = "cuda"
model.to(device)
model.eval()
i = 4
inputs = tokenizer( f"question : {example['question']} Label : ", return_tensors="pt")
print(datasets["test"][i]["question"])
print(inputs)

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=3
    )
    print(outputs)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))
    print(datasets['test'][i]['answer'])

Every day, Wendi feeds each of her chickens three cups of mixed chicken feed, containing seeds, mealworms and vegetables to help keep them healthy.  She gives the chickens their feed in three separate meals. In the morning, she gives her flock of chickens 15 cups of feed.  In the afternoon, she gives her chickens another 25 cups of feed.  How many cups of feed does she need to give her chickens in the final meal of the day if the size of Wendi's flock is 20 chickens?
{'input_ids': tensor([[    1,  1139,   584,  6498,   322, 29871, 29941,   310,   670,  7875,
          1797, 29871, 29955,   282,  4981,   294,   363,   301,  3322, 29889,
          7806,   282, 24990,   338,  5700,   964, 29871, 29947,   269, 29399,
         29889,   960,  6498,   322,   670,  7875,   864,   304,  6232,   278,
           282,  4981,   294, 18018, 29892,   920,  1784,   269, 29399,   508,
          1269,   310,   963,   505, 29973, 15796,   584, 29871]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 



In [145]:
for ex in train_dataloader:
    print(ex)
    q = tokenizer.batch_decode(ex['input_ids'].detach().cpu().numpy(), skip_special_tokens=True)
    print(q)
    ans = tokenizer.batch_decode(ex['labels'].detach().cpu().numpy(), skip_special_tokens=True)
    print(ans)
    break

{'input_ids': tensor([[    1,  1139,   584,  7991,  4687,   714,   278,  4723,   411,   395,
         29947, 29900, 29889,  1551, 27822,  7432, 29892,   540, 10398,  4203,
           278,  6909, 29889,  1551,   323,  1041,  3250, 29892,   540, 10398,
           697, 29899, 28491,   386,   310,   278,  5253,  2175,   515, 27822,
         29889,  1551, 15050,  4515,  3250, 29892,   540, 10398, 29871, 29941,
         29914, 29947,   386, 29879,   310,   278,  5253,  2175,   515,   323,
          1041,  3250, 29889,  1128,  1568,   947,   540,   505,  2175,  1286,
         29973, 15796,   584, 29871,  1551, 27822, 29892,   540, 10398,  4203,
           310,   395, 29947, 29900, 10124,  1075,   411,   395, 29947, 29900,
         17722, 29896, 29914, 29906, 11877, 29938, 29947, 29900,   353,   395,
          9314, 29947, 29900, 17722, 29896, 29914, 29906, 11877, 29947, 29900,
         29922, 29946, 29900,  6778, 29946, 29900,    13,  2951,   323,  1041,
          3250, 29892,   540, 10398, 2

OverflowError: out of range integral type conversion attempted