In [2]:
!pip install -q torch transformers datasets accelerate peft evaluate sentencepiece safetensors

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
from dataclasses import dataclass
from typing import Optional
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

In [4]:
from datasets import Dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)

In [5]:
@dataclass
class Config:
  model_name: str = "gpt2"
  output_dir: str = "./loragpt2-output"
  per_device_train_batch_size: int = 4
  num_train_epochs: int = 3
  learning_rate: float = 2e-4
  weight_decay: float = 0.0
  fp16: bool = False
  lora_r: int = 8
  lora_alpha: int = 32
  lora_dropout: float = 0.1

  max_seq_length: int = 256


cfg = Config()

In [6]:
texts = [
    "Hello, my name is Sanoj and I love cats.",
    "Weather today: sunny with a chance of learning.",
    "Data science is about asking the right questions and checking assumptions.",
    "Fine-tuning language models with LoRA can be fast and cheap if done correctly."
]

In [7]:
dataset = Dataset.from_dict({"text": texts})

In [8]:
dataset

Dataset({
    features: ['text'],
    num_rows: 4
})

In [9]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [10]:
if tokenizer.pad_token_id is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [11]:
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '[PAD]'}, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [12]:
model = AutoModelForCausalLM.from_pretrained(cfg.model_name)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [13]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [14]:
model.get_input_embeddings().weight.shape[0]

50257

In [15]:
if tokenizer.pad_token_id is not None and model.get_input_embeddings().weight.shape[0]!= len(tokenizer):
  model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [16]:
lora_config = LoraConfig(
    r = cfg.lora_r,
    lora_alpha = cfg.lora_alpha,
    target_modules = ["c_aatn","c_proj"],
    lora_dropout= cfg.lora_dropout,
    bias = 'none',
    task_type = 'CAUSAL_LM'
)

In [17]:
model = get_peft_model(model, lora_config)



In [18]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50258, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): Conv1D(nf=2304, nx=768)
              (c_proj): lora.Linear(
                (base_layer): Conv1D(nf=768, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=768, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (

In [19]:
def tokenize(example):
  return tokenizer(example['text'], truncation=True, max_length= cfg.max_seq_length, padding="max_length")

In [20]:
tokenized = dataset.map(tokenize, batched=True, remove_columns=['text'])

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [21]:
tokenized

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 4
})

In [22]:
tokenized =  tokenized.map(lambda ex: {
    "labels": ex["input_ids"]
}, batched= False)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [23]:
tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4
})

In [None]:
tokenized['labels']

In [26]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm=False
)

In [27]:
training_args = TrainingArguments(
    output_dir = cfg.output_dir,
    num_train_epochs = cfg.num_train_epochs,
    per_device_train_batch_size = cfg.per_device_train_batch_size,
    learning_rate = cfg.learning_rate,
    weight_decay = cfg.weight_decay,
    fp16 = cfg.fp16,
    logging_steps = 10,
    save_total_limit = 2,
    save_strategy = "epoch",
    push_to_hub = False,
    report_to = "none"
)

In [28]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized,
    data_collator = data_collator,
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss




TrainOutput(global_step=3, training_loss=4.398261070251465, metrics={'train_runtime': 13.784, 'train_samples_per_second': 0.871, 'train_steps_per_second': 0.218, 'total_flos': 1577264873472.0, 'train_loss': 4.398261070251465, 'epoch': 3.0})

In [29]:
os.makedirs(cfg.output_dir, exist_ok=True)
model.save_pretrained(cfg.output_dir)
tokenizer.save_pretrained(cfg.output_dir)
print(f"Finished. Saved LoRA adapters and tokenizer to {cfg.output_dir}")




Finished. Saved LoRA adapters and tokenizer to ./loragpt2-output


In [31]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel


base = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
base.resize_token_embeddings(len(tokenizer))

model = PeftModel.from_pretrained(base, cfg.output_dir)

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [36]:
#inference
input_text = "Data science means"
inputs = tokenizer(input_text, return_tensors = "pt")
with torch.no_grad():
  out = model.generate(**inputs, max_length = 50)
print(tokenizer.decode(out[0],skip_special_tokens=True ))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Data science means that we can use the data to understand the world around us.

The data science approach is based on the idea that we can use data to understand the world around us.

The data science approach is based on the idea
