# Introduction

* Datasets:
    * https://huggingface.co/datasets/tatsu-lab/alpaca?row=1
* Models:
    * https://huggingface.co/facebook/opt-125m
 
***Note:*** *Instead of using the dataset's prebuilt `text` column, we format each sample manually and pass the formatting function to `SFTTrainer` through its `formatting_func` argument.*

In [1]:
!pip install -U accelerate peft bitsandbytes transformers trl datasets



In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

## Configuration

In [3]:
# Per-device batch size, used for both training and evaluation.
batch_size = 2
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to 0 so the dataloader always receives a valid integer.
num_workers = os.cpu_count() or 0
# Exactly one of `max_steps` / `epochs` must be -1, the other positive:
#   max_steps = -1 -> epoch-wise training, controlled by `epochs`.
#   epochs = -1    -> step-wise training, controlled by `max_steps`.
max_steps = -1
epochs = 3
# Mixed-precision switches passed to the training arguments.
bf16 = True
fp16 = False
gradient_accumulation_steps = 16
# Maximum sequence length handed to the SFT trainer.
seq_length = 1024
logging_steps = 50
# Checkpoint interval; only used for step-wise training.
save_steps = 50
learning_rate = 0.0002
model_name = 'facebook/opt-125m'
out_dir = 'outputs/opt_125m_alpaca_sft'

## Load Dataset

In [4]:
dataset = load_dataset('tatsu-lab/alpaca')

In [5]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})


In [6]:
print(dataset['train']['text'][0])

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Give three tips for staying healthy.

### Response:
1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. 
2. Exercise regularly to keep your body active and strong. 
3. Get enough sleep and maintain a consistent sleep schedule.


In [7]:
print(dataset['train'][0])

{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}


In [8]:
# Hold out 5% of the samples for validation. The fixed seed makes the shuffled
# split identical across runs, so validation losses stay comparable.
full_dataset = dataset['train'].train_test_split(
    test_size=0.05,
    shuffle=True,
    seed=42,
)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 49401
})
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 2601
})


In [9]:
# Preview the first ten training samples: the raw record first, then the
# same record rendered with the Instruction / Input / Response headers.
for i in range(10):
    text = dataset_train[i]
    print(text)
    print('****************')

    sections = [
        '### Instruction:\n' + text['instruction'],
        '### Input:\n' + text['input'],
        '### Response:\n' + text['output'],
    ]
    # Sections are separated by one blank line.
    final_text = '\n\n'.join(sections)
    print(final_text)
    print('#' * 50)

{'instruction': 'Write a function that takes an array of numbers and sorts them in ascending order.', 'input': '[5, 2, 9, 7, 4, 1]', 'output': 'function sortAscending(arr) { \n    return arr.sort((a, b) => a-b); \n};', 'text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWrite a function that takes an array of numbers and sorts them in ascending order.\n\n### Input:\n[5, 2, 9, 7, 4, 1]\n\n### Response:\nfunction sortAscending(arr) { \n    return arr.sort((a, b) => a-b); \n};'}
****************
### Instruction:
Write a function that takes an array of numbers and sorts them in ascending order.

### Input:
[5, 2, 9, 7, 4, 1]

### Response:
function sortAscending(arr) { 
    return arr.sort((a, b) => a-b); 
};
##################################################
{'instruction': 'Describe one popular feature of the latest iOS operating system.', 'input': '', '

In [10]:
def preprocess_function(example):
    """
    Format one dataset record as a single prompt string for the SFT API.

    Reads the 'instruction', 'input' and 'output' keys of `example` and
    places them under '### Instruction:', '### Input:' and '### Response:'
    headers, with a blank line between sections. The Input section is
    emitted even when the input is empty. Returns one string, not a list.
    """
    sections = (
        f"### Instruction:\n{example['instruction']}",
        f"### Input:\n{example['input']}",
        f"### Response:\n{example['output']}",
    )
    return "\n\n".join(sections)

## Model

In [11]:
# Load the pretrained weights once; cast them to bfloat16 only when bf16
# training is enabled, otherwise keep the model exactly as loaded.
model = AutoModelForCausalLM.from_pretrained(model_name)
if bf16:
    model = model.to(dtype=torch.bfloat16)

  return self.fget.__get__(instance, owner)()


In [12]:
print(model)
# Gather (element count, requires_grad) for every parameter in one pass.
param_stats = [(p.numel(), p.requires_grad) for p in model.parameters()]

# Total parameters and trainable parameters.
total_params = sum(count for count, _ in param_stats)
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    count for count, trainable in param_stats if trainable
)
print(f"{total_trainable_params:,} training parameters.")

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), ep

## Tokenizer

In [13]:
# Load the tokenizer that belongs to `model_name`.
# `trust_remote_code` is deliberately left at its default: enabling it lets
# code hosted in the model repository run locally, and the inference section
# of this file loads the saved tokenizer without it as well.
# `use_fast=False` is kept from the original setup.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
)

## Training

In [14]:
# Arguments shared by the epoch-wise and the step-wise schedules.
common_args = dict(
    output_dir=f"{out_dir}/logs",
    weight_decay=0.01,
    load_best_model_at_end=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='steps',
    logging_steps=logging_steps,
    save_total_limit=2,
    bf16=bf16,
    fp16=fp16,
    report_to='tensorboard',
    dataloader_num_workers=num_workers,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    lr_scheduler_type='constant',
)

if max_steps == -1 and epochs > 0:
    # Epoch-wise training: evaluate and checkpoint once per epoch.
    schedule_args = dict(
        evaluation_strategy='epoch',
        save_strategy='epoch',
        num_train_epochs=epochs,
    )
elif max_steps > 0 and epochs == -1:
    # Step-wise training: evaluate and checkpoint by step count.
    schedule_args = dict(
        evaluation_strategy='steps',
        save_strategy='steps',
        save_steps=save_steps,
        max_steps=max_steps,
    )
else:
    # Fail here with a clear message instead of leaving `training_args`
    # undefined and hitting a NameError when the trainer is built.
    raise ValueError(
        "Exactly one of `max_steps` and `epochs` must be -1 and the other "
        f"must be positive (got max_steps={max_steps}, epochs={epochs})."
    )

training_args = TrainingArguments(**common_args, **schedule_args)

In [15]:
# Build the supervised fine-tuning trainer. Every record is rendered to text
# by `preprocess_function`; packing is enabled, and the decoded batches shown
# further down contain several samples joined in one sequence.
trainer = SFTTrainer(
    # Model and tokenizer.
    model=model,
    tokenizer=tokenizer,
    # Training and validation splits.
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    # Text formatting and sequence construction.
    formatting_func=preprocess_function,
    max_seq_length=seq_length,
    packing=True,
    # Optimisation, logging and checkpoint settings.
    args=training_args,
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
# Decode the first sequence of each of the first six training batches to
# check what the model actually receives.
dataloader = trainer.get_train_dataloader()
for _, sample in zip(range(6), dataloader):
    first_sequence = sample['input_ids'][0]
    print(tokenizer.decode(first_sequence))
    print('#' * 50)

### Instruction:
Take the given sentence and sort its words in alphabetical order.

### Input:
Sentence: The world is a beautiful place

### Response:
A beautiful place The world is.</s></s>### Instruction:
Compare a laptop and a smartphone.

### Input:


### Response:
A laptop is typically larger than a smartphone, and has more storage, processing power, and features such as a built-in keyboard, mouse, and larger screen. Smartphones are more portable, have longer battery life, and have access to a variety of applications depending on the phone.</s></s>### Instruction:
Describe a moral problem related to AI.

### Input:


### Response:
One moral problem related to AI is the potential for AI systems to be biased against certain demographics. AI systems often rely on training data that is not representative of real-world demographics, which can lead to poor outcomes when applied to a more diverse population. This can lead to ethical issues related to fairness, accuracy, privacy, and safe

In [17]:
history = trainer.train()

Epoch,Training Loss,Validation Loss
0,1.8892,1.813107
1,1.6901,1.782268
2,1.5633,1.77977


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [18]:
# Store the model and the tokenizer in the same directory; the inference
# section reloads both from that single path.
best_model_dir = f"{out_dir}/best_model"
model.save_pretrained(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

('outputs/opt_125m_alpaca_sft/best_model/tokenizer_config.json',
 'outputs/opt_125m_alpaca_sft/best_model/special_tokens_map.json',
 'outputs/opt_125m_alpaca_sft/best_model/vocab.json',
 'outputs/opt_125m_alpaca_sft/best_model/merges.txt',
 'outputs/opt_125m_alpaca_sft/best_model/added_tokens.json')

## Inference

In [1]:
from transformers import (
    AutoModelForCausalLM, 
    logging, 
    pipeline,
    AutoTokenizer
)

In [2]:
# Reload the fine-tuned model and its tokenizer from the saved directory.
best_model_dir = 'outputs/opt_125m_alpaca_sft/best_model/'
model = AutoModelForCausalLM.from_pretrained(best_model_dir)
tokenizer = AutoTokenizer.from_pretrained(best_model_dir)

  return self.fget.__get__(instance, owner)()


In [3]:
print(tokenizer.eos_token)

</s>


In [4]:
# logging.set_verbosity(logging.CRITICAL)

In [5]:
# Text-generation pipeline around the reloaded model and tokenizer.
pipe = pipeline(
    task='text-generation',
    # Components.
    tokenizer=tokenizer,
    model=model,
    # Generation settings: length cap and the tokenizer's end-of-sequence id.
    eos_token_id=tokenizer.eos_token_id,
    max_length=256,
)

In [6]:
# Inference prompt laid out like the training text produced by
# `preprocess_function`: the same three section headers, with the Input
# section left empty and the Response section left open for the model.
prompt = """### Instruction:
Tell me a story where a unicorn is the hero.

### Input:


### Response:
"""

In [7]:
print(prompt)

### Instruction:
Tell me a story where a unicorn is the hero.

### Input:


### Response:



In [8]:
# Run generation with a mild repetition penalty, then print the text of the
# first returned sequence.
result = pipe(prompt, repetition_penalty=1.1)
generated_text = result[0]['generated_text']
print(generated_text)

### Instruction:
Tell me a story where a unicorn is the hero.

### Input:


### Response:
A unicorn is the hero of a story about a unicorn. The story is told in the form of a poem, which is written by a unicorn. The poem is inspired by the poem and the hero's journey through the world. The hero is a unicorn who has been chosen to be the hero of the story. The hero is a unicorn who has been chosen to be the hero of the story. The hero is a unicorn who has been chosen to be the hero of the story. The hero is a unicorn who has been chosen to be the hero of the story. The hero is a unicorn who has been chosen to be the hero of the story. The hero is a unicorn who has been chosen to be the hero of the story. The hero is a unicorn who has been chosen to be the hero of the story. The hero is a unicorn who has been chosen to be the hero of the story. The hero is a unicorn who has been chosen to be the hero of the story. The hero is a unicorn who has been chosen to be the hero of the story. The