# Introduction

* Datasets:
    * https://huggingface.co/datasets/flpelerin/ChatAlpaca-10k
* Models:
    * https://huggingface.co/facebook/opt-350m
 
***Note:*** *We train a OPT 350M model using a custom chat template. OPT 350M does not contain a chat template by default.*

In [1]:
!pip install -U accelerate peft bitsandbytes transformers trl datasets

Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting transformers
  Downloading transformers-4.40.0-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.6/137.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.8.5-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.3-py3-none-any.whl.metadata (7.9 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading peft-0.10.0-py3-none-any.whl (199 kB)


In [2]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

2024-04-19 17:16:56.003609: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-19 17:16:56.003713: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-19 17:16:56.137386: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Configuration

In [3]:
batch_size = 1
num_workers = os.cpu_count()
# max_steps = -1 for epoch-wise training.
# epochs = -1 for step-wise training.
# Both cannot be -1.
max_steps = -1
epochs = 5
bf16 = False
fp16 = True
gradient_accumulation_steps = 32
seq_length = 1024
logging_steps = 50
save_steps = 50
learning_rate = 0.0002
model_name = 'facebook/opt-350m'
out_dir = 'outputs/opt_350m_chat_alpaca'
seed = 42

## Load Dataset

In [4]:
dataset = load_dataset('flpelerin/ChatAlpaca-10k')
# dataset = load_dataset('robinsmits/ChatAlpaca-20K')

Downloading readme:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [5]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'conversations'],
        num_rows: 10000
    })
})


In [6]:
print(dataset['train']['conversations'][0])

[{'from': 'human', 'value': 'Find the product of the numbers: 5 and 8'}, {'from': 'gpt', 'value': 'The product of 5 and 8 is 40.'}, {'from': 'human', 'value': 'What is the sum of the numbers 6 and 12?'}, {'from': 'gpt', 'value': 'The sum of the numbers 6 and 12 is 18.'}, {'from': 'human', 'value': 'Can you tell me the quotient of 20 and 5?'}, {'from': 'gpt', 'value': 'Yes, the quotient of 20 and 5 is 4.'}, {'from': 'human', 'value': 'What is the difference between 25 and 13?'}, {'from': 'gpt', 'value': 'The difference between 25 and 13 is 12.'}, {'from': 'human', 'value': 'What is the square of 9?'}, {'from': 'gpt', 'value': 'The square of 9 is 81.'}, {'from': 'human', 'value': 'What is the cube of 6?'}, {'from': 'gpt', 'value': 'The cube of 6 is 216.'}]


In [7]:
print(dataset['train'][0])

{'id': '0', 'conversations': [{'from': 'human', 'value': 'Find the product of the numbers: 5 and 8'}, {'from': 'gpt', 'value': 'The product of 5 and 8 is 40.'}, {'from': 'human', 'value': 'What is the sum of the numbers 6 and 12?'}, {'from': 'gpt', 'value': 'The sum of the numbers 6 and 12 is 18.'}, {'from': 'human', 'value': 'Can you tell me the quotient of 20 and 5?'}, {'from': 'gpt', 'value': 'Yes, the quotient of 20 and 5 is 4.'}, {'from': 'human', 'value': 'What is the difference between 25 and 13?'}, {'from': 'gpt', 'value': 'The difference between 25 and 13 is 12.'}, {'from': 'human', 'value': 'What is the square of 9?'}, {'from': 'gpt', 'value': 'The square of 9 is 81.'}, {'from': 'human', 'value': 'What is the cube of 6?'}, {'from': 'gpt', 'value': 'The cube of 6 is 216.'}]}


In [8]:
print(type(dataset['train']['conversations']))

<class 'list'>


In [9]:
full_dataset = dataset['train'].train_test_split(test_size=0.05, shuffle=True, seed=seed)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']
 
print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['id', 'conversations'],
    num_rows: 9500
})
Dataset({
    features: ['id', 'conversations'],
    num_rows: 500
})


In [10]:
# Prepare data with chat template.
chat_dataset_train = Dataset.from_dict({
    'chat': [x for x in dataset_train['conversations']]
})
chat_dataset_valid = Dataset.from_dict({
    'chat': [x for x in dataset_valid['conversations']]
})

In [11]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    trust_remote_code=True,
    use_fast=False
)

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [12]:
print(tokenizer.pad_token, tokenizer.eos_token)

<pad> </s>


In [13]:
tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['from'] == 'human') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['from'] == 'human' %}{{ '[INST] ' + message['value'] + ' [/INST]' }}{% elif message['from'] == 'gpt' %}{{ message['value'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"

In [14]:
chat_dataset_train = chat_dataset_train.map(
    lambda x: {'formatted_chat': tokenizer.apply_chat_template(
        x['chat'], tokenize=False, add_generation_prompt=False
    )}
)

chat_dataset_valid = chat_dataset_valid.map(
    lambda x: {'formatted_chat': tokenizer.apply_chat_template(
        x['chat'], tokenize=False, add_generation_prompt=False
    )}
)

Map:   0%|          | 0/9500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [15]:
print(chat_dataset_train['formatted_chat'][0])

</s>[INST] To what type of sentence does the word "once" belong? [/INST]The word 'once' is an adverb and often appears at the beginning of an adverbial clause indicating a single occurrence in the past.</s> [INST] What are some other adverbs that can appear at the beginning of an adverbial clause? [/INST]Here are some other adverbs that can appear at the beginning of an adverbial clause:

1. After
2. Although
3. As
4. Because
5. Before
6. If
7. Since
8. Than
9. Though
10. Until
11. When
12. Whenever
13. Where
14. While

These are just a few examples, there are many other adverbs that can be used to introduce an adverbial clause.</s> [INST] Can you give me an example of a sentence that uses "if" as an adverb at the beginning of an adverbial clause? [/INST]Sure, here's an example sentence that uses "if" as an adverb at the beginning of an adverbial clause:

"If you don't hurry, you'll miss your train."

In this sentence, the adverbial clause "if you don't hurry" is introduced by the adve

## Model

In [16]:
if bf16:
    model = AutoModelForCausalLM.from_pretrained(model_name).to(dtype=torch.bfloat16)
else:
    model = AutoModelForCausalLM.from_pretrained(model_name)

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [17]:
print(model)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=409

## Training

In [18]:
if max_steps == -1 and epochs > 0:
    training_args = TrainingArguments(
        output_dir=f"{out_dir}/logs",
        evaluation_strategy='epoch',
        weight_decay=0.01,
        load_best_model_at_end=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        logging_strategy='steps',
        save_strategy='epoch',
        logging_steps=logging_steps,
        num_train_epochs=epochs,
        save_total_limit=2,
        bf16=bf16,
        fp16=fp16,
        report_to='tensorboard',
        dataloader_num_workers=num_workers,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        lr_scheduler_type='constant',
        seed=seed
    )

if max_steps > 0 and epochs == -1:
    training_args = TrainingArguments(
        output_dir=f"{out_dir}/logs",
        evaluation_strategy='steps',
        weight_decay=0.01,
        load_best_model_at_end=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        logging_strategy='steps',
        save_strategy='steps',
        logging_steps=logging_steps,
        save_steps=save_steps,
        save_total_limit=2,
        bf16=bf16,
        fp16=fp16,
        report_to='tensorboard',
        max_steps=max_steps,
        dataloader_num_workers=num_workers,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        lr_scheduler_type='constant',
        seed=seed
    )

In [19]:
trainer = SFTTrainer(
    model=model,
    train_dataset=chat_dataset_train,
    eval_dataset=chat_dataset_valid,
    max_seq_length=seq_length,
    tokenizer=tokenizer,
    args=training_args,
    packing=False,
    dataset_text_field='formatted_chat'
)

Map:   0%|          | 0/9500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [20]:
dataloader = trainer.get_train_dataloader()
for i, sample in enumerate(dataloader):
    print(tokenizer.decode(sample['input_ids'][0]))
    print('#'*50)
    if i == 5:
        break

</s></s>[INST] Identify 5 ethical concerns that using a GPT-3 driven chatbot raises. [/INST]1. Privacy: GPT-3 driven chatbots might collect user data without the knowledge or consent of users.
2. Bias: GPT-3 models are trained on language datasets that may contain biased information.
3. Accuracy: GPT-3 chatbots may not always provide accurate responses.
4. Misleading: GPT-3 chatbots could be used to spread false information or to deceive users.
5. Legal: GPT-3 driven chatbots could violate laws and regulations if they are used in areas where legal compliance is required.</s> [INST] Can you give an example of how GPT-3 chatbots can violate laws and regulations? [/INST]Yes, here's an example:

Let's say a company uses a GPT-3 chatbot to interact with customers regarding their personal financial information, such as bank account details, credit card numbers, and investments. However, if the chatbot fails to meet industry-specific regulations and data protection laws, it could expose users

In [21]:
history = trainer.train()

Epoch,Training Loss,Validation Loss
0,1.8202,1.719334
1,1.5679,1.702485
2,1.3752,1.712733
3,1.1998,1.758315
4,1.084,1.801564


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [22]:
model.save_pretrained(f"{out_dir}/best_model")
tokenizer.save_pretrained(f"{out_dir}/best_model")

('outputs/opt_350m_chat_alpaca/best_model/tokenizer_config.json',
 'outputs/opt_350m_chat_alpaca/best_model/special_tokens_map.json',
 'outputs/opt_350m_chat_alpaca/best_model/vocab.json',
 'outputs/opt_350m_chat_alpaca/best_model/merges.txt',
 'outputs/opt_350m_chat_alpaca/best_model/added_tokens.json')

## Inference

In [23]:
from transformers import (
    AutoModelForCausalLM, 
    logging, 
    pipeline,
    AutoTokenizer
)

In [24]:
model = AutoModelForCausalLM.from_pretrained('outputs/opt_350m_chat_alpaca/best_model/')
tokenizer = AutoTokenizer.from_pretrained('outputs/opt_350m_chat_alpaca/best_model/')

In [25]:
print(tokenizer.eos_token)

</s>


In [26]:
# logging.set_verbosity(logging.CRITICAL)

In [27]:
pipe = pipeline(
    task='text-generation', 
    model=model, 
    tokenizer=tokenizer, 
    max_length=256,
    eos_token_id=tokenizer.eos_token_id
)

In [28]:
prompt = """[INST]Hello. Who are you?[/INST]
"""

In [29]:
print(prompt)

[INST]Hello. Who are you?[/INST]



In [30]:
result = pipe(
    prompt,
    repetition_penalty=1.1
)
print(result[0]['generated_text'])

[INST]Hello. Who are you?[/INST]
I am a software engineer.


In [31]:
prompt = """[INST]Write Python code for merge sort.[/INST]
"""

result = pipe(
    prompt,
    repetition_penalty=1.1
)
print(result[0]['generated_text'])

[INST]Write Python code for merge sort.[/INST]

for i in range(1, 10):
     print("i is in range(1, 10)) [/INST]for i in range(1, 10):
     print("i is in range(1, 10))


In [32]:
!zip -r outputs outputs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: outputs/ (stored 0%)
  adding: outputs/opt_350m_chat_alpaca/ (stored 0%)
  adding: outputs/opt_350m_chat_alpaca/logs/ (stored 0%)
  adding: outputs/opt_350m_chat_alpaca/logs/runs/ (stored 0%)
  adding: outputs/opt_350m_chat_alpaca/logs/runs/Apr19_17-17-27_f65483054bf9/ (stored 0%)
  adding: outputs/opt_350m_chat_alpaca/logs/runs/Apr19_17-17-27_f65483054bf9/events.out.tfevents.1713547127.f65483054bf9.24.0 (deflated 64%)
  adding: outputs/opt_350m_chat_alpaca/logs/checkpoint-1480/ (stored 0%)
  adding: outputs/opt_350m_chat_alpaca/logs/checkpoint-1480/generation_config.json (deflated 30%)
  adding: outputs/opt_350m_chat_alpaca/logs/checkpoint-1480/vocab.json (deflated 68%)
  adding: outputs/opt_350m_chat_alpaca/logs/checkpoint-1480/rng_state.pth (deflated 25%)
  adding: outputs/opt_350m_chat_alpaca/logs/checkpoint-1480/optimizer.pt (deflated 9%)
  adding: outputs/opt_350m_chat_alpaca/logs/checkpoint-1480/scheduler.pt (deflated 57%)
  adding: outputs/opt_350m_chat_al