In [1]:
!pip install -qU datasets peft trl accelerate zstandard jsonlines

In [2]:
import sys
import logging

import datasets
from peft import LoraConfig
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset

In [3]:
training_config = {
    "bf16": True,
    "do_eval": False,
    "learning_rate": 3e-4,
    "log_level": "info",
    "logging_steps": 20,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 1,
    "max_steps": -1,
    "output_dir": "./qwen2",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 4,
    "per_device_train_batch_size": 4,
    "remove_unused_columns": True,
    "save_steps": 100,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs":{"use_reentrant": False},
    "gradient_accumulation_steps": 1,
    "warmup_ratio": 0.2,
    }

peft_config = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": "all-linear",
    "modules_to_save": None,
}
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)

In [4]:
checkpoint_path = "Qwen/Qwen2-0.5B-Instruct"
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map=None
)
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

In [14]:
def apply_chat_template(
    example,
    tokenizer,
):
    example["text"] = tokenizer.apply_chat_template(
        [{'role': 'user', 'content': example['instruction']+'\n'+example['input']}, {'role': 'assistant', 'content': example['output']}], tokenize=False, add_generation_prompt=False)
    return example

In [15]:
raw_dataset = load_dataset("lksy/ru_instruct_gpt4")['train'].train_test_split(test_size=0.1)
train_dataset = raw_dataset["train"]
test_dataset = raw_dataset["test"]
column_names = list(train_dataset.features)
column_names

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


['instruction', 'input', 'output', 'full_output']

In [16]:
processed_train_dataset = train_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to train",
)

processed_test_dataset = test_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to test",
)

  self.pid = os.fork()


Applying chat template to train (num_proc=10):   0%|          | 0/13550 [00:00<?, ? examples/s]

Applying chat template to test (num_proc=10):   0%|          | 0/1506 [00:00<?, ? examples/s]

In [18]:
trainer = SFTTrainer(
    model=model,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_test_dataset,
    max_seq_length=2048,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=True
)
train_result = trainer.train()
metrics = train_result.metrics
trainer.save_metrics("train", metrics)
trainer.save_state()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
PyTorch: setting up devices


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Using auto half precision backend
***** Running training *****
  Num examples = 1,112
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 278
  Number of trainable parameters = 8,798,208


Step,Training Loss
20,1.6164
40,1.49
60,1.4566
80,1.4481
100,1.42
120,1.4013
140,1.424
160,1.394
180,1.3847
200,1.3983


Saving model checkpoint to ./qwen2/checkpoint-100
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json
Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

tokenizer config file saved in ./qwen2/checkpoint-100/tokenizer_config.json
Special to

In [19]:
tokenizer.padding_side = 'left'
metrics = trainer.evaluate()
metrics["eval_samples"] = len(processed_test_dataset)
trainer.save_metrics("eval", metrics)

***** Running Evaluation *****
  Num examples = 124
  Batch size = 4


In [21]:
trainer.save_model('./Qwen2-0.5B-Instruct-ru-lora')

Saving model checkpoint to ./Qwen2-0.5B-Instruct-ru-lora
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json
Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

tokenizer config file saved in ./Qwen2-0.5B-Instruct-ru-lora/tokenizer_config.j

In [22]:
trainer.push_to_hub('Qwen2-0.5B-Instruct-ru-lora', token='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')

Saving model checkpoint to ./qwen2
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json
Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

tokenizer config file saved in ./qwen2/tokenizer_config.json
Special tokens file saved in ./qwen2/spe

events.out.tfevents.1717784686.7c8c7a5b99aa.1602.3:   0%|          | 0.00/359 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

events.out.tfevents.1717783807.7c8c7a5b99aa.1602.1:   0%|          | 0.00/359 [00:00<?, ?B/s]

events.out.tfevents.1717779205.7c8c7a5b99aa.1602.0:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

events.out.tfevents.1717784106.7c8c7a5b99aa.1602.2:   0%|          | 0.00/8.21k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sikoraaxd/qwen2/commit/812a61a9abe87d1e7fa59ae4f0e78ea028696c29', commit_message='Qwen2-0.5B-Instruct-ru-lora', commit_description='', oid='812a61a9abe87d1e7fa59ae4f0e78ea028696c29', pr_url=None, pr_revision=None, pr_num=None)