In [None]:
!pip install transformers
!pip install peft
!pip install torch

Collecting peft
  Downloading peft-0.9.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate, peft
Successfully installed accelerate-0.27.2 peft-0.9.0


In [None]:
from dataclasses import dataclass, field
from typing import Dict, List, Optional
import transformers
import torch
from torch.utils.data import Dataset
from transformers.trainer_pt_utils import LabelSmoother

# Label id written into the training targets for positions that must not be
# learned from (prompt tokens and padding, see `preprocess`).
IGNORE_TOKEN = LabelSmoother.ignore_index

In [None]:
@dataclass
class ModelArguments:
  """Command-line options that select the base checkpoint."""

  # Hub id or local directory of the model to fine-tune.
  model_name_or_path: Optional[str] = "Qwen/Qwen1.5-0.5B"

@dataclass
class DataArguments:
  """Locations of the dataset files.

  Every path defaults to ``None`` ("not provided"), so the fields are declared
  ``Optional[str]`` rather than ``str``.
  """

  train_path: Optional[str] = field(default=None, metadata={"help": "Path to training dataset"})
  test_path: Optional[str] = field(default=None, metadata={"help": "Path to test dataset"})
  valid_path: Optional[str] = field(default=None, metadata={"help": "Path to validation dataset"})

@dataclass
class TrainingArguments(transformers.TrainingArguments):
  """``transformers.TrainingArguments`` extended with the options this notebook adds."""

  # Directory passed as `cache_dir` when downloading config / tokenizer files.
  cache_dir: Optional[str] = field(default=None)
  # Must be a valid optimizer name: the parent class converts this value to its
  # optimizer enum during initialisation, so `None` is not an acceptable default.
  optim: str = field(default="adamw_torch")
  # When True, train() wraps the model in a LoRA adapter.
  use_lora: bool = False
  model_max_length: int = field(default=32768, metadata={"help": "Max sequence length"})

@dataclass
class LoraArguments:
  """Hyper-parameters of the LoRA adapter built in train()."""

  lora_r: int = 64
  lora_alpha: int = 16
  # A probability, hence float: with an `int` annotation the argument parser
  # would refuse fractional values such as 0.1 on the command line.
  lora_dropout: float = 0.05
  # Projection layers of the Qwen1.5 (Qwen2 architecture) blocks. The earlier
  # names ("c_attn", "c_proj", "w1", "w2") belong to the first-generation Qwen
  # models and match no module of the default Qwen1.5 checkpoint.
  lora_target_modules: List[str] = field(
      default_factory=lambda: ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"]
  )
  lora_weight_path: str = ""
  lora_bias: str = "none"
  # When True (together with use_lora) the base model is loaded 4-bit GPTQ quantized.
  q_lora: bool = False

In [None]:
# Notebook-level configuration. The names mirror the fields of ModelArguments,
# DataArguments, TrainingArguments and LoraArguments defined above.

# model args
model_name_or_path: Optional[str] = "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4"

# data args
train_path: str = "./train.jsonl"
test_path: str = "./test.jsonl"
valid_path: str = "./valid.jsonl"

# training args
output_dir = "./results"

num_train_epochs = 5
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 16

evaluation_strategy = "no"
save_strategy = "steps"
save_steps = 1000
save_total_limit = 10

learning_rate = 1e-5
weight_decay = 0.1
adam_beta2 = 0.95
warmup_ratio = 0.01
lr_scheduler_type = "cosine"

logging_steps = 1
report_to = "none"
model_max_length = 512

# LoRA args
lora_r: int = 64
lora_alpha: int = 16
# Dropout is a probability, so the annotation is float (it was declared int).
lora_dropout: float = 0.05
# Layer names of the Qwen1.5 (Qwen2 architecture) checkpoint selected above;
# "c_attn"/"c_proj"/"w1"/"w2" only exist in first-generation Qwen models.
lora_target_modules: List[str] = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"]
lora_weight_path: str = ""
lora_bias: str = "none"
q_lora: bool = False

In [None]:
def preprocess(sources, tokenizer: transformers.PreTrainedTokenizer, max_len: int, system_message: str = "You are a helpful assistant.") -> Dict:
  """Turn conversations into fixed-length ChatML training tensors.

  Every conversation is rendered as a system turn followed by its user /
  assistant turns, each wrapped as ``<|im_start|>role\\n ... <|im_end|>\\n``.
  In the labels only the assistant replies (plus the turn delimiters) keep
  their token ids; everything else, including padding, is IGNORE_TOKEN.

  Args:
    sources: iterable of conversations; a conversation is a list of dicts with
      a "from" key ("user" or "assistant") and a "value" key (the text).
    tokenizer: tokenizer whose call result exposes ``.input_ids``. The ids of
      ``<|im_start|>`` / ``<|im_end|>`` are read from ``im_start_id`` /
      ``im_end_id`` when the tokenizer has them, otherwise looked up by token.
    max_len: every sequence is padded or truncated to exactly this length.
    system_message: text of the system turn prepended to each conversation.

  Returns:
    dict with ``input_ids``, ``labels`` (both ``torch.int``) and a boolean
    ``attention_mask`` that is False wherever ``input_ids`` equals the pad id.

  Raises:
    ValueError: the tokenizer has no pad token id.
    NotImplementedError: a turn has a role other than "user" / "assistant".
  """
  roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}

  def _special_id(attr, token):
    # First-generation Qwen tokenizers expose the id as an attribute; other
    # tokenizers only know the delimiter by its token string.
    token_id = getattr(tokenizer, attr, None)
    if token_id is None:
      token_id = tokenizer.convert_tokens_to_ids(token)
    return token_id

  pad_id = tokenizer.pad_token_id
  if pad_id is None:
    raise ValueError("tokenizer.pad_token_id must be set before calling preprocess")

  im_start = _special_id("im_start_id", "<|im_start|>")
  im_end = _special_id("im_end_id", "<|im_end|>")
  new_line_token = tokenizer('\n').input_ids
  _system = tokenizer('system').input_ids + new_line_token

  # The system turn is the same for every conversation, so build it once.
  # `.input_ids` is required: the tokenizer call itself does not return a list.
  system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + new_line_token
  # The "-3" / "-2" offsets here and in the loop count <|im_start|>, <|im_end|>
  # and ONE newline token; the length asserts catch a tokenizer that splits "\n".
  system_target = [im_start] + [IGNORE_TOKEN] * (len(system)-3) + [im_end] + new_line_token
  assert len(system) == len(system_target)

  # Role prefixes do not depend on the sentence, tokenize them once.
  role_ids = {role: tokenizer(role).input_ids for role in roles.values()}

  input_ids, targets = [], []
  for source in sources:
    # A conversation must start with the user; drop a leading turn of any other
    # role (`.get` so an unknown role such as "system" is dropped too).
    if source and roles.get(source[0]["from"]) != roles["user"]:
      source = source[1:]

    input_id, target = list(system), list(system_target)
    for sentence in source:
      role = roles.get(sentence["from"])
      if role is None:
        raise NotImplementedError(f"Unsupported role: {sentence['from']!r}")
      role_id = role_ids[role]
      _input_id = role_id + new_line_token + tokenizer(sentence["value"]).input_ids + [im_end] + new_line_token
      if role == roles["user"]:
        # User text is context only: mask everything between the delimiters.
        _target = [im_start] + [IGNORE_TOKEN] * (len(_input_id)-3) + [im_end] + new_line_token
      else:
        # Assistant turn: mask the role header, keep the reply as the target.
        _target = [im_start] + [IGNORE_TOKEN] * len(role_id) + _input_id[len(role_id)+1:-2] + [im_end] + new_line_token
      input_id += _input_id
      target += _target
    assert len(input_id) == len(target)

    # Right-pad up to max_len (a negative count pads nothing), then truncate.
    missing = max_len - len(input_id)
    input_id += [pad_id] * missing
    target += [IGNORE_TOKEN] * missing
    input_ids.append(input_id[:max_len])
    targets.append(target[:max_len])

  input_ids = torch.tensor(input_ids, dtype=torch.int)
  targets = torch.tensor(targets, dtype=torch.int)

  return dict(input_ids=input_ids, labels=targets, attention_mask=input_ids.ne(pad_id))

In [None]:
class SupervisedDataset(Dataset):
  """Map-style dataset holding the tensors produced by `preprocess`.

  All examples are tokenized once in the constructor; indexing returns one row
  of each tensor.
  """

  def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
    """Tokenize every example of `raw_data`.

    Args:
      raw_data: iterable of examples, each with a "conversations" entry.
      tokenizer: tokenizer forwarded to `preprocess`.
      max_len: sequence length forwarded to `preprocess`.
    """
    super(SupervisedDataset, self).__init__()

    sources = [example["conversations"] for example in raw_data]
    data_dict = preprocess(sources, tokenizer, max_len)

    self.input_ids = data_dict["input_ids"]
    self.labels = data_dict["labels"]
    self.attention_mask = data_dict["attention_mask"]

  # These two must be methods of the class (not functions nested inside
  # __init__), otherwise the dataset has neither a length nor indexing.
  def __len__(self):
    """Number of examples."""
    return len(self.input_ids)

  def __getitem__(self, i) -> Dict[str, torch.Tensor]:
    """Return example `i` under the keys `input_ids`, `labels` and `attention_mask`."""
    return dict(input_ids=self.input_ids[i], labels=self.labels[i], attention_mask=self.attention_mask[i])


In [None]:
import json
import os

In [None]:
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args, max_len) -> Dict:
  """Build the training and (optional) test datasets.

  Args:
    tokenizer: tokenizer handed to SupervisedDataset.
    data_args: object with `train_path` and `test_path` attributes. A path
      ending in ".jsonl" is read as one JSON object per line, any other path
      as a single JSON document.
    max_len: sequence length handed to SupervisedDataset.

  Returns:
    dict with `train_dataset` and `test_dataset`; the latter is None when
    `data_args.test_path` is empty.
  """
  def _load_records(path):
    # The context manager closes the file handle, which a bare open() leaves open.
    with open(path, "r", encoding="utf-8") as handle:
      if str(path).endswith(".jsonl"):
        return [json.loads(line) for line in handle if line.strip()]
      return json.load(handle)

  train_dataset = SupervisedDataset(_load_records(data_args.train_path), tokenizer=tokenizer, max_len=max_len)

  if data_args.test_path:
    test_dataset = SupervisedDataset(_load_records(data_args.test_path), tokenizer=tokenizer, max_len=max_len)
  else:
    test_dataset = None

  return dict(train_dataset=train_dataset, test_dataset=test_dataset)

In [None]:
from transformers import Trainer, GPTQConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [None]:
def train(args=None):
  """Fine-tune a causal language model, optionally through a LoRA adapter.

  Steps: parse the four argument dataclasses, load config / model / tokenizer,
  wrap the model with LoRA when `use_lora` is set, build the datasets and run
  the Hugging Face `Trainer`. Also sets the module-level `local_rank`.

  Args:
    args: where the argument values come from.
      * None (default): parse the process command line, as before.
      * dict: field name -> value, handed to `HfArgumentParser.parse_dict`.
      * list of str: parsed as if it were the command line.
      Inside a notebook pass a dict or a list, because `sys.argv` belongs to
      the kernel rather than to this script.
  """
  global local_rank

  parser = transformers.HfArgumentParser(
      (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
  )
  if isinstance(args, dict):
    parsed = parser.parse_dict(args)
  else:
    parsed = parser.parse_args_into_dataclasses(args=args)
  model_args, data_args, training_args, lora_args = parsed

  local_rank = training_args.local_rank

  device_map = "auto"

  # chat model, skipping other setups
  is_chat_model = True

  config = transformers.AutoConfig.from_pretrained(
      model_args.model_name_or_path,
      cache_dir=training_args.cache_dir,
  )

  # The generation cache is of no use while training.
  config.use_cache = False

  if training_args.use_lora and lora_args.q_lora:
    quantization_config = GPTQConfig(bits=4, disable_exllama=True)
  else:
    quantization_config = None

  model = transformers.AutoModelForCausalLM.from_pretrained(
      model_args.model_name_or_path,
      config=config,
      cache_dir=training_args.cache_dir,
      device_map=device_map,
      quantization_config=quantization_config,
      # from_pretrained rejects a device_map combined with low_cpu_mem_usage=False.
      low_cpu_mem_usage=True,
  )

  tokenizer = transformers.AutoTokenizer.from_pretrained(
      model_args.model_name_or_path,
      cache_dir=training_args.cache_dir,
      model_max_length=training_args.model_max_length,
      padding_side="right",
      use_fast=False
  )
  # `eod_id` only exists on first-generation Qwen tokenizers. Use it when it is
  # there; otherwise keep the tokenizer's own pad token, or fall back to EOS.
  eod_id = getattr(tokenizer, "eod_id", None)
  if eod_id is not None:
    tokenizer.pad_token_id = eod_id
  elif tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

  if training_args.use_lora:
    if lora_args.q_lora or is_chat_model:
      modules_to_save = None
    else:
      modules_to_save = ["wte", "lm_head"]
    lora_config = LoraConfig(
        r=lora_args.lora_r,
        lora_alpha=lora_args.lora_alpha,
        target_modules=lora_args.lora_target_modules,
        lora_dropout=lora_args.lora_dropout,
        bias=lora_args.lora_bias,
        task_type="CAUSAL_LM",
        modules_to_save=modules_to_save  # This argument serves for adding new tokens.
    )
    if lora_args.q_lora:
      model = prepare_model_for_kbit_training(
          model, use_gradient_checkpointing=training_args.gradient_checkpointing
      )

    model = get_peft_model(model, lora_config)

    # Print peft trainable params
    model.print_trainable_parameters()

    if training_args.gradient_checkpointing:
      model.enable_input_require_grads()

  # Everything below runs whether or not LoRA is enabled; it used to sit inside
  # the `use_lora` branch, so a plain fine-tune never trained.

  # Load data
  data_module = make_supervised_data_module(
      tokenizer=tokenizer, data_args=data_args, max_len=training_args.model_max_length
  )

  # Start trainer. Trainer has no `test_dataset` parameter, so the held-out
  # split is passed as `eval_dataset` (either key of the data module is accepted).
  trainer = Trainer(
      model=model,
      tokenizer=tokenizer,
      args=training_args,
      train_dataset=data_module["train_dataset"],
      eval_dataset=data_module.get("eval_dataset", data_module.get("test_dataset")),
  )

  trainer.train()
  trainer.save_state()
  # Persist the final weights: periodic checkpoints alone miss the last steps
  # (and every step when training is shorter than `save_steps`).
  trainer.save_model(training_args.output_dir)