In [1]:
import os, torch, wandb

import transformers

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)

import datasets
from trl import SFTTrainer, setup_chat_format

import json




In [2]:
# Sanity check: report whether PyTorch can see a CUDA device before loading the model.
torch.cuda.is_available()

True

# Loading the dataset, model and helper utilities

In [3]:
# Location of the JSON dataset. The DATASET_PATH environment variable overrides it so
# the notebook can run on another machine; the fallback is the original local path.
dataset_path = os.environ.get(
    "DATASET_PATH",
    r"C:\Users\vital\PycharmProjects\Pozdnyakov-Vlad-AI\dataset\text_dataset.json",
)

# "utf_8_sig" decodes UTF-8 and drops a leading byte-order mark if the file has one.
with open(dataset_path, "r", encoding="utf_8_sig") as json_file:
    dict_dataset = json.load(json_file)

In [4]:
# Hugging Face Hub id of the checkpoint used for both the tokenizer and the weights.
model_name = "t-bank-ai/T-lite-instruct-0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized model and let device_map="auto" decide where the weights go.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    attn_implementation="eager",
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Render every sample to a single string with the tokenizer's chat template.
# Each sample is presumably a list of role/content messages; confirm against the JSON file.
adaptated_dataset = [
    tokenizer.apply_chat_template(sample, tokenize=False) for sample in dict_dataset
]

# Hold out the first 20 rendered samples for validation; everything after is training data.
val_column_dataset = dict(text=adaptated_dataset[:20])
column_dataset = dict(text=adaptated_dataset[20:])

val_dataset = datasets.Dataset.from_dict(val_column_dataset)
dataset = datasets.Dataset.from_dict(column_dataset)

In [6]:
# Token ids that stop generation: the tokenizer's EOS token plus "<|eot_id|>"
# (presumably the chat template's end-of-turn marker; confirm for this checkpoint).
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

def generate(prompt: str, max_new_tokens: int = 96) -> str:
    """Generate a model reply to a single user message.

    Args:
        prompt: Text of the user message.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 96, the value previously hard-coded).

    Returns:
        The decoded output sequence with special tokens stripped. For a
        causal LM the sequence returned by ``model.generate`` starts with the
        prompt tokens, so the returned text contains the prompt followed by
        the reply.
    """
    messages = [
        {"role": "user", "content": prompt}
    ]

    # add_generation_prompt=True appends the assistant-turn header, so the model
    # continues as the assistant instead of starting from a closed user turn.
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
    

# Setting up training utilities

In [7]:
# LoRA adapter settings: rank 16 with alpha 32 and 5% dropout, no bias training.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # Module names that receive adapters.
    target_modules=[
        "up_proj",
        "down_proj",
        "gate_proj",
        "k_proj",
        "q_proj",
        "v_proj",
        "o_proj",
    ],
)

In [8]:
training_arguments = TrainingArguments(
    # Checkpoints are written here every `save_steps` optimizer steps.
    output_dir="Pozdnyakov-Vlad-AI-Cygan",
    save_steps=50,
    # Batching: 1 sample per device, gradients accumulated over 2 steps.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # Optimisation schedule.
    optim="paged_adamw_32bit",
    learning_rate=1e-5,
    warmup_steps=10,
    num_train_epochs=5,
    fp16=True,
    # Evaluation: a float below 1 is read as a fraction of the total training steps,
    # so 0.1 evaluates roughly every tenth of the run.
    eval_strategy="steps",
    eval_steps=0.1,
)

In [9]:
# Supervised fine-tuning trainer; the LoRA config is handed over via `peft_config`.
# NOTE(review): the recorded run emits a deprecation warning asking for options to be
# set through SFTConfig, presumably max_seq_length / dataset_text_field / packing.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset,
    eval_dataset=val_dataset,
    # Samples are read from the "text" column, capped at a max sequence length of 128.
    dataset_text_field="text",
    max_seq_length=128,
    packing=False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1479 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [10]:
# Run fine-tuning. In the recorded run, validation loss bottoms out at step 2220
# (2.349215) and rises afterwards while the training loss keeps falling.
trainer.train()

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: vitaliy-petreev (vitaliy-petreev-extremum-project-moscow). Use `wandb login --relogin` to force relogin


Step,Training Loss,Validation Loss
370,No log,2.567864
740,2.641700,2.471879
1110,2.284000,2.389934
1480,2.284000,2.367143
1850,2.121200,2.358264
2220,1.975500,2.349215
2590,1.913700,2.388282
2960,1.913700,2.386508
3330,1.831600,2.425065


TrainOutput(global_step=3695, training_loss=2.055538349900097, metrics={'train_runtime': 3975.4199, 'train_samples_per_second': 1.86, 'train_steps_per_second': 0.929, 'total_flos': 1.888175038464e+16, 'train_loss': 2.055538349900097, 'epoch': 4.996619337390128})