In [1]:
# Load Mistral
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import TrainingArguments
# Quantization, modelin ağırlığını küçültmek için kullanılır.
from transformers import BitsAndBytesConfig
# LoRA config
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from huggingface_hub import notebook_login
from datasets import load_dataset
from trl import SFTTrainer
import torch


In [2]:
base_model = "mistralai/Mistral-7B-v0.1"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    padding_side = "right",
    add_eos_token = True
)

# Dizinin başladığını ve bittiğini belirtmek adına, dizinin başına ve sonuna tokenler eklemek için.
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, True)

In [6]:
# NF4 = normalize float 4 | pure FP4 , quantization çeşitleri
# Quantizationda ağıtlıklar 4 bit olarak saklanırken, hesaplamaları 16 veya 32 bit veri tipleri olarak ayarlanabilir
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant= False, #ikinci bir quantization kullanmamızı sağlar, bu da parametre başına 0.4 bitlik ek tasarrufluk sağlar
    bnb_4bit_compute_dtype = torch.bfloat16 #daha esnek, daha fazla fine tuning
)

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_4bit = True,
    quantization_config = bnb_config,
    torch_dtype = torch.bfloat16,
    device_map = "auto"
)

bin c:\Users\SERKAN\AppData\Local\Programs\Python\Python39\lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
dataset_name = "databricks/databricks-dolly-15k"

In [11]:
train_dataset = load_dataset(dataset_name, split="train[0:800]")
eval_dataset = load_dataset(dataset_name, split="train[800:1000]")

In [12]:
train_dataset

Dataset({
    features: ['instruction', 'context', 'response', 'category'],
    num_rows: 800
})

In [13]:
train_dataset.to_pandas()

Unnamed: 0,instruction,context,response,category
0,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,closed_qa
1,Which is a species of fish? Tope or Rope,,Tope,classification
2,Why can camels survive for long without water?,,Camels use the fat in their humps to keep them...,open_qa
3,"Alice's parents have three daughters: Amy, Jes...",,The name of the third daughter is Alice,open_qa
4,When was Tomoaki Komorida born?,Komorida was born in Kumamoto Prefecture on Ju...,"Tomoaki Komorida was born on July 10,1981.",closed_qa
...,...,...,...,...
795,Who is the founder of the Communist Party?,,Lenin,open_qa
796,What is gardening?,Gardening is the practice of growing and culti...,Gardening is laying out and caring for a plot ...,information_extraction
797,What are your thoughts of Michael Jackson as a...,,Michael Jackson is acclaimed as the greatest p...,creative_writing
798,What is the largest pollutant?,,Carbon dioxide (CO2) - a greenhouse gas emitte...,general_qa


In [14]:
# prompt formatter

def generate_promt(sample):
    full_prompt = f"""<s>[INST]{sample['instruction']}
    {f"Here is some context: {sample['context']}" if len(sample['context'])>0 else None}
    [/INST] {sample['response']}</s>
    """

    return {"text": full_prompt}

In [15]:
train_dataset[0]

{'instruction': 'When did Virgin Australia start operating?',
 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.",
 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.',
 'category': 'closed_qa'}

In [16]:
generate_promt(train_dataset[0])

{'text': "<s>[INST]When did Virgin Australia start operating?\n    Here is some context: Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.\n    [/INST] Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.</s>\n    "}

In [17]:
generated_train_dataset = train_dataset.map(generate_promt,remove_columns=list(train_dataset.features))
generated_eval_dataset = eval_dataset.map(generate_promt,remove_columns=list(train_dataset.features))

In [18]:
generated_train_dataset[5]

{'text': "<s>[INST]If I have more pieces at the time of stalemate, have I won?\n    Here is some context: Stalemate is a situation in chess where the player whose turn it is to move is not in check and has no legal move. Stalemate results in a draw. During the endgame, stalemate is a resource that can enable the player with the inferior position to draw the game rather than lose. In more complex positions, stalemate is much rarer, usually taking the form of a swindle that succeeds only if the superior side is inattentive.[citation needed] Stalemate is also a common theme in endgame studies and other chess problems.\n\nThe outcome of a stalemate was standardized as a draw in the 19th century. Before this standardization, its treatment varied widely, including being deemed a win for the stalemating player, a half-win for that player, or a loss for that player; not being permitted; and resulting in the stalemated player missing a turn. Stalemate rules vary in other games of the chess fami

In [22]:
# gradyan kontrol noktası
# bazı büyük modellerin batch size'ları 1 alınsa bile bellek sorunu çıkarabilir.
# yapay sinir ağlarında eğitim sırasında ileri doğru giderken bir kayıp değer hesaplanır ve geriye gidildiğinde bu değer dağıtılır. İleriye doğru geçişte tüm hesaplamalar saklanırki geriye doğru gidildiğinde ağırlıklar dağıtılabilsin.
# bu saklanan değerler bellekte yer tutar. 
# gradyan kontrol noktasında sadece stratejik değerler saklanır diğerleri saklanmaz.

model.gradient_checkpointing_enable()

In [23]:
model = prepare_model_for_kbit_training(model)

In [24]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param=0

    for _,param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()

    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100*trainable_params/all_param}")

In [27]:
# low rank, eğitilecek parametrelerin sayısını ayarlar, rank yükseldikçe eğitilecek parametre sayısı artar
# alfa, öğrenilecek ağırlıklar için ölçekleme parametresidir
# target_modules, lorada uygulanacak lineer katmanlar
# drop al, regülerleşme tekniği, katmanlar arasında 0.05
# görev: diziyi tamamlama, metin üretme
lora_config = LoraConfig(
    r = 8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_propj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head"
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

In [28]:
model = get_peft_model(model, lora_config)

In [29]:
print_trainable_parameters(model)

trainable params: 19163136 || all params: 3771234304 || trainable%: 0.5081396289717246


In [30]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): Linea

In [31]:

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [33]:
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1, #eğitim esnasında gradyanları biriktirir, daha küçük gruplar halinde iteratif olarak hesaplamasını sağlar
    optim="paged_adamw_32bit",
    save_strategy="steps",
    save_steps=25,
    learning_rate=2e-4, #öğrenme oranı
    weight_decay=0.001, #ısınma adımı : Bu parametre, modelin ağırlıklarının (weights) güncellenmesi sırasında kullanılan bir düzenleyici terimdir. Ağırlık düzenlemesi, ağırlıkların büyüklüğünü ve karmaşıklığını kontrol etmek için kullanılır.
    max_steps=20, #maksimum adım sayısı
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    report_to="none",

)

In [35]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    args = training_arguments,
    train_dataset = generated_train_dataset,
    eval_dataset = generated_eval_dataset,
    peft_config=lora_config,
    dataset_text_field="text",
)



In [36]:
# eğitim aşmasında use_catch 
model.config.use_cache = False

In [37]:
trainer.train()

  0%|          | 0/20 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'train_runtime': 4511.5095, 'train_samples_per_second': 0.018, 'train_steps_per_second': 0.004, 'train_loss': 1.5201166152954102, 'epoch': 0.1}


TrainOutput(global_step=20, training_loss=1.5201166152954102, metrics={'train_runtime': 4511.5095, 'train_samples_per_second': 0.018, 'train_steps_per_second': 0.004, 'train_loss': 1.5201166152954102, 'epoch': 0.1})

In [38]:
my_finetuned_model = "mistral-7b-mini-ft"

In [39]:
trainer.model.push_to_hub(my_finetuned_model)



adapter_model.bin:   0%|          | 0.00/601M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/serkanarslan/mistral-7b-mini-ft/commit/9b97cd38085173e87f17be3cdc8da35d5d35f9ae', commit_message='Upload model', commit_description='', oid='9b97cd38085173e87f17be3cdc8da35d5d35f9ae', pr_url=None, pr_revision=None, pr_num=None)