In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# LLaMA 2 
Llama 2 is a collection of second-generation open-source LLMs from Meta that comes with a commercial license. It is designed to handle a wide range of natural language processing tasks, with models ranging in scale from 7 billion to 70 billion parameters

![LLAMA](https://images.datacamp.com/image/upload/v1697724450/Fine_Tune_L_La_MA_2_cc6aa0e4ad.png)

## Fine-tune the Llama 2 model with 7 billion parameters on a T4 GPU

In [None]:
# !pip install -U datasets trl accelerate peft bitsandbytes transformers trl huggingface_hub
%pip install -U datasets

In [None]:
%%capture
%pip install accelerate peft bitsandbytes transformers trl

In [1]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig, PeftModel
from huggingface_hub import login
import pandas as pd

2024-02-18 07:05:07.450466: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-18 07:05:07.450519: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-18 07:05:07.451996: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
from trl import SFTTrainer

In [3]:
# Model from Hugging Face hub
base_model = "NousResearch/Llama-2-7b-chat-hf"

# New instruction dataset
guanaco_dataset = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model
new_model = "llama-2-7b-chat-guanaco"

In [16]:
dataset = load_dataset(guanaco_dataset, split="train") # MultiLingual Dataset
print(dataset['text'][1])

<s>[INST] Самый великий человек из всех живших на планете? [/INST] Для начала нужно выбрать критерии величия человека. Обычно великим называют человека, который внес большой вклад в общество или сильно выделялся на фоне других в своем деле.

Например, Иосифа Бродского считают великим поэтом, а Иммануила Канта — великим философом. Александр Македонский, известный тем, что собрал в свои владения огромную империю (включавшую Македонию, Грецию, Персию, Египет), в историографии носит имя Александр Великий. Для христиан, скорее всего, самым великим человеком жившим на земле был Иисус Христос, так как он совершил множество благих деяний и совершил подвиг ради человечества. 

При этом, когда мы выдвигаем одну личность на роль великого человека, сразу же находится множество людей, не согласных с этим. Того же Иосифа Бродского, хоть он и получил престижную Нобелевскую премию, некоторые люди считают графоманом и посредственным поэтом. 

В целом, кого считать великим — это самостоятельный выбор ка

## 4-bit quantization configuration

4-bit quantization via QLoRA allows efficient finetuning of huge LLM models on consumer hardware while retaining high performance. This dramatically improves accessibility and usability for real-world applications.

QLoRA quantizes a pre-trained language model to 4 bits and freezes the parameters. A small number of trainable Low-Rank Adapter layers are then added to the model.

During fine-tuning, gradients are backpropagated through the frozen 4-bit quantized model into only the Low-Rank Adapter layers. So, the entire pretrained model remains fixed at 4 bits while only the adapters are updated. Also, the 4-bit quantization does not hurt model performance.

Here's a simplified example:

Original Weight: 0.5487 (32-bit representation)

Quantized Weight (4-bit): 0.5 (approximated for simplicity)

![](https://images.datacamp.com/image/upload/v1697713094/image7_3e12912d0d.png)


In [7]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True, # True to Make it QLORA and False for LORA
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
quant_config

BitsAndBytesConfig {
  "bnb_4bit_compute_dtype": "float16",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": false,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}

### Load the LLaMA 2 Model

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



### Loading the Tokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

### PEFT Parameters

Parameter-Efficient Fine-Tuning (PEFT) works by only updating a small subset of the model's parameters, making it much more efficient. 

In [10]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",)

### Training parameters

In [11]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,  # Start with 1 epoch and increase gradually if memory allows
    per_device_train_batch_size=2,  # Begin with smallest batch size, increase in increments of 1
    gradient_accumulation_steps=8,  # Aggressively accumulate gradients to compensate for low batch size
    optim="adamw_torch",  # Efficient optimizer for LLMs
    save_steps=1000,  # Adjust saving frequency based on training duration
    logging_steps=1000,  # Adjust logging frequency based on your preference
    learning_rate=5e-6,  # Start with very low learning rate to mitigate instability
    weight_decay=0.01,  # Regularization to prevent overfitting
    fp16=True,  # Enable mixed precision for memory savings
    bf16=False,  # T4 doesn't support bfloat16
    max_grad_norm=0.5,  # Adjust gradient norm as needed
    max_steps=-1,  # Train for all epochs by default
    warmup_ratio=0.1,  # Adjust warmup ratio based on learning rate and dataset size
    group_by_length=True,  # Improve efficiency for long sequences
    lr_scheduler_type="constant",  # Use warmup followed by constant learning rate
    report_to="tensorboard",  # Track training progress with TensorBoard
    # Additional memory-specific optimizations:
    #max_train_steps=1000,  # Set a maximum number of training steps to limit total memory usage
    #sharded_ddp=True,  # Enable DistributedDataParallel sharding if multiple GPUs are available
    gradient_checkpointing=True,  # Recompute intermediate activations for memory savings
    fp16_full_eval=True,  # Use mixed precision during evaluation as well
    dataloader_pin_memory=False,  # Disable data pinning to avoid potential memory overhead
    local_rank=-1,  # Disable automatic distributed training (if only 1 GPU)
    #skip_memory_check=True,  # Temporarily skip memory checks, but monitor closely
)

### Model fine-tuning

Supervised fine-tuning (SFT) is a key step in reinforcement learning from human feedback (RLHF). The TRL library from HuggingFace provides an easy-to-use API to create SFT models and train them on your dataset with just a few lines of code. It comes with tools to train language models using reinforcement learning, starting with supervised fine-tuning, then reward modeling, and finally proximal policy optimization (PPO).

We will provide SFT Trainer the model, dataset, Lora configuration, tokenizer, and training parameters.

In [12]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [17]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=31, training_loss=2.116789787046371, metrics={'train_runtime': 5954.8733, 'train_samples_per_second': 0.168, 'train_steps_per_second': 0.005, 'total_flos': 2.117766449351885e+16, 'train_loss': 2.116789787046371, 'epoch': 0.99})

In [None]:
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

In [None]:
from tensorboard import notebook
log_dir = "results/runs"
notebook.start("--logdir {} --port 4000".format(log_dir))

In [13]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Who is Leonardo Da Vinci?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] Who is Leonardo Da Vinci? [/INST]  Leonardo da Vinci (1452-1519) was a true Renaissance man, a polymath who excelled in various fields, including art, science, engineering, mathematics, and anatomy. everybody knows him as the most famous artist of the Italian Renaissance, but he was also a prolific inventor, engineer, and scientist. Here are some key facts about Leonardo da Vinci:

1. Early Life: Leonardo was born in Vinci, Italy, on April 15, 1452. His father, Messer Piero Fruosini, was a notary, and his mother, Caterina Buti, was a peasant.
2. Artistic Career: Leonardo began his artistic career as a young man in Florence, where he was apprenticed to the artist Andrea del Ver


In [18]:
prompt = "What is Data Science Career?"
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] What is Data Science Career? [/INST]  Data science is a field that combines mathematical and computational techniques to extract insights and knowledge from data. everybody in today's world is surrounded by data, from social media platforms to wearable devices, and data science professionals are in high demand to analyze and make sense of this data.

A data science career typically involves working with large datasets to identify patterns, trends, and relationships, and using this information to inform business decisions or solve complex problems. Data scientists use a variety of tools and techniques, including machine learning algorithms, statistical modeling, and data visualization, to uncover insights and create value from data.

Some common roles within the field of data science include:

1. Data Scientist: A data scientist is responsible for collecting, analyzing, and interpreting large datasets to extract insights and make recommendations. They may work in a variety


In [None]:
!huggingface-cli login

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)