#Fine-Tuning LLaMA for Sentiment Analysis

This notebook demonstrates how to fine-tune a LLaMA model for sentiment analysis using the Hugging Face Transformers library and the PEFT method.

In [1]:
# Install required packages
!pip install -qU transformers peft trl accelerate bitsandbytes datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m76.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.1/280.1 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.3/324.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import pandas as pd
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from peft import LoraConfig, AutoPeftModelForCausalLM
from trl import SFTTrainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


 ## Configuration,

Set up the necessary configuration parameters for the fine-tuning proce

In [4]:
TASK = "cls_headline"
VERSION = "v3"
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
DATASET_ID = "Sorour/finred"
# TEST_DATASET_ID = "Sorour/sentiment-test-fpb"
SAVE_MODEL_TO = f"Sorour/phi3_{TASK}_{VERSION}"
OUTPUT_DIR = f"./{TASK}_phi3_{VERSION}"

## Load Datasets
Load the sentiment analysis training and test datasets from Hugging Face.

In [5]:
df = load_dataset(DATASET_ID)
print("Training dataset:")
print(df)

df_test = pd.DataFrame(df['test'])
print("\\nTest dataset:"),
print(df_test.info())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/582 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/239k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6463 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/1616 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1488 [00:00<?, ? examples/s]

Training dataset:
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'instruction', 'type'],
        num_rows: 6463
    })
    valid: Dataset({
        features: ['text', 'label', 'instruction', 'type'],
        num_rows: 1616
    })
    test: Dataset({
        features: ['text', 'label', 'instruction', 'type'],
        num_rows: 1488
    })
})
\nTest dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1488 entries, 0 to 1487
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   text         1488 non-null   object
 1   label        1488 non-null   object
 2   instruction  1488 non-null   object
 3   type         1488 non-null   object
dtypes: object(4)
memory usage: 46.6+ KB
None


## Prepare Model and Tokenizer

Load the pre-trained LLaMA model and tokenizer, and configure them for fine-tuning.

In [6]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
)

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

## Define Instruction Templates and Utility Functions
Create templates for formatting the input data and responses, and define utility functions for inference and evaluation.

In [9]:
INSTRUCTION_PROMPT_TEMPLATE = """\
<|user|> {instruction}

sentence: {Context} <|end|><|assistant|>"""

RESPONSE_TEMPLATE = """\
{answer}<|end|>"""

In [10]:
def create_instruction(sample, return_response=True):
  prompt = INSTRUCTION_PROMPT_TEMPLATE.format(instruction=sample['instruction'],Context=sample['text'])

  if return_response:
    prompt += RESPONSE_TEMPLATE.format(answer=sample["label"])

  return prompt

In [11]:
def get_inference_result(df, model_pipe, output_name, params):
  outputs_list = []
  for idx, row in df.iterrows():
      result = model_pipe(create_instruction(row, return_response=False), do_sample=params['do_sample'],
                          max_new_tokens=params['max_new_tokens'], temperature=params['temperature'], top_k=params['top_k'])
      outputs = result[0]["generated_text"]
      outputs_list.append(outputs)
  df[output_name] = outputs_list
  return df

def evaluate(y_pred, y_true):
  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred, average='weighted')
  recall = recall_score(y_true, y_pred, average='weighted')
  f1 = f1_score(y_true, y_pred, average='weighted')

  print(f"Accuracy: {accuracy:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall: {recall:.4f}")
  print(f"F1 Score: {f1:.4f}")

  return accuracy, precision, recall, f1

## Evaluate Base Model
Set up an inference pipeline using the base model and evaluate its performance on the test set.

In [12]:
base_model_pipe = pipeline("text-generation", model, tokenizer=tokenizer, max_new_tokens=256, return_full_text=False)
params = {
    "do_sample": True,
    "max_new_tokens": 500,
    "temperature": 0.1,
    "top_k": 50,
}

df_test = get_inference_result(df_test, base_model_pipe, output_name="base_model_predict", params=params)

# Clean up predictions
df_test['base_model_predict'] = df_test['base_model_predict'].str.strip().str.lower()

print("Base Model Evaluation:"),
evaluate(df_test['base_model_predict'], df_test['label']),
# Display a few examples
print("\\nSample Predictions:"),
print(df_test[['text', 'label', 'base_model_predict']].head())

You are not running the flash-attention implementation, expect numerical differences.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Base Model Evaluation:
Accuracy: 0.1095
Precision: 0.2545
Recall: 0.1095
F1 Score: 0.0764
\nSample Predictions:
                                                text  \
0  Wednesday, July 8, 2015 10:30AM IST (5:00AM GM...   
1  The Daily Show with Trevor Noah premieres toni...   
2  "Our results for the quarter show very balance...   
3  Saudi Arabian budget carrier flynas, which mad...   
4  First Eagle is currently owned by members of t...   

                       label base_model_predict  
0                 subsidiary         subsidiary  
1                   owned by         subsidiary  
2                   employer                ceo  
3  product/material produced       manufacturer  
4                   industry           owned by  


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Fine-tuning Configuration,
Set up the PEFT configuration and training arguments for fine-tuning.

In [13]:
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    bias="none",
    lora_dropout=0.05,
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
    task_type="CAUSAL_LM",
)

In [14]:
from trl import SFTConfig
# TrainingArguments
args = SFTConfig(
    output_dir=OUTPUT_DIR,
    eval_strategy="steps",
    eval_steps=20,
    # max_steps=60,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    save_strategy="epoch",
    learning_rate=2e-4,
    logging_steps=10,
    fp16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    push_to_hub=True,
)

 ## Initialize and Run Trainer,
Set up the SFTTrainer and start the fine-tuning process.

In [15]:
# SFTTrainer
max_seq_length=1024
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=df['train'],
    eval_dataset=df['valid'],
    # formatting_func= lambda sample: create_instruction(sample, return_response=True),
    formatting_func=create_instruction,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)
trainer.train()



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
20,0.9305,0.858648
40,0.8342,0.796556
60,0.7516,0.75284
80,0.7614,0.722771
100,0.7154,0.697254
120,0.7157,0.67695
140,0.6882,0.664203
160,0.6453,0.647545
180,0.6539,0.637033
200,0.6236,0.625612


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=382, training_loss=0.6699435483098655, metrics={'train_runtime': 1824.9817, 'train_samples_per_second': 1.676, 'train_steps_per_second': 0.209, 'total_flos': 7.020709766234112e+16, 'train_loss': 0.6699435483098655, 'epoch': 1.9973856209150327})

## Save Fine-tuned Model
Save the fine-tuned model for later use.

In [16]:
fine_tuned_model = AutoPeftModelForCausalLM.from_pretrained(
  OUTPUT_DIR,
  torch_dtype=torch.float16,
)
merged_model = fine_tuned_model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
device = 0 if torch.cuda.is_available() else -1
ft_pipe = pipeline("text-generation", merged_model, tokenizer=tokenizer, max_new_tokens=256, return_full_text=False, device=device)

## Evaluate Fine-tuned Model
Evaluate the performance of the fine-tuned model on the test set.

In [30]:
df_test = get_inference_result(df_test, ft_pipe, output_name="fine_tuned_predict", params=params)
df_test['fine_tuned_predict'] = df_test['fine_tuned_predict'].str.strip().str.lower()
print("Fine-Tuned Model Evaluation:")
evaluate(df_test['fine_tuned_predict'], df_test['label'])
# Display a few examples
print("\\nSample Predictions:"),
print(df_test[['text', 'label', 'fine_tuned_predict']].head())

Fine-Tuned Model Evaluation:
Accuracy: 0.5914
Precision: 0.6117
Recall: 0.5914
F1 Score: 0.5464
\nSample Predictions:
                                                text  \
0  Wednesday, July 8, 2015 10:30AM IST (5:00AM GM...   
1  The Daily Show with Trevor Noah premieres toni...   
2  "Our results for the quarter show very balance...   
3  Saudi Arabian budget carrier flynas, which mad...   
4  First Eagle is currently owned by members of t...   

                       label         fine_tuned_predict  
0                 subsidiary        parent organization  
1                   owned by        parent organization  
2                   employer                   employer  
3  product/material produced  product/material produced  
4                   industry                   industry  


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# for name, param in merged_model.named_parameters():
#     print(f"Layer: {name} | Size: {param.size()} | Number of parameters: {param.numel()}")