<a href="https://colab.research.google.com/github/slv-ai/Fine-tune-LLMs/blob/main/Fine_tuning_an_LLM_using_LORA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers[torch,accelerate]>=0.21.0

In [None]:
!pip install evaluate



In [None]:
!pip install PEFT



In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset,DatasetDict,Dataset
import pandas as pd
import numpy as np
import torch
from transformers import(
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments
    )
from peft import PeftModel,PeftConfig,get_peft_model,LoraConfig
import evaluate

Dataset

In [None]:
# Load the IMDB dataset; its 'train' and 'test' splits are sub-sampled below.
imdb_dataset=load_dataset('imdb')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Draw a small random subset of each split for a quick fine-tuning run.
N=1000   # number of examples kept per split
SEED=42  # fixed seed so Restart & Run All reproduces the same subsets
rng=np.random.default_rng(SEED)

# Sample WITHOUT replacement so no review appears twice, and size the index
# range from each split itself instead of a hard-coded upper bound.
rand_index=rng.choice(len(imdb_dataset['train']),size=N,replace=False)
val_index=rng.choice(len(imdb_dataset['test']),size=N,replace=False)

train_dataset=imdb_dataset['train'].select(rand_index)
val_dataset=imdb_dataset['test'].select(val_index)

In [None]:
# Show the sampled training split (feature names and row count).
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})

In [None]:
# Show the sampled validation split (feature names and row count).
val_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})

In [None]:
# Fraction of the sampled training examples whose label is 1.
train_labels=np.asarray(train_dataset['label'])
train_labels.mean()

0.513

Model

In [None]:
# Checkpoint to fine-tune and the two-way mapping between class ids and names.
model_checkpoint='distilbert-base-uncased'

id2label={0:"Negative",1:"Positive"}
label2id={name:idx for idx,name in id2label.items()}

# Load the checkpoint with a fresh two-class classification head.
model=AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# Preprocess data

In [None]:
# Tokenizer matching the model checkpoint.
# NOTE(review): add_prefix_space is normally a byte-level BPE option — confirm
# it is actually needed for this checkpoint.
tokenizer=AutoTokenizer.from_pretrained(
    model_checkpoint,
    add_prefix_space=True,
)

# Guarantee a padding token exists; when one has to be added, resize the
# model's token embeddings to the new tokenizer length.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(dict(pad_token='[PAD]'))
    model.resize_token_embeddings(len(tokenizer))

In [None]:
#tokenizer function
def tokenize_function(data):
  """Tokenize a batch of examples for use with ``Dataset.map(..., batched=True)``.

  Args:
    data: a batch whose ``'text'`` entry holds the raw strings to tokenize.

  Returns:
    The tokenizer output for the batch, truncated to at most 512 tokens per
    example and left unpadded.
  """
  # Truncate from the left so the end of an over-long text is what is kept.
  # Note: this sets the attribute on the shared module-level tokenizer.
  tokenizer.truncation_side="left"
  # No return_tensors here: examples in a batch have different lengths, so
  # plain Python lists are returned and padding is left to the data collator.
  return tokenizer(
      data['text'],
      truncation=True,
      max_length=512
      )


In [None]:
# Tokenize both splits in batches (training split first, then validation).
train_dataset_tokenized,val_dataset_tokenized=(
    split.map(tokenize_function,batched=True)
    for split in (train_dataset,val_dataset)
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Collator built from the tokenizer; it is handed to the Trainer below to
# assemble batches from the tokenized examples.
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)

Evaluation

In [None]:
# Load the accuracy metric used by compute_metrics.
accuracy=evaluate.load('accuracy')

In [None]:
def compute_metrics(eval_pred):
  """Compute accuracy for the Trainer from one evaluation pass.

  Args:
    eval_pred: pair ``(predictions, labels)``; the predictions carry one score
      per class for every example.

  Returns:
    The dict produced by ``accuracy.compute`` (``{"accuracy": <float>}``). It is
    returned as-is: wrapping it in another ``{"accuracy": ...}`` makes the
    Trainer see a dict where it expects a scalar, so the value cannot be logged.
  """
  logits,labels=eval_pred
  # Predicted class = index of the highest score for each example.
  predicted_classes=np.argmax(logits,axis=1)
  return accuracy.compute(predictions=predicted_classes,references=labels)

Train model

In [None]:
# LoRA configuration for the sequence-classification task; adapters are
# attached only to the modules named 'q_lin'.
peft_config=LoraConfig(task_type="SEQ_CLS",
                       r=4,
                       lora_alpha=32,
                       lora_dropout=0.01,
                       target_modules=['q_lin'])

In [None]:
# Show the full LoRA configuration, including the fields left at their defaults.
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False)

In [None]:
# Wrap the base model with the LoRA adapters described by peft_config.
# `model` is rebound to the wrapped model, so the guard keeps this cell
# idempotent: re-running it will not wrap an already-wrapped model again.
if not isinstance(model,PeftModel):
  model=get_peft_model(model,peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [None]:
#hyperparameters
lr=1e-3  # learning rate passed to TrainingArguments
batch_size=4  # per-device batch size, used for both training and evaluation
num_epochs=10  # number of training epochs

In [None]:
from transformers import TrainingArguments,Trainer

In [None]:
#define training arguments
# Checkpoints are written under "<model_checkpoint>-lora-text-classification".
# Evaluation and checkpoint saving both happen once per epoch, and
# load_best_model_at_end is enabled.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=model_checkpoint+"-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
#create trainer object
# Combines the LoRA-wrapped model, the training arguments, the tokenized
# train/validation splits, the padding collator and the accuracy metric.
trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=val_dataset_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
#train model
# Starts training as configured by training_args.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.48641,{'accuracy': 0.849}
2,0.437700,0.373577,{'accuracy': 0.871}
3,0.437700,0.579283,{'accuracy': 0.87}
4,0.204900,0.77192,{'accuracy': 0.874}
5,0.204900,0.967454,{'accuracy': 0.875}
6,0.066600,1.08104,{'accuracy': 0.876}
7,0.066600,1.069961,{'accuracy': 0.876}
8,0.025700,1.08894,{'accuracy': 0.874}
9,0.025700,1.144624,{'accuracy': 0.873}
10,0.006200,1.144343,{'accuracy': 0.875}


Trainer is attempting to log a value of "{'accuracy': 0.849}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.871}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.87}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.874}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.875}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This in

TrainOutput(global_step=2500, training_loss=0.14822767276763915, metrics={'train_runtime': 533.8971, 'train_samples_per_second': 18.73, 'train_steps_per_second': 4.683, 'total_flos': 1145258422840800.0, 'train_loss': 0.14822767276763915, 'epoch': 10.0})

Predictions

In [None]:
# Sample sentences to classify with the fine-tuned model.
text_list=["it was good.","not a fan,don't recommend"]

In [None]:
# Move the model to the CPU so it matches the CPU input tensors used for the predictions below.
model.to("cpu")

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.01, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

In [None]:
# Classify each sample sentence with the fine-tuned model and print its label.
model.eval()  # switch off dropout so inference is deterministic
print("model predictions:")
with torch.no_grad():  # inference only: skip building the autograd graph
  for text in text_list:
    inputs=tokenizer.encode(text,return_tensors='pt').to("cpu")
    logits=model(inputs).logits
    # Index of the highest-scoring class for the single input sentence.
    predictions=logits.argmax(dim=-1)
    print(text,"-",id2label[predictions.item()])

model predictions:
it was good. - Positive
not a fan,don't recommend - Negative
