Fine Tuning vs RAG approaches
Fabio Cardoso - March / 2024

This notebook uses parameter-efficient fine-tuning (PEFT), which updates only a small subset of the model's parameters. Specifically, it uses QLoRA, a PEFT technique for quantized large language models.

In [None]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U peft
%pip install -U accelerate
%pip install -U trl 
%pip uninstall datasets --yes
%pip install -U datasets
%pip install trl
%autoreload

In [None]:
# Imports 

import os
import json
import datasets
import pandas as pd
from datasets import load_dataset
import xml.etree.ElementTree as ET

In [None]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model

In [None]:
from transformers import BitsAndBytesConfig

In [None]:
from transformers import AutoModelForCausalLM

In [None]:
from transformers import AutoTokenizer

In [None]:
from transformers import TrainingArguments

In [None]:
import torch

In [None]:
# Paths
# base_model: directory of the base model weights loaded by from_pretrained.
# fine_tuned: directory used both to save the fine-tuned adapter and to load
#             it again in the inference section.

base_model = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"
fine_tuned  = "/kaggle/input/texts-for-llm-fine-tuning-and-rag/checkpoint-360/checkpoint-360/"

In [None]:
# Training data for fine-tuning (input/output pairs)
# Disabled by default; switch the guard to True to build the Q&A dataset.

if False:

    # The Q&A file is parsed as XML. For every child of the root element the
    # tag is used as the question and the element text as the answer.
    tree = ET.parse("/kaggle/input/texts-for-llm-fine-tuning-and-rag/Zephyr Seraphina QnA for FT.txt")
    root = tree.getroot()

    questions = [child.tag for child in root]
    answers = [child.text for child in root]

    # Two-column frame ('input', 'output'), written as JSON records and
    # reloaded as a Hugging Face dataset.
    df_text_4ft = pd.DataFrame({'input': questions, 'output': answers})
    df_text_4ft.to_json('texts_4ft.json', orient='records')
    dataset = datasets.Dataset.from_json("/kaggle/working/texts_4ft.json")

    # Quick sanity check: number of pairs and the first three rows.
    print(len(questions))
    print(df_text_4ft[:3])

In [None]:
# Source text for RAG (split into sections)
if True:

    # A new section starts at every line beginning with "@Section"; the marker
    # line itself is kept as the first line of its section. Any text before
    # the first marker becomes a section of its own.
    section = ""
    sections = []
    # The context manager closes the file even if reading fails part-way.
    with open("/kaggle/input/texts-for-llm-fine-tuning-and-rag/Zephyr Seraphina Sections 4 RAG.txt", mode='r', encoding='utf8') as f:
        for line in f:
            if line.startswith("@Section"):
                if section != "":
                    sections.append(section)
                section = ""
            section += line
    # Flush the last section. Skipped when nothing was read, so an empty file
    # does not produce a bogus empty section.
    if section:
        sections.append(section)

    # Persist the sections as JSON (for the datasets loader) and as CSV (for
    # the LangChain CSVLoader used in the RAG part).
    df_text_4rag = pd.DataFrame(sections, columns=['text'])
    df_text_4rag.to_json('texts_4rag.json', orient='records')
    df_text_4rag.to_csv('texts_4rag.csv', index=False)
    # NOTE(review): this binds `dataset` to the RAG sections, which only have
    # a 'text' column, while formatting_func1 in the fine-tuning cell reads
    # 'input' and 'output'. Confirm which dataset the trainer should receive.
    dataset = datasets.Dataset.from_json("/kaggle/working/texts_4rag.json")
    print(df_text_4rag[:3])

In [None]:
# BnB configuration
# bitsandbytes quantization settings, passed as quantization_config to every
# from_pretrained call in this notebook.

bnb_config = BitsAndBytesConfig(  
    # Load the weights in 4-bit precision.
    load_in_4bit= True,
    # Use the "nf4" 4-bit quantization type.
    bnb_4bit_quant_type= "nf4",
    # Run computations in bfloat16.
    bnb_4bit_compute_dtype= torch.bfloat16,
    # Double quantization is switched off.
    bnb_4bit_use_double_quant= False,
)

### Fine tuning approach

In [None]:
# Imports for Fine-tuning

import trl
from trl import SFTTrainer

In [None]:
# Hugging Face and Weights & Biases credentials
# Both tokens are read from Kaggle secrets named "secret_hf" and "wandb".

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_hf = user_secrets.get_secret("secret_hf")
secret_wandb = user_secrets.get_secret("wandb")

In [None]:
!huggingface-cli login --token $secret_hf

In [None]:
# Monitoring the LLM
# Log in to Weights & Biases with the stored key and open a run for this
# training job; `run` is the handle of that run.

import wandb

wandb.login(key = secret_wandb)
run = wandb.init(
    project='Fine tuning mistral 7B', 
    job_type="training", 
    anonymous="allow"
)

In [None]:
# Load base model with the quantization config

model = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
)

# This model is about to be trained with gradient checkpointing (enabled two
# lines below). The generation KV cache cannot be combined with gradient
# checkpointing, so it is switched off for training. The inference cells
# reload the model and re-enable the cache there.
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Load tokenizer

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Pad on the right and reuse the EOS token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
# Append EOS to every training sequence.
tokenizer.add_eos_token = True
# Last expression of the cell: displays the two flags in the notebook output.
tokenizer.add_bos_token, tokenizer.add_eos_token

In [None]:
# Adding the adapters to the layers

# Prepare the quantized model for k-bit training before the adapters are attached.
model = prepare_model_for_kbit_training(model)

# LoRA adapters are attached to the attention projections (q/k/v/o) and to
# gate_proj; no bias terms are trained.
# NOTE(review): per the PEFT documentation the adapter update is scaled by
# lora_alpha / r, which is 0.10 / 20 here. Confirm such a small scale is
# intended (the note below mentions 8 as a typical value).
peft_config = LoraConfig(
    lora_alpha=0.10,
    lora_dropout=0.2,
    r=20, 
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)

# r: a higher rank gives a richer approximation, but it also increases the
#    number of trainable parameters and the risk of overfitting. Typically r=8.

# lora_alpha: lower values prioritize the original model weights, making the
#    fine-tuning process more conservative. Typically 8.

# Wrap the prepared model with the adapters described by peft_config.
model = get_peft_model(model, peft_config)

In [None]:
# Hyperparameters
# iterations = num_train_epochs / per_device_train_batch_size * training sample size

training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=80,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=10,
    logging_steps=20,
    learning_rate=5e-5, # originally 2e-4
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    neftune_noise_alpha= 0.00,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="wandb"
)

# neftune_noise_alpha: higher values introduce stronger noise during the
# forward pass. This is inspired by adversarial training, where small
# perturbations are added to the input to improve robustness. The noise helps
# to reduce overfitting to the specific details of the fine-tuning dataset.

In [None]:
# Setting sft parameters

if True:
    def formatting_func1(data):
        """Render one training example as a single tagged text string.

        Args:
            data: One dataset row with an "input" (question) and an "output"
                (answer) field.

        Returns:
            The string ``<input>question</input> <output>answer</output>``,
            so the trainer sees the question together with its answer.
        """
        input_data = data["input"]
        label_data = data["output"]
        # Return the full tagged example as a plain string, with the output
        # tag properly closed.
        return '<input>' + input_data + '</input> <output>' + label_data + '</output>'

    # Supervised fine-tuning on `dataset`. Examples are formatted by
    # formatting_func1 and packed together (packing=True).
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
        max_seq_length= None,
        tokenizer=tokenizer,
        args=training_arguments,
        packing= True,
        formatting_func=formatting_func1
    )

    trainer.train()

In [None]:
# Save the fine-tuned model

# Only the LoRA adapter is saved: the inference section reloads it with
# PeftModel.from_pretrained(model, fine_tuned) on top of the base model.
# merge_and_unload() is deliberately not called first. It folds the adapter
# into the base weights and removes the LoRA layers in place, so a save
# afterwards would no longer contain the adapter weights.
# NOTE(review): `fine_tuned` points below /kaggle/input, which is presumably
# read-only inside a Kaggle session. Confirm the target directory is writable.
trainer.model.save_pretrained(fine_tuned, save_config=True, safe_serialization=False)
wandb.finish()

### Inference with the fine-tuned model

In [None]:
# Load the pre-trained base model and the fine-tuned adapter

if True: 
    
    # Tokenizer of the base model (default settings).
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    # Base model, quantized with the shared bnb_config.
    model = AutoModelForCausalLM.from_pretrained(
            base_model, 
            quantization_config=bnb_config
        )  
    
    # Wrap the base model with the adapter stored at `fine_tuned`.
    ft_model = PeftModel.from_pretrained(model, fine_tuned)

In [None]:
# Inference

if True: 
    
    # Sample questions; only p1 is sent to the model below.
    p1="Who was Zephyr Seraphina, the the American singer who sang Thriller?"
    p2="What albums did Zephyr Seraphina sing?"
    p3="What are some commercial activities of Zephyr Seraphina?"
    
    model_input = tokenizer(p1, return_tensors="pt").to("cuda")
    
    # repetition_penalty and temperature are generation options, so they are
    # passed to generate(); given to tokenizer.decode() they would be silently
    # ignored. do_sample=True is required for temperature to have any effect.
    # Note about the temperature parameter:
    #   low temperatures  >>> more deterministic
    #   high temperatures >>> more creative (beware of hallucination)
    generated = ft_model.generate(
        **model_input,
        max_new_tokens=4000,
        pad_token_id=2,
        do_sample=True,
        repetition_penalty=1.5,
        temperature=0.99,
    )
    
    # Decode the first (only) generated sequence, dropping special tokens.
    answer = tokenizer.decode(generated[0], skip_special_tokens=True)
    
    print(answer)

### RAG - Load the model in the first cell of the inference section first

In [None]:
!pip install langchain -q

In [None]:
!pip install langchain_core -q

In [None]:
!pip install sentence-transformers -q

In [None]:
# NOTE(review): both the GPU and the CPU build of faiss are installed here;
# presumably only one of them is needed. Confirm which one should remain.
!pip install faiss-gpu -q
!pip install faiss-cpu -q

In [None]:
!pip install langchain-community

In [None]:
# Imports for RAG

if True: 
    from transformers import AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,pipeline
    from langchain.document_loaders import TextLoader
    from langchain.document_loaders import PyPDFLoader
    from langchain.text_splitter import CharacterTextSplitter
    from langchain.embeddings.huggingface import HuggingFaceEmbeddings
    from langchain.vectorstores import FAISS
    from langchain.prompts import PromptTemplate
    from langchain.schema.runnable import RunnablePassthrough
    from langchain.llms import HuggingFacePipeline
    from langchain.chains import LLMChain
    from langchain_community.document_loaders.csv_loader import CSVLoader

In [None]:
# Load text
# Reads the sections CSV (presumably the texts_4rag.csv written by the RAG
# data-preparation cell) into LangChain documents.

if True:
    loader = CSVLoader(file_path='/kaggle/working/texts_4rag.csv')
    data = loader.load()
    # Show three random rows of the in-memory DataFrame as a sanity check.
    print(df_text_4rag.sample(3))

In [None]:
# Split data and set pages

if True:
    # Chunk the loaded documents (chunk size 1500, overlap 70) and collect
    # the chunks in a fresh list for indexing.
    text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=70)
    chunked_documents = text_splitter.split_documents(data)
    pages = list(chunked_documents)

In [None]:
# Load chunked documents into FAISS for similarity search
# The chunks in `pages` are embedded with the
# sentence-transformers/all-mpnet-base-v2 model.

if True:
    db = FAISS.from_documents(
        pages,
        HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
    )

In [None]:
# Define the retriever

if True:
    retriever = db.as_retriever(
        search_type="similarity",
        search_kwargs={'k': 3}
    )

# k parameter: number of documents retrieved by similarity, from which the answer will be composed

In [None]:
# Load base model with the quantization config

if True:
    model = AutoModelForCausalLM.from_pretrained(
            base_model, 
            quantization_config=bnb_config
        )
    
    # This model is only used for generation, so the KV cache stays enabled.
    # Gradient checkpointing is a training-time memory optimisation and is
    # not enabled here.
    model.config.use_cache = True
    model.config.pretraining_tp = 1

    # Load tokenizer

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    tokenizer.padding_side = 'right'
    tokenizer.pad_token = tokenizer.eos_token
    # Do not append EOS to the prompt: an end-of-sequence token at the end of
    # the input tells the model the text is finished before generation starts.
    tokenizer.add_eos_token = False
    # Last expression of the cell: displays the two flags in the notebook output.
    tokenizer.add_bos_token, tokenizer.add_eos_token

In [None]:
# Define the pipeline
# Text-generation pipeline around the loaded model and tokenizer, wrapped so
# LangChain can use it as an LLM.
# NOTE(review): temperature=999 and max_new_tokens=9999 look like placeholder
# values (the prompt asks for answers of up to 40 words). Presumably the
# temperature only matters when sampling is enabled. Confirm the intended
# generation settings.

if True:
    text_generation_pipeline = pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        temperature=999,
        repetition_penalty=1.1,
        return_full_text=True,
        max_new_tokens=9999
    )

    mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [None]:
# Define the prompt template
# The template has two placeholders: {context} and {question}.

if True:
    prompt_template = """
    Instruction: Answer the question below with up to 40 words based on the following context: {context}.
    Question: {question}."""

    # Create prompt from prompt template 
    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=prompt_template,
    )

In [None]:
# Create llm chain 
# llm_chain fills the prompt and calls the LLM. rag_chain feeds it a dict
# whose "context" comes from the retriever and whose "question" is the value
# the chain is invoked with, passed through unchanged.

if True:
    llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)
    rag_chain = ( {"context": retriever, "question": RunnablePassthrough()} | llm_chain )

In [None]:
# Tensorflow
# Queries the GPUs visible to TensorFlow, then sets CUDA_VISIBLE_DEVICES to "0".
# NOTE(review): the environment variable is set after the device query and
# after earlier cells already loaded models. Presumably it does not affect
# libraries that have already initialised CUDA. Confirm it is still needed.

import tensorflow as tf
tf.config.list_physical_devices('GPU')
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Some questions
# Sample prompts for the RAG chain; the invocation cell below uses p1.

p1="Who was Zephyr Seraphina?"
p2="What albums did Zephyr Seraphina sing?"
p3="What are some commercial activities of Zephyr Seraphina?"

In [None]:
# Note: row-oriented tables can be searched

if True:
    # Run the RAG chain on the first sample question.
    response = rag_chain.invoke(p1)

In [None]:
response.get('text')

In [None]:
print('finished')