<a href='https://colab.research.google.com/github/prane-eth/AI_projects/blob/main/projects/LLM_fine-tuning.ipynb' target='_parent'><img src='https://colab.research.google.com/assets/colab-badge.svg' alt='Open In Colab'/></a>

### Project: Fine-tuning a language model

Demo:
![LLMs - Finetuning, RLAIF, and RLHF](../Demo/LLM_Fine-tuning.png)

In [1]:
# Install the notebook's dependencies only when `unsloth` cannot be imported;
# a successful import is used as the marker that the environment is already set up.
try:
    __import__('unsloth')
except ImportError:
	# %%capture
	# Data handling, Groq API client, .env support and Hugging Face datasets.
	%pip install pandas groq python-dotenv datasets
	# unsloth is installed straight from its GitHub repository.
	%pip install 'unsloth @ git+https://github.com/unslothai/unsloth.git'
	# --no-deps: install these packages without letting pip pull in their own dependencies.
	%pip install --no-deps 'xformers<0.0.26' trl tyro peft accelerate bitsandbytes
	# torch is pinned to an exact version.
	%pip install torch==2.2.2

In [2]:
import os
import re
import sys
from datasets import Dataset
from groq import Groq
from io import StringIO
import pandas as pd
import torch
from transformers import TrainingArguments, set_seed
from trl import SFTTrainer
from unsloth import FastLanguageModel
from common_functions import display_md

# Set to False to skip launching the Gradio demo further down the notebook.
HOSTING_ENABLED = True

# Single seed shared by transformers' set_seed and the LoRA / trainer configuration.
random_state = 42
set_seed(random_state)

# All artefacts for one topic live in datasets/<topic>_bot.
topic = 'customer_support'
datasets_folder = os.path.join('datasets', topic + '_bot')
# exist_ok avoids the check-then-create race and also creates the parent 'datasets' folder.
os.makedirs(datasets_folder, exist_ok=True)

finetune_data_filepath = os.path.join(datasets_folder, 'finetune_data.csv')
model_checkpoint_path = os.path.join(datasets_folder, 'saved_model')
all_query_responses_filepath = os.path.join(datasets_folder, 'all_query_responses.csv')
rlaif_data_filepath = os.path.join(datasets_folder, 'rlaif_data.csv')
rlhf_data_filepath = os.path.join(datasets_folder, 'rlhf_data.csv')
chat_history_filepath = os.path.join(datasets_folder, 'chat_history.csv')

# The Groq key comes from the environment first, then from Colab's secret store when running there.
groq_api_key = os.getenv('GROQ_API_KEY')

if not groq_api_key and 'google.colab' in sys.modules:
	from google.colab import userdata
	groq_api_key = userdata.get('GROQ_API_KEY')

if not groq_api_key:
	raise ValueError('GROQ_API_KEY is not set in the environment variables')

# Resume the persisted chat log if there is one; otherwise start an empty log with the same columns.
if os.path.exists(chat_history_filepath):
	chat_history = pd.read_csv(chat_history_filepath)
else:
	chat_history = pd.DataFrame(columns=['query', 'response', 'timestamp'])

### Generate synthetic data for fine-tuning
**Data generation using an LLM**: A large model such as Llama-3 (70B) generates the synthetic training data, which is then used to fine-tune a small model such as Phi-3 (3.8B).

In [3]:
# Groq API client used by ask_larger_llm below; created with the API key resolved in the setup cell.
client = Groq(api_key=groq_api_key)

def ask_larger_llm(prompt, model='llama3-70b-8192', return_quoted=True):
	'''Send a single-turn prompt to the larger LLM and return its answer.

	Args:
		prompt: Text sent as the only user message.
		model: Model identifier passed to the chat-completions endpoint.
		return_quoted: When True (default), return only the text inside the first
			triple-backtick fence of the reply; when False, return the raw reply.

	Returns:
		The raw reply, or the stripped contents of the first fenced block.

	Raises:
		SystemExit: If the API returns an empty reply, or (with return_quoted=True)
			the reply contains no fenced text or only an empty fenced block.
	'''
	chat_completion = client.chat.completions.create(
		messages=[{ 'role': 'user', 'content': prompt }],
		model=model,
	)
	response = chat_completion.choices[0].message.content
	if not response:
		raise SystemExit('No response from the API.')

	if not return_quoted:
		return response

	# The model sometimes forgets the closing fence; close it so the regex below can match.
	fenced_response = response if response.endswith('```') else response + '```'

	# Text between the first pair of triple backticks.
	match = re.search(r'```(.*?)```', fenced_response, re.DOTALL)
	if not match:
		print(response)
		raise SystemExit('No data found in the response.')

	quoted_text = match.group(1)
	# Drop a markdown language tag glued to the opening fence (e.g. "```csv"),
	# otherwise it ends up as the first line of the extracted data.
	quoted_text = re.sub(r'\A(?:csv|text|txt|plaintext)[ \t]*\r?\n', '', quoted_text, flags=re.IGNORECASE)
	quoted_text = quoted_text.strip()
	if not quoted_text:
		# An empty fence carries no data; report it like a missing fence instead of
		# failing with an IndexError on quoted_text[0].
		print(response)
		raise SystemExit('No data found in the response.')

	# Sometimes quotes or other special characters wrap the whole text; remove them.
	# Skip the removal when the first line ends with the same character (it is then
	# part of the data, e.g. a CSV whose fields are all quoted). rstrip() keeps this
	# check working for CRLF line endings.
	first_line, newline, _ = quoted_text.partition('\n')
	first_line_end_character = first_line.rstrip()[-1] if newline else None
	wrapper = quoted_text[0]
	is_wrapped = (
		len(quoted_text) >= 2
		and wrapper == quoted_text[-1]
		# letters/digits are data, not wrapping characters
		and not wrapper.isalnum()
		and wrapper != first_line_end_character
	)
	if is_wrapped:
		quoted_text = quoted_text[1:-1]

	return quoted_text


# Reuse the cached synthetic dataset when it exists; otherwise ask the larger LLM to generate it.
if os.path.exists(finetune_data_filepath):
	# explicit UTF-8 so the text matches what pd.read_csv decodes by default
	with open(finetune_data_filepath, 'r', encoding='utf-8') as file:
		csv_text = file.read()
else:
	num_lines = 100
	llm_training_data_prompt = f'Generate high-quality data for fine-tuning in csv for {topic} chatbot' \
			f' for an ecommerce platform in at least {num_lines} lines of data. ' \
			'Include the csv file text in triple quotes ```. ' \
			'response should include no other text. fields: instruction, output.'
	csv_text = ask_larger_llm(llm_training_data_prompt)
	# Parse once before caching: a malformed reply raises here, so it is never written to disk
	# and cannot poison every later run of the notebook.
	pd.read_csv(StringIO(csv_text))
	with open(finetune_data_filepath, 'w', encoding='utf-8') as file:
		file.write(csv_text)


training_data = pd.read_csv(finetune_data_filepath)
print(f'Data size: {len(training_data)}')

training_data.head()

Data size: 54


Unnamed: 0,instruction,output
0,"Hi, I want to track my order.","To track your order, please visit our website ..."
1,I forgot my password. Can you help me?,"No problem! To reset your password, click on t..."
2,I want to return an item. What's the process?,Sorry to hear that you need to return an item....
3,What is the shipping time for my order?,Our standard shipping time is 3-5 business day...
4,Can I cancel my order?,Please contact us immediately if you need to c...


### Prepare the model for fine-tuning

In [4]:
max_seq_length = 2048
model = None
tokenizer = None
restored_finetuned_model = False
device = 'cuda'  # 'cuda' or 'cpu'

# First try to restore a previously fine-tuned checkpoint.
if os.path.exists(model_checkpoint_path):
	try:
		model, tokenizer = FastLanguageModel.from_pretrained(
			model_checkpoint_path, trust_remote_code=True,
			dtype=None, load_in_4bit = True, device_map=device,
		)
		restored_finetuned_model = True
		print('Model loaded successfully.')
	except Exception as e:
		print('Error loading the model. Will train a new model.')
		print(e)

# Fall back to the base model both when no checkpoint exists AND when restoring it failed.
# (A plain `else:` here would leave model/tokenizer as None after a failed restore.)
if not restored_finetuned_model:
	model, tokenizer = FastLanguageModel.from_pretrained(
		model_name = 'unsloth/Phi-3-mini-4k-instruct',
		max_seq_length = max_seq_length,
		dtype = None,  # None for auto-detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
		load_in_4bit = True,  # 4-bit quantization to reduce memory usage
	)

	# Wrap the base model with LoRA adapters so that only the adapter weights are trained.
	model = FastLanguageModel.get_peft_model(
		model,
		r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
		target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj',
						'gate_proj', 'up_proj', 'down_proj',],
		lora_alpha = 16,
		lora_dropout = 0,  # Supports any, but = 0 is optimized
		bias = 'none',  # Supports any, but = 'none' is optimized
		# 'unsloth' uses 30% less VRAM, fits 2x larger batch sizes!
		use_gradient_checkpointing = 'unsloth', # True or 'unsloth' for very long context
		random_state = random_state,
		use_rslora = False,
		loftq_config = None,
	)

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 3050 Laptop GPU. Max memory: 3.804 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Unsloth: datasets/customer_support_bot/saved_model has no tokenizer.model file.
Just informing you about this - this is not a critical error.


Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Model loaded successfully.


### Prepare the dataset for fine-tuning

In [5]:
# Template for one supervised sample: first slot is the instruction, second the expected response.
finetune_prompt = '''You are a customer support chatbot.
Below is an instruction that describes a task that provides further context.
Write a response that appropriately completes the request.
Learn from the sample instruction and sample response provided.

### Sample Instruction:
{}

### Response:
{}'''

def create_dataset(training_data):
	'''Build the fine-tuning dataset from the instruction/output pairs.

	Args:
		training_data: Table with 'instruction' and 'output' columns.

	Returns:
		A Dataset with a single 'text' column, one formatted prompt per row.
	'''
	pairs = zip(training_data['instruction'], training_data['output'])
	# without EOS_TOKEN, generation will go on forever
	texts = [
		finetune_prompt.format(instruction, output) + tokenizer.eos_token
		for instruction, output in pairs
	]
	return Dataset.from_dict({ 'text': texts })

### Train the model

In [6]:
# Holds the most recent SFTTrainer so later cells can inspect it.
trainer = None

def train_model(train_dataset, force_train=False):
	'''Fine-tune the global model on train_dataset and save it to model_checkpoint_path.

	Training runs when force_train is True or when no fine-tuned checkpoint was
	restored earlier; otherwise the call does nothing.

	Args:
		train_dataset: Dataset exposing a 'text' column.
		force_train: Train even if a fine-tuned model was restored.
	'''
	global trainer, model, tokenizer, restored_finetuned_model

	if not (force_train or restored_finetuned_model):  # if restoration failed
		print('Model restoration failed. Training a new model.')
		force_train = True

	if not force_train:
		return

	# Use bf16 when the GPU supports it, fp16 otherwise.
	bf16_available = torch.cuda.is_bf16_supported()
	training_args = TrainingArguments(
		per_device_train_batch_size = 2,
		gradient_accumulation_steps = 4,
		warmup_steps = 5,
		max_steps = 60,
		learning_rate = 2e-4,
		fp16 = not bf16_available,
		bf16 = bf16_available,
		logging_steps = 1,
		optim = 'adamw_8bit',
		weight_decay = 0.01,
		lr_scheduler_type = 'linear',
		seed = random_state,
		output_dir = model_checkpoint_path,
	)

	trainer = SFTTrainer(
		model = model,
		tokenizer = tokenizer,
		train_dataset = train_dataset,
		dataset_text_field = 'text',
		max_seq_length = max_seq_length,
		dataset_num_proc = 2,
		packing = False, # Can make training 5x faster for short sequences.
		args = training_args,
	)
	trainer.train()

	# Persist the model and the tokenizer next to each other so the checkpoint can be reloaded.
	model.save_pretrained(model_checkpoint_path)
	tokenizer.save_pretrained(model_checkpoint_path)

train_model(create_dataset(training_data), force_train=False)

### Test the model

In [7]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Inference template: the text after '### Response:' is what the model generates.
query_prompt = '''You are a customer support chatbot. Respond to this query.

### Query:
{}

### Response:'''  # leave this blank for generation!


def ask_query(query, display=False):
	'''Ask the fine-tuned model a question and log the exchange to the chat history.

	Args:
		query: The customer question.
		display: When True, render the answer with display_md and return None;
			when False, return the answer text.

	Returns:
		The cleaned answer, or None when display is True.
	'''
	prompt_text = query_prompt.format(query)
	inputs = tokenizer([prompt_text], return_tensors = 'pt').to(device)

	generated = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
	output = ''.join(tokenizer.batch_decode(generated))

	# Keep only what follows each known answer marker, applied in this order.
	for marker in ('Response:', '<|assistant|>', '[Response]:'):
		marker_position = output.find(marker)
		if marker_position != -1:
			output = output[marker_position + len(marker):]

	# Drop a trailing end-of-text token.
	end_token = '<|endoftext|>'
	if output.endswith(end_token):
		output = output[:-len(end_token)]

	output = output.strip()

	# Append the exchange and persist the whole history after every query.
	chat_history.loc[len(chat_history)] = [query, output, pd.Timestamp.now()]
	chat_history.to_csv(chat_history_filepath, index=False)

	if not display:
		return output
	display_md(output)

# Optionally serve the chatbot through a small Gradio web UI.
if HOSTING_ENABLED:
	import gradio as gr

	def get_bot_response(query):
		'''Gradio callback: return the model's answer for the submitted query.'''
		return ask_query(query)

	interface = gr.Interface(
		fn=get_bot_response,
		inputs=gr.Textbox(
			lines=2, placeholder="Enter query here",
			label="Query"
		),
		outputs=gr.Textbox(label="Response", lines=4),
		# Gradio expects 'never' / 'auto' / 'manual' here; the boolean form is deprecated.
		allow_flagging='never',
	)
	interface.launch()



Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


In [8]:
# Sample query; display=True renders the answer with display_md instead of returning it.
ask_query('What are the payment options?', display=True)

IMPORTANT: You are using gradio version 4.26.0, however version 4.29.0 is available, please upgrade.
--------


We accept payments via credit/debit cards, PayPal, and bank transfers. Please contact us if you have any questions about payment options.

In [9]:
# Same question in different words, to compare the answer with the previous cell's.
ask_query('List down the available methods of payment.', display=True)  # rephrasing same question

We accept the following methods of payment:
- Credit/Debit Cards
- PayPal
- Bank Transfer
- Apple Pay
- Google Pay

Please note that we may not accept certain payment methods in the future.

In [10]:
# Sample query on a different subject (returns); the answer is rendered with display_md.
ask_query('May I know the return policy?', display=True)

Our return policy allows for returns within [time frame]. Please see our full return policy for details.

### RLAIF: Reinforcement Learning from AI (LLM) Feedback

Generate responses for the questions

In [11]:
def already_processed(df, column_name):
	'''Tell whether column_name exists in df and holds no null values.'''
	if column_name not in df.columns:
		return False
	return df[column_name].notnull().all()

forced_queries = False  # temp

# Reuse cached model responses unless regeneration is forced.
use_cached_responses = not forced_queries and os.path.exists(all_query_responses_filepath)

if use_cached_responses:
	all_query_responses = pd.read_csv(all_query_responses_filepath)
else:
	# The training outputs become the expected answers; the model's own answers go
	# into current_output so a larger LLM can compare the two later.
	all_query_responses = training_data.rename(columns={ 'output': 'expected_output' })
	all_query_responses['current_output'] = None

	# generate response with ask_query function
	# no parallel processing due to CPU heat concerns
	total_rows = len(all_query_responses)
	for row_num, instruction in all_query_responses['instruction'].items():
		if row_num % 10 == 0:
			print(f'Processing row {row_num+1}/{total_rows}')
		if not all_query_responses.at[row_num, 'current_output']:
			all_query_responses.at[row_num, 'current_output'] = ask_query(instruction)

	all_query_responses.to_csv(all_query_responses_filepath, index=False)

print('Data size:', len(all_query_responses))
all_query_responses.head()

Data size: 54


Unnamed: 0,instruction,expected_output,current_output
0,"Hi, I want to track my order.","To track your order, please visit our website ...","To track your order, please log into your acco..."
1,I forgot my password. Can you help me?,"No problem! To reset your password, click on t...","I'm sorry, but I'm unable to assist with reset..."
2,I want to return an item. What's the process?,Sorry to hear that you need to return an item....,"** To return an item, you typically need to fo..."
3,What is the shipping time for my order?,Our standard shipping time is 3-5 business day...,The shipping time for your order depends on th...
4,Can I cancel my order?,Please contact us immediately if you need to c...,"I'm sorry, but once an order is shipped, we're..."


Get feedback from a larger LLM

In [12]:
forced_rlaif = False  # temp

# Start from the cached RLAIF data when available, otherwise from the model's current responses.
if os.path.exists(rlaif_data_filepath):
	rlaif_data = pd.read_csv(rlaif_data_filepath)
	if forced_rlaif:
		rlaif_data['improved_output'] = None  # set to None
else:
	rlaif_data = all_query_responses.copy()
	rlaif_data['improved_output'] = None  # add column

# if column is not loaded from file, or empty
if not already_processed(rlaif_data, 'improved_output'):
	rlaif_llm_prompt = '''
		I am fine-tuning a customer-support chatbot. 
		I provided the instruction, current_output, expected_output (provided by you in the past). 
		Include csv text in the response in triple quotes ```.
		return only these headers: instruction, improved_output. 
	'''

	# pass 15 rows at a time to the AI to improve the response
	chunk_size = 15
	for row_num in range(0, len(rlaif_data), chunk_size):
		print(f'Processing rows {row_num} to {row_num+chunk_size} of {len(rlaif_data)} rows')
		chunk = rlaif_data.iloc[row_num:row_num+chunk_size]
		csv_text = chunk.to_csv(index=False)
		response_csv = ask_larger_llm(f'{rlaif_llm_prompt}\n```{csv_text}```')
		try:
			response_data = pd.read_csv(StringIO(response_csv))
		except Exception as error:
			print('Failed to parse csv data from the response.')
			print(error)
			print(response_csv)
			break

		# The LLM may pad header names with spaces ("instruction, improved_output").
		response_data.columns = [str(column).strip() for column in response_data.columns]
		if not {'instruction', 'improved_output'}.issubset(response_data.columns):
			print('Response is missing the instruction/improved_output columns. Skipping these rows.')
			continue

		# for each row's instruction value in response_data, update the corresponding row in rlaif_data
		for _, row in response_data.iterrows():
			instruction = row['instruction']
			improved_output = row['improved_output']
			# Empty cells are parsed as NaN (a float); only real text can be an improvement.
			if not isinstance(improved_output, str):
				continue
			if 'no improvement' in improved_output.lower():
				continue

			matching_rows = rlaif_data['instruction'] == instruction
			# The LLM may reword or invent an instruction; ignore rows that match nothing.
			if not matching_rows.any():
				continue

			current_output = rlaif_data.loc[matching_rows, 'current_output'].values[0]
			if current_output != improved_output:
				rlaif_data.loc[matching_rows, 'improved_output'] = improved_output

	# Keep only the rows that received an improved response.
	rlaif_data = rlaif_data.dropna(subset=['improved_output'])
	rlaif_data.to_csv(rlaif_data_filepath, index=False)

print('Data size:', len(rlaif_data))
rlaif_data.head()

Data size: 53


Unnamed: 0,instruction,expected_output,current_output,improved_output
0,"Hi, I want to track my order.","To track your order, please visit our website ...","To track your order, please log into your acco...","To track your order, please visit our website ..."
1,I forgot my password. Can you help me?,"No problem! To reset your password, click on t...","I'm sorry, but I'm unable to assist with reset...","No problem! To reset your password, click on t..."
2,I want to return an item. What's the process?,Sorry to hear that you need to return an item....,"** To return an item, you typically need to fo...",Sorry to hear that you need to return an item....
3,What is the shipping time for my order?,Our standard shipping time is 3-5 business day...,The shipping time for your order depends on th...,Our standard shipping time is 3-5 business day...
4,Can I cancel my order?,Please contact us immediately if you need to c...,"I'm sorry, but once an order is shipped, we're...",Please contact us immediately if you need to c...


#### Fine-tuning using the RLAIF dataset

In [13]:
# Template slots, in order: instruction, the model's current response, the improved response.
rlaif_prompt = '''You are a customer support chatbot.
I listed improvements from human feedback.
Learn from the sample instruction, current response, and improved response provided.

### Sample Instruction:
{}

### Current Response:
{}

### Improved response:
{}'''

def create_rlaif_dataset(training_data):
	'''Build the RLAIF fine-tuning dataset.

	Args:
		training_data: Table with 'instruction', 'current_output' and
			'improved_output' columns.

	Returns:
		A Dataset with a single 'text' column, one formatted prompt per row.
	'''
	column_names = ('instruction', 'current_output', 'improved_output')
	rows = zip(*(training_data[name] for name in column_names))
	texts = [rlaif_prompt.format(*row) + tokenizer.eos_token for row in rows]
	return Dataset.from_dict({ 'text': texts })

train_model(create_rlaif_dataset(rlaif_data), force_train=False)

### RLHF: Reinforcement Learning from Human Feedback

Emulating human feedback with a larger LLM

In [14]:
forced_rlhf = False

# Start from the cached RLHF data when available, otherwise from the model's current responses.
if os.path.exists(rlhf_data_filepath):
	rlhf_data = pd.read_csv(rlhf_data_filepath)
	if forced_rlhf:
		rlhf_data['like_dislike_status'] = None  # set to None
else:
	rlhf_data = all_query_responses.copy()
	rlhf_data['like_dislike_status'] = None  # add column

if not already_processed(rlhf_data, 'like_dislike_status'):
	rlhf_llm_prompt = '''
		I am emulating RLHF (human feedback) for a customer-support chatbot.
		I provided the "instruction" column.
		Select random instructions and provide a "like_dislike_status" value (True or False).
		Same instruction can be repeated with different output and like_dislike_status values. Some instructions can be skipped.
		return only these headers: instruction, output, like_dislike_status.
		Include csv text in the response in triple quotes ```.
	'''
	# Only the instructions are sent; the LLM makes up outputs and like/dislike votes.
	csv_text = rlhf_data[['instruction']].to_csv(index=False)
	response_csv = ask_larger_llm(f'{rlhf_llm_prompt}\n```{csv_text}```')
	try:
		rlhf_data = pd.read_csv(StringIO(response_csv))
	except Exception as error:
		print('Failed to parse csv data from the response.')
		print(error)
		print(response_csv)
	else:
		# Cache only a successfully parsed reply; after a failure the placeholder frame
		# is not written, so the next run asks the LLM again.
		rlhf_data.to_csv(rlhf_data_filepath, index=False)

rlhf_data.head()

Unnamed: 0,instruction,output,like_dislike_status
0,"Hi, I want to track my order.",You can track your order by logging into your ...,True
1,I want to return an item. What's the process?,"To return an item, please visit our website an...",False
2,Can I cancel my order?,"Yes, you can cancel your order within 24 hours...",True
3,I want to change my shipping address. Can you ...,I'd be happy to help you update your shipping ...,True
4,I want to know more about your products.,We offer a wide range of products in various c...,False


#### Fine-tuning using the RLHF dataset

In [15]:
# Template slots, in order: instruction, response shown to the user, like/dislike verdict.
rlhf_prompt = '''You are a customer support chatbot.
I listed improvements from human feedback.
Learn from the sample instruction, current response, and Like/Dislike status provided.

### Sample Instruction:
{}

### Current Response:
{}

### User Like/Dislike Status:
{}'''

def create_rlhf_dataset(training_data):
	'''Build the RLHF fine-tuning dataset.

	Args:
		training_data: Table with 'instruction', 'output' and
			'like_dislike_status' columns.

	Returns:
		A Dataset with a single 'text' column, one formatted prompt per row.
	'''
	column_names = ('instruction', 'output', 'like_dislike_status')
	rows = zip(*(training_data[name] for name in column_names))
	texts = [rlhf_prompt.format(*row) + tokenizer.eos_token for row in rows]
	return Dataset.from_dict({ 'text': texts })

train_model(create_rlhf_dataset(rlhf_data), force_train=False)