<a href="https://colab.research.google.com/github/prane-eth/AI_projects/blob/main/projects/LLM_fine-tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Fine-tuning a language model

In [1]:
%%capture
# Install the notebook's dependencies; %%capture silences pip's output for this cell.
%pip install pandas groq python-dotenv datasets
# Unsloth is installed straight from its GitHub repository.
%pip install 'unsloth @ git+https://github.com/unslothai/unsloth.git'
# --no-deps: install these packages without pulling in their own dependencies.
%pip install --no-deps 'xformers<0.0.26' trl peft accelerate bitsandbytes

In [None]:
import os
import re
import sys
from datasets import Dataset
from groq import Groq
import pandas as pd
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, AutoModel, AutoTokenizer, set_seed
from unsloth import FastLanguageModel

# Fix the random seed (via transformers.set_seed) so repeated runs are comparable.
set_seed(42)

# Folder that holds both the generated CSV and the saved fine-tuned model.
datasets_folder = 'datasets'
os.makedirs(datasets_folder, exist_ok=True)

topic = 'customer_support'
data_filename = os.path.join(datasets_folder, f'{topic}_bot_finetune_data.csv')
model_save_path = os.path.join(datasets_folder, f'{topic}_saved_model')

# Prefer the environment variable; fall back to Colab's secret store only when
# the notebook is actually running inside Colab.
groq_api_key = os.getenv('GROQ_API_KEY')

if not groq_api_key and 'google.colab' in sys.modules:
	# Imported lazily: this package only exists on Colab runtimes.
	# (Both lines of this branch must use the same indentation character;
	# mixing spaces and tabs here raises TabError.)
	from google.colab import userdata
	groq_api_key = userdata.get('GROQ_API_KEY')

if not groq_api_key:
	raise ValueError('GROQ_API_KEY is not set in the environment variables')

### Generate synthetic data for fine-tuning
**Data generation using an LLM**: A large model such as Llama 3 (70B) generates synthetic instruction/response pairs, which are then used to fine-tune a small model such as Phi-3 Mini (3.8B).

In [None]:
# Reuse the cached CSV when it exists; otherwise ask the LLM to generate it
# once and cache it on disk so later runs do not hit the API again.
if os.path.exists(data_filename):
	# UTF-8 matches the default encoding pd.read_csv uses below.
	with open(data_filename, 'r', encoding='utf-8') as file:
		csv_text = file.read()
else:
	client = Groq(api_key=groq_api_key)

	num_lines = 100
	# Named distinctly from the training template `prompt` defined later in the notebook.
	generation_prompt = f'Generate high-quality data for fine-tuning in csv for {topic} chatbot' \
			f' for an ecommerce platform in at least {num_lines} lines of data. ' \
			'Include the csv file text in triple quotes ```. ' \
			'response should include no other text. fields: instruction, output.'
	chat_completion = client.chat.completions.create(
		messages=[{ 'role': 'user', 'content': generation_prompt }],
		model='llama3-70b-8192',
	)

	response = chat_completion.choices[0].message.content
	if not response:
		raise SystemExit('No response from the API.')

	# A truncated reply may lack the closing fence; add one so the regex below can still match.
	if not response.rstrip().endswith('```'):
		response += '\n```'

	# Take the text inside the first ``` fence. The optional non-capturing group
	# swallows a language tag line such as ```csv so it does not become the CSV header.
	match = re.search(r'```(?:[ \t]*[A-Za-z]+[ \t]*\r?\n)?(.*?)```', response, re.DOTALL)
	if match:
		csv_text = match.group(1).strip()
		with open(data_filename, 'w', encoding='utf-8') as file:
			file.write(csv_text)
	else:
		print(response)
		raise SystemExit('No data found in the response.')


training_data = pd.read_csv(data_filename)

# LLM-written headers can carry stray spaces (e.g. "instruction, output"); normalise
# them and fail early with a clear message instead of a KeyError in a later cell.
training_data.columns = training_data.columns.str.strip()
missing_columns = {'instruction', 'output'} - set(training_data.columns)
if missing_columns:
	raise ValueError(f'{data_filename} is missing required columns: {sorted(missing_columns)}')

print(f'Data size: {len(training_data)}')

training_data.head()

### Prepare the model for fine-tuning

In [None]:
# Maximum sequence length passed to both the model loader and the trainer.
max_seq_length = 2048
model = None
tokenizer = None

if os.path.exists(model_save_path):
	# A previous run saved a LoRA adapter here. Reload it through Unsloth rather
	# than transformers.AutoModel: AutoModel returns the bare backbone without a
	# language-modelling head, which cannot be used for generate() or passed to
	# FastLanguageModel.for_inference later in the notebook.
	model, tokenizer = FastLanguageModel.from_pretrained(
		model_name = model_save_path,
		max_seq_length = max_seq_length,
		dtype = None,  # None for auto-detection
		load_in_4bit = True, # must match how the adapter was trained
	)
else:
	model, tokenizer = FastLanguageModel.from_pretrained(
		model_name = 'unsloth/Phi-3-mini-4k-instruct',
		max_seq_length = max_seq_length,
		dtype = None,  # None for auto-detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
		load_in_4bit = True, # 4-bit quantization to reduce memory usage
	)

	# Wrap the base model with LoRA adapters; only these adapter weights are trained.
	model = FastLanguageModel.get_peft_model(
		model,
		r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
		target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
						"gate_proj", "up_proj", "down_proj",],
		lora_alpha = 16,
		lora_dropout = 0, # Supports any, but = 0 is optimized
		bias = "none",    # Supports any, but = "none" is optimized
		# [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
		use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
		random_state = 3407,
		use_rslora = False,  # We support rank stabilized LoRA
		loftq_config = None, # And LoftQ
	)

# Display the concrete model class as a quick sanity check.
model.__class__.__name__

### Prepare the dataset for fine-tuning

In [None]:
# Training/inference template with two positional slots:
# the first `{}` receives the instruction, the second the response
# (left empty at inference time so the model fills it in).
prompt = '\n'.join([
	'You are a customer support chatbot.',
	'Below is an instruction that describes a task that provides further context.',
	'Write a response that appropriately completes the request.',
	'',
	'### Instruction:',
	'{}',
	'',
	'### Response:',
	'{}',
])

def create_dataset(training_data):
	"""Render every (instruction, output) row into one training string.

	Args:
		training_data: Table-like object indexable by the ``'instruction'`` and
			``'output'`` keys, each yielding an iterable of values.

	Returns:
		A ``Dataset`` with a single ``text`` column holding the filled-in
		prompt template followed by the tokenizer's EOS token.
	"""
	question_column = training_data['instruction']
	answer_column = training_data['output']
	rendered_rows = [
		# the trailing EOS token teaches the model where a reply ends
		prompt.format(question, answer) + tokenizer.eos_token
		for question, answer in zip(question_column, answer_column)
	]
	return Dataset.from_dict({ 'text': rendered_rows })

### Train the model

In [None]:
trainer = None

def train_model(training_data, restore_trained_model=False):
	"""Fine-tune the notebook-level ``model`` on ``training_data`` and save it.

	Args:
		training_data: Table with ``instruction`` and ``output`` columns, as
			consumed by ``create_dataset``.
		restore_trained_model: When True and ``model_save_path`` exists, reload
			the previously saved model/tokenizer and continue from it. If the
			path does not exist, a message is printed and training proceeds
			with the model already loaded in the notebook.

	Side effects:
		Rebinds the notebook-level ``trainer`` (and ``model``/``tokenizer``
		when restoring), and writes the trained model and tokenizer to
		``model_save_path``.
	"""
	# `model` and `tokenizer` must be declared global: the restore branch assigns
	# them, which would otherwise make both names function-local and turn the
	# default (non-restore) path into an UnboundLocalError at SFTTrainer(...).
	global trainer, model, tokenizer

	if restore_trained_model:
		if os.path.exists(model_save_path):
			# Reload through Unsloth so the result keeps its language-modelling
			# head and stays compatible with FastLanguageModel.for_inference.
			model, tokenizer = FastLanguageModel.from_pretrained(
				model_name = model_save_path,
				max_seq_length = max_seq_length,
				dtype = None,
				load_in_4bit = True,
			)
		else:
			print('Model not found. Training from scratch.')
			restore_trained_model = False

	train_dataset = create_dataset(training_data)
	trainer = SFTTrainer(
		model = model,
		tokenizer = tokenizer,
		train_dataset = train_dataset,
		dataset_text_field = "text",
		max_seq_length = max_seq_length,
		dataset_num_proc = 2,
		packing = False, # Can make training 5x faster for short sequences.
		args = TrainingArguments(
			per_device_train_batch_size = 2,
			gradient_accumulation_steps = 4,
			warmup_steps = 5,
			max_steps = 60,
			learning_rate = 2e-4,
			# Use bf16 where the GPU supports it, otherwise fall back to fp16.
			fp16 = not torch.cuda.is_bf16_supported(),
			bf16 = torch.cuda.is_bf16_supported(),
			logging_steps = 1,
			optim = "adamw_8bit",
			weight_decay = 0.01,
			lr_scheduler_type = "linear",
			seed = 3407,
			output_dir = "outputs",
		),
	)

	if restore_trained_model:
		# NOTE(review): model_save_path is written by save_pretrained below, not by the
		# Trainer's checkpointing - confirm it is an acceptable resume_from_checkpoint target.
		trainer.train(resume_from_checkpoint = model_save_path)
	else:
		_ = trainer.train()

	# Save the model and tokenizer
	trainer.model.save_pretrained(model_save_path)
	trainer.tokenizer.save_pretrained(model_save_path)

train_model(training_data)

### Test the model

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

def ask_query(query, max_new_tokens=64):
	"""Generate the model's reply to a single customer query.

	Args:
		query: The customer's question, inserted into the instruction slot of
			the prompt template.
		max_new_tokens: Upper bound on the number of generated tokens
			(defaults to 64).

	Returns:
		The generated reply as a stripped string, without the prompt and
		without special tokens.
	"""
	inputs = tokenizer(
		[prompt.format(query, '')],  # response slot left blank for generation
		return_tensors = 'pt',
	).to('cuda')

	outputs = model.generate(**inputs, max_new_tokens = max_new_tokens, use_cache = True)

	# generate() returns the prompt ids followed by the continuation. Slice the
	# prompt off by length instead of searching the decoded text for 'Response:',
	# which would mis-split if the query itself contained that marker.
	prompt_length = inputs['input_ids'].shape[1]
	answer_ids = outputs[0][prompt_length:]

	# skip_special_tokens drops whatever EOS/pad tokens this tokenizer uses,
	# rather than relying on a hard-coded '<|endoftext|>' string.
	return tokenizer.decode(answer_ids, skip_special_tokens = True).strip()

ask_query('What are the payment options?')

In [None]:
# Second sample query; the returned string is displayed as the cell output.
ask_query('May I know the return policy?')