In [1]:
import torch
if torch.cuda.is_available():
    print('GPU is available for model fine tuning.')
else:
    print('GPU is not available for model fine tuning. Look into this matter before continuing.')

GPU is available for model fine tuning.


In [38]:
import torch
torch.cuda.empty_cache()


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# Generating csv dataset for fine tuning

In [1]:
import sys
from pathlib import Path

# Use the current working directory as a reference for the parent directory
current_directory = Path.cwd()
parent_directory = current_directory.parent

# Add parent directory to sys.path
sys.path.append(str(parent_directory))

import constants
from db.sql_db import DB
from llm import LLM

db = DB(database_location='../db/sqlite_storage/main.db')
llm = LLM(db, load_model_data_on_start=False)
llm.generate_dataset_for_llm_fine_tuning('./data.csv')

  from .autonotebook import tqdm as notebook_tqdm


Collection 'product' already exists.
2025-01-11 10:22:28,763 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-01-11 10:22:28,774 INFO sqlalchemy.engine.Engine SELECT recommendationfeedback.recommendation_feedback_id, recommendationfeedback.recommendation_id, recommendationfeedback.user_id, recommendationfeedback.rating, recommendationfeedback.created_at 
FROM recommendationfeedback 
WHERE recommendationfeedback.rating >= ?
2025-01-11 10:22:28,775 INFO sqlalchemy.engine.Engine [generated in 0.00070s] (3,)
2025-01-11 10:22:28,779 INFO sqlalchemy.engine.Engine ROLLBACK
2025-01-11 10:22:28,780 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-01-11 10:22:28,781 INFO sqlalchemy.engine.Engine SELECT recommendation.recommendation_id, recommendation.user_id, recommendation.product_id, recommendation.score, recommendation.created_at 
FROM recommendation 
WHERE recommendation.recommendation_id = ?
2025-01-11 10:22:28,782 INFO sqlalchemy.engine.Engine [generated in 0.00080s] (1,)
2025-01-11 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


2025-01-11 10:22:29,004 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-01-11 10:22:29,005 INFO sqlalchemy.engine.Engine SELECT product.product_id, product.category_id, product.name, product.description, product.price, product.stock, product.created_at, product.updated_at 
FROM product 
WHERE product.product_id IN (SELECT 1 FROM (SELECT 1) WHERE 1!=1)
2025-01-11 10:22:29,005 INFO sqlalchemy.engine.Engine [generated in 0.00042s] ()
2025-01-11 10:22:29,006 INFO sqlalchemy.engine.Engine ROLLBACK
2025-01-11 10:22:29,007 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-01-11 10:22:29,008 INFO sqlalchemy.engine.Engine SELECT recommendation.recommendation_id, recommendation.user_id, recommendation.product_id, recommendation.score, recommendation.created_at 
FROM recommendation 
WHERE recommendation.recommendation_id = ?
2025-01-11 10:22:29,008 INFO sqlalchemy.engine.Engine [cached since 0.2267s ago] (2,)
2025-01-11 10:22:29,009 INFO sqlalchemy.engine.Engine ROLLBACK
2025-01-11 10:22:29

# Data Preparation

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('./data.csv')
df.output = df.output.astype('str')
df

Unnamed: 0,input,output
0,\nTask: Recommend the most relevant product fr...,Novel Book
1,\nTask: Recommend the most relevant product fr...,Cookbook
2,\nTask: Recommend the most relevant product fr...,Xbox Series X
3,\nTask: Recommend the most relevant product fr...,VR Headset
4,\nTask: Recommend the most relevant product fr...,Gaming Chair
...,...,...
579,\nTask: Recommend the most relevant product fr...,Drums
580,\nTask: Recommend the most relevant product fr...,Camera
581,\nTask: Recommend the most relevant product fr...,Children's Book
582,\nTask: Recommend the most relevant product fr...,Tablet


In [24]:
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)
(train_df.shape, valid_df.shape)

((467, 2), (117, 2))

In [25]:
# Convert dataframes to Dataset objects (for use in Hugging Face model)
import datasets
from datasets import Dataset

dataset_train = Dataset.from_pandas(train_df)
dataset_eval = Dataset.from_pandas(valid_df)
    
data_dict_dataset = datasets.DatasetDict({"train": dataset_train, "eval": dataset_eval})
data_dict_dataset
     


DatasetDict({
    train: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 467
    })
    eval: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 117
    })
})

In [26]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained("t5-small")
# model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

MODEL_NAME = constants.LLM_NAME

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [27]:
max_input_length = tokenizer.model_max_length
max_target_length = 30  # Adjust as needed

def preprocess_function(examples):
    # Tokenize inputs (e.g., customer profile and search history)
    inputs = [doc for doc in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)

    # Tokenize targets (e.g., recommended product)
    targets = [doc for doc in examples["output"]]
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True, padding=True)

    # Assign the tokenized labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Test preprocessing on first 2 rows
preprocess_function(data_dict_dataset["train"][:2])


{'input_ids': [[16107, 10, 419, 287, 904, 26, 8, 167, 2193, 556, 45, 8, 937, 570, 3, 390, 30, 8, 787, 884, 3278, 11, 960, 892, 5, 86, 2562, 9487, 10, 3, 18, 7327, 14226, 10, 27, 333, 7081, 11, 7868, 1277, 5, 3, 18, 4769, 5528, 10, 309, 5937, 7708, 7, 6, 1649, 7, 23, 8389, 4483, 7, 6, 4051, 18613, 6, 517, 265, 53, 23955, 6, 3174, 17, 2455, 21957, 6, 308, 5937, 7708, 7, 6, 25838, 5880, 6, 3174, 17, 2455, 21957, 6, 4051, 18613, 6, 1649, 7, 23, 8389, 4483, 7, 6, 1649, 7, 23, 8389, 4483, 7, 6, 3174, 17, 2455, 21957, 6, 4051, 18613, 6, 308, 5937, 7708, 7, 6, 23617, 1976, 3, 18, 6246, 6792, 21, 419, 287, 904, 26, 257, 10, 784, 31, 24656, 6399, 31, 6, 3, 31, 3612, 102, 2916, 31, 6, 3, 31, 24656, 9237, 31, 6, 3, 31, 13601, 15, 924, 262, 291, 11073, 7, 31, 6, 3, 31, 20354, 17, 31, 6, 3, 31, 308, 5937, 7708, 7, 31, 6, 3, 31, 382, 5236, 12415, 31, 6, 3, 31, 476, 19914, 5708, 31, 6, 3, 31, 1649, 7, 23, 8389, 4483, 7, 31, 6, 3, 31, 3174, 17, 2455, 21957, 31, 6, 3, 31, 4168, 4911, 3086, 31, 6, 3, 31,

In [28]:
# Tokenize train and eval datasets
tokenized_datasets = data_dict_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 467/467 [00:00<00:00, 902.02 examples/s]
Map: 100%|██████████| 117/117 [00:00<00:00, 859.24 examples/s]


In [29]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input', 'output', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 467
    })
    eval: Dataset({
        features: ['input', 'output', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 117
    })
})

In [30]:
# Instantiate Data Loader for train and eval sets
# Adjust batch sizes as necessary

from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=10, collate_fn=data_collator
)

eval_dataloader = DataLoader(
    tokenized_datasets["eval"], batch_size=10, collate_fn=data_collator
)
     

len(train_dataloader)
     


47

# Fine tuning

In [31]:
### Select Optimizer (for regularization)

from transformers import get_scheduler
from torch.optim import AdamW

learning_rate = 1e-4
optimizer = AdamW(model.parameters(), lr=learning_rate)

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

235


In [32]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Instantiate training arguments object
batch_size = 10
args = Seq2SeqTrainingArguments(
    "./t5_recommendation",
    # push_to_hub=True, # Comment out if you don't want to push to Hugging Face Hub
    eval_strategy = "epoch",
    learning_rate = 1e-4,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    weight_decay = 0.01,
    save_total_limit = 3,
    num_train_epochs = 10, # Try 5-10 epochs; results may vary
    predict_with_generate = True,
    gradient_accumulation_steps = 4,
    eval_accumulation_steps = 4,
)
   

In [33]:
import evaluate  # Import the 'evaluate' library instead of 'datasets'

# Load your evaluation metric (e.g., ROUGE, BLEU, etc.)
metric = evaluate.load("rouge")  # Example: for ROUGE
metric

EvaluationModule(name: "rouge", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id=None)}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLsum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/

In [34]:
# Functions for further preprocessing and metrics computation
import numpy as np

def postprocess_text(preds, labels):
  preds = [pred.strip() for pred in preds]
  labels = [[label.strip()] for label in labels]

  return preds, labels

def compute_metrics(eval_preds):
  preds, labels = eval_preds
  if isinstance(preds, tuple):
    preds = preds[0]
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

  # Replace -100 in the labes as we can't decode them.
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Some simple post processing
  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  result = metric.compute(predictions = decoded_preds, references = decoded_labels)
  # result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
  result = {key: value * 100 for key, value in result.items()}

  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
  result["gen_len"] = np.mean(prediction_lens)

  return result

In [36]:
# Instantiate Trainer object (for fine-tuning)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["eval"],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
    optimizers = (optimizer, lr_scheduler)
)

  trainer = Seq2SeqTrainer(


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [23]:
# Train time should take a few minutes or less if on GPU
# Can take up to several hours if on CPU
trainer.train()

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [17]:
# Save the trained model
model.save_pretrained(constants.LLM_FINE_TUNED_SAVE_PATH.split('/')[-1])

In [18]:
# Save the tokenizer (vocab, special tokens, etc.)
tokenizer.save_pretrained(constants.LLM_FINE_TUNED_TOKENIZER_PATH.split('/')[-1])

('fine-tuned-tokenizer\\tokenizer_config.json',
 'fine-tuned-tokenizer\\special_tokens_map.json',
 'fine-tuned-tokenizer\\spiece.model',
 'fine-tuned-tokenizer\\added_tokens.json')

# Collect evaluation data predictions

In [19]:
%%time
# Try predictions on validation set for confirmation
predictions = trainer.predict(tokenized_datasets["eval"])

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  0%|          | 0/6 [00:00<?, ?it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer

CPU times: total: 1.16 s
Wall time: 2.59 s





In [20]:
predictions

PredictionOutput(predictions=array([[    0, 22398,  9840, ...,     0,     0,     0],
       [    0, 22398,  9840, ...,     0,     0,     0],
       [    0, 22398,  9840, ...,     0,     0,     0],
       ...,
       [    0, 22398,  9840, ...,     0,     0,     0],
       [    0, 22398,  9840, ...,     0,     0,     0],
       [    0, 18613,  6651, ...,     0,     0,     0]], shape=(117, 20)), label_ids=array([[23176,     1,     0, ...,     0,     0,     0],
       [ 1813, 12057,     1, ...,     0,     0,     0],
       [ 4435, 23105,     1, ...,     0,     0,     0],
       ...,
       [23375,  4779,  1544, ...,     0,     0,     0],
       [  309,  5937,  7708, ...,     0,     0,     0],
       [14480,  4531,     3, ...,     0,     0,     0]], shape=(117, 20)), metrics={'test_loss': 2.0846316814422607, 'test_rouge1': 17.52136752136752, 'test_rouge2': 17.094017094017094, 'test_rougeL': 17.52136752136752, 'test_rougeLsum': 17.52136752136752, 'test_gen_len': 3.965811965811966, 'test_runt

In [21]:
# Convert tokens from data to text
def translate(tokens):
  my_list = tokenizer.convert_ids_to_tokens(tokens)
  new_list = [token for token in my_list if ('<' not in token)] # token != '' and token != ''
  new_string = ''.join(new_list)
  new_string = new_string.replace("▁", " ")
  # new_string = new_string.replace("_", " ")
  new_string = new_string.strip()
  return new_string
     

# Print sample predicted output
index = 31
print(tokenized_datasets["eval"]["input"][index])
print("Target product: ", tokenized_datasets["eval"]["output"][index])
print("Recommended product: ", translate(predictions.predictions[index]))
     


Based on the customer profile and search history, recommend the most relevant product name as exactly provided in the CANDIDATES FOR RECOMMENDATION below:
Customer Profile: 
Search History: Suitcase,Dumbbells,Running Shoes,Sunglasses,Headphones,Phone Case,Protein Powder,Camera,Cooking Tools,Cooking Tools,Travel Adapter,Wireless Earbuds,Microwave
CANDIDATES FOR RECOMMENDATION (product names): ['Smartphone', 'Laptop', 'Smartwatch', 'Wireless Earbuds', 'Tablet', 'Dumbbells', 'Treadmill', 'Yoga Mat', 'Resistance Bands', 'Protein Powder', 'Novel Book', 'Textbook', 'Cookbook', 'Biography', "Children's Book", 'Guitar', 'Piano', 'Drums', 'Violin', 'Headphones', 'Jacket', 'Sneakers', 'Watch', 'Handbag', 'Sunglasses', 'PlayStation 5', 'Xbox Series X', 'Gaming Chair', 'VR Headset', 'Game Controller', 'Sofa', 'Dining Table', 'Lamp', 'Coffee Table', 'Wall Art', 'Backpack', 'Luggage Set', 'Travel Pillow', 'Camera', 'Travel Adapter', 'Football', 'Basketball', 'Tennis Racket', 'Baseball Glove', 'Socce

In [22]:
# Collect generated outputs and join with prompts and targets
model_generated = []
prompt_list = []
target_list = []

for i in range(len(predictions.predictions)):
  model_generated.append(translate(predictions.predictions[i]))

  prompt_list.append(dataset_eval['input'][i])
  target_list.append(dataset_eval['output'][i])
     

df_target_and_generated = pd.DataFrame()

df_target_and_generated['input'] = prompt_list
df_target_and_generated['target'] = target_list
df_target_and_generated['model_generated'] = model_generated

df_target_and_generated.to_csv('fine-tune-result.csv', index=False)

df_target_and_generated


Unnamed: 0,input,target,model_generated
0,Based on the customer profile and search histo...,Guitar,Baseball Glove
1,Based on the customer profile and search histo...,Violin,Baseball Glove
2,Based on the customer profile and search histo...,Game Controller,Baseball Glove
3,Based on the customer profile and search histo...,Treadmill,Baseball Glove
4,Based on the customer profile and search histo...,Baseball Glove,Baseball Glove
...,...,...,...
112,Based on the customer profile and search histo...,VR Headset,Baseball Glove
113,Based on the customer profile and search histo...,Children's Book,Baseball Glove
114,Based on the customer profile and search histo...,Soccer Cleats,Baseball Glove
115,Based on the customer profile and search histo...,Dumbbells,Baseball Glove


# Inferencing

In [23]:
model_inference = T5ForConditionalGeneration.from_pretrained(constants.LLM_FINE_TUNED_SAVE_PATH.split('/')[-1])

In [24]:
import torch

def generate_text(input_query, model, tokenizer, max_length=30):
    """
    Function to generate text using a fine-tuned T5 model.
    
    Parameters:
    - input_query (str): The input text for the model to process.
    - model: The pre-trained T5 model.
    - tokenizer: The tokenizer used for encoding the input and decoding the output.
    - max_length (int): Maximum length of the generated text.

    Returns:
    - str: The generated output text.
    """
    
    # Check if GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Move model to the same device as the input
    model.to(device)
    
    # Tokenize the input query
    inputs = tokenizer(input_query, return_tensors="pt").to(device)
    
    # Set model to evaluation mode
    model.eval()
    
    # Disable gradient calculation during inference
    with torch.no_grad():
        # Generate text using the model
        outputs = model.generate(
            inputs['input_ids'], 
            max_length=max_length, 
            num_beams=5,  # You can adjust this for more diverse text
            early_stopping=True
        )
        
    # Decode the output tokens into text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return generated_text

# Example usage:
input_query = df_target_and_generated.iloc[0, 0]
generated_text = generate_text(input_query, model_inference, tokenizer)
generated_text

'Guitar'