In [1]:
import torch
if torch.cuda.is_available():
    print('GPU is available for model fine tuning.')
else:
    print('GPU is not available for model fine tuning. Look into this matter before continuing.')

GPU is available for model fine tuning.


In [2]:
import torch
torch.cuda.empty_cache()

In [3]:
import sys
from pathlib import Path

# Use the current working directory as a reference for the parent directory
current_directory = Path.cwd()
parent_directory = current_directory.parent

# Add parent directory to sys.path
sys.path.append(str(parent_directory))

import constants
from db.sql_db import DB
from llm import LLM

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# db = DB(database_location='../db/sqlite_storage/main.db')
# llm = LLM(db, load_model_data_on_start=False)
# llm.generate_dataset_for_llm_fine_tuning('./data.csv')

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv('./data.csv')
df.output = df.output.astype('str')
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)
train_df

Unnamed: 0,input,output
77,Given the following customer profile and searc...,Dumbbells
436,Given the following customer profile and searc...,Baseball Glove
211,Given the following customer profile and searc...,Coffee Maker
192,Given the following customer profile and searc...,Piano
450,Given the following customer profile and searc...,Backpack
...,...,...
71,Given the following customer profile and searc...,Novel Book
106,Given the following customer profile and searc...,Xbox Series X
270,Given the following customer profile and searc...,Tablet
435,Given the following customer profile and searc...,Camera


In [6]:
(train_df.shape, valid_df.shape)

((467, 2), (117, 2))

In [7]:
# Convert dataframes to Dataset objects (for use in Hugging Face model)
import datasets
from datasets import Dataset

dataset_train = Dataset.from_pandas(train_df)
dataset_eval = Dataset.from_pandas(valid_df)
    
data_dict_dataset = datasets.DatasetDict({"train": dataset_train, "eval": dataset_eval})
data_dict_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 467
    })
    eval: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 117
    })
})

In [8]:
# model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
# labels = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)
# model_inputs["labels"] = labels["input_ids"]


In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model_id=constants.LLM_NAME
# model_id="google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [10]:
max_input_length = tokenizer.model_max_length
max_target_length = 30  # Adjust as needed

def preprocess_function(examples):
    # Tokenize inputs (e.g., customer profile and search history)
    inputs = [doc for doc in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)

    # Tokenize targets (e.g., recommended product)
    targets = [doc for doc in examples["output"]]
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True, padding=True)

    # Assign the tokenized labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# # Test preprocessing on first 2 rows
# preprocess_function(data_dict_dataset["train"][:2])

# Tokenize train and eval datasets
tokenized_datasets = data_dict_dataset.map(preprocess_function, batched=True)


Map: 100%|██████████| 467/467 [00:00<00:00, 4693.40 examples/s]
Map: 100%|██████████| 117/117 [00:00<00:00, 4547.07 examples/s]


In [11]:
batch_size = 10
learning_rate = 1e-4
num_train_epochs = 20

In [12]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
print(len(train_dataloader))

from transformers import get_scheduler
from torch.optim import AdamW

optimizer = AdamW(
    model.parameters(), lr=learning_rate, weight_decay=0.01, betas=(0.9, 0.98)
)

num_training_steps = num_train_epochs * len(train_dataloader)
num_warmup_steps = num_training_steps // 10  # 10% warmup steps
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

47
940


In [13]:
import numpy as np
import evaluate  # Import the 'evaluate' library instead of 'datasets'

# Load your evaluation metric (e.g., ROUGE, BLEU, etc.)
metric = evaluate.load("rouge")  # Example: for ROUGE

def postprocess_text(preds, labels):
  preds = [pred.strip() for pred in preds]
  labels = [[label.strip()] for label in labels]

  return preds, labels

def compute_metrics(eval_preds):
  preds, labels = eval_preds
  if isinstance(preds, tuple):
    preds = preds[0]
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

  # Replace -100 in the labes as we can't decode them.
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Some simple post processing
  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  result = metric.compute(predictions = decoded_preds, references = decoded_labels)
  # result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
  result = {key: value * 100 for key, value in result.items()}

  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
  result["gen_len"] = np.mean(prediction_lens)

  return result


In [14]:
# import numpy as np
# import evaluate  # Import the 'evaluate' library instead of 'datasets'
# from sklearn.metrics import precision_score, recall_score, f1_score

# # You can use a relevant metric like 'precision', 'recall', or 'ndcg'
# metric = evaluate.load("precision")  # Example: for precision@k

# def compute_metrics(eval_preds):
#     preds, labels = eval_preds
#     if isinstance(preds, tuple):
#         preds = preds[0]

#     # Decode the predictions and labels based on your task
#     decoded_preds = preds  # In recommendation, these could be item indices or ranks
#     decoded_labels = labels  # These could be ground truth labels for items the user interacted with

#     # You can apply additional post-processing here if needed, such as filtering items or truncating lists
    
#     # Calculate Precision, Recall, and F1-score (or any other suitable metrics)
#     precision = precision_score(decoded_labels, decoded_preds, average='binary')  # For binary relevance
#     recall = recall_score(decoded_labels, decoded_preds, average='binary')
#     f1 = f1_score(decoded_labels, decoded_preds, average='binary')

#     # Compute the evaluation metric
#     result = {
#         "precision": precision * 100,
#         "recall": recall * 100,
#         "f1": f1 * 100
#     }

#     # Optionally, you can add other ranking metrics (e.g., NDCG, MAP) if needed
#     # For instance, use `metric.compute()` if it's a ranking metric like NDCG

#     return result

In [15]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_recommendation",
    predict_with_generate=True,
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",        # Save checkpoints at the end of each epoch
    save_total_limit = 3,
    fp16=False,
    num_train_epochs=num_train_epochs, # Try 5-10 epochs; results may vary
    # gradient_accumulation_steps = 4,
    # eval_accumulation_steps = 4,
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["eval"],
    optimizers=(optimizer, lr_scheduler),  # Pass the optimizer and scheduler here
    compute_metrics=compute_metrics,
)
trainer.train()


  0%|          | 0/1180 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated

{'eval_loss': 5.004740238189697, 'eval_rouge1': 0.997150997150997, 'eval_rouge2': 0.0, 'eval_rougeL': 0.997150997150997, 'eval_rougeLsum': 0.997150997150997, 'eval_gen_len': 2.7264957264957266, 'eval_runtime': 2.474, 'eval_samples_per_second': 47.293, 'eval_steps_per_second': 6.063, 'epoch': 1.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 1.870519757270813, 'eval_rouge1': 17.94871794871795, 'eval_rouge2': 17.94871794871795, 'eval_rougeL': 17.94871794871795, 'eval_rougeLsum': 17.94871794871795, 'eval_gen_len': 3.965811965811966, 'eval_runtime': 2.3976, 'eval_samples_per_second': 48.799, 'eval_steps_per_second': 6.256, 'epoch': 2.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.944960355758667, 'eval_rouge1': 13.247863247863249, 'eval_rouge2': 10.256410256410255, 'eval_rougeL': 13.247863247863249, 'eval_rougeLsum': 13.247863247863249, 'eval_gen_len': 3.7777777777777777, 'eval_runtime': 2.5425, 'eval_samples_per_second': 46.018, 'eval_steps_per_second': 5.9, 'epoch': 3.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.70701664686203, 'eval_rouge1': 8.974358974358974, 'eval_rouge2': 6.837606837606838, 'eval_rougeL': 8.547008547008547, 'eval_rougeLsum': 8.547008547008547, 'eval_gen_len': 3.675213675213675, 'eval_runtime': 2.5017, 'eval_samples_per_second': 46.768, 'eval_steps_per_second': 5.996, 'epoch': 4.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.646721601486206, 'eval_rouge1': 14.102564102564102, 'eval_rouge2': 11.965811965811966, 'eval_rougeL': 14.102564102564102, 'eval_rougeLsum': 14.102564102564102, 'eval_gen_len': 3.752136752136752, 'eval_runtime': 2.521, 'eval_samples_per_second': 46.41, 'eval_steps_per_second': 5.95, 'epoch': 5.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.6135596632957458, 'eval_rouge1': 16.23931623931624, 'eval_rouge2': 11.965811965811966, 'eval_rougeL': 16.23931623931624, 'eval_rougeLsum': 16.23931623931624, 'eval_gen_len': 3.4615384615384617, 'eval_runtime': 2.736, 'eval_samples_per_second': 42.763, 'eval_steps_per_second': 5.482, 'epoch': 6.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.6263846755027771, 'eval_rouge1': 16.23931623931624, 'eval_rouge2': 14.529914529914532, 'eval_rougeL': 16.23931623931624, 'eval_rougeLsum': 15.81196581196581, 'eval_gen_len': 3.6153846153846154, 'eval_runtime': 2.8633, 'eval_samples_per_second': 40.862, 'eval_steps_per_second': 5.239, 'epoch': 7.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.6123302578926086, 'eval_rouge1': 16.23931623931624, 'eval_rouge2': 14.529914529914532, 'eval_rougeL': 16.23931623931624, 'eval_rougeLsum': 16.23931623931624, 'eval_gen_len': 3.6837606837606836, 'eval_runtime': 2.8548, 'eval_samples_per_second': 40.984, 'eval_steps_per_second': 5.254, 'epoch': 8.0}


 42%|████▏     | 501/1180 [02:17<02:30,  4.52it/s]

{'loss': 2.9236, 'grad_norm': 2.362779378890991, 'learning_rate': 5.2009456264775415e-05, 'epoch': 8.47}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.6052419543266296, 'eval_rouge1': 15.384615384615385, 'eval_rouge2': 11.965811965811966, 'eval_rougeL': 15.170940170940172, 'eval_rougeLsum': 15.384615384615385, 'eval_gen_len': 3.3162393162393164, 'eval_runtime': 2.8144, 'eval_samples_per_second': 41.572, 'eval_steps_per_second': 5.33, 'epoch': 9.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.5794662237167358, 'eval_rouge1': 13.675213675213676, 'eval_rouge2': 11.965811965811966, 'eval_rougeL': 13.675213675213676, 'eval_rougeLsum': 13.675213675213676, 'eval_gen_len': 3.8034188034188032, 'eval_runtime': 2.4917, 'eval_samples_per_second': 46.955, 'eval_steps_per_second': 6.02, 'epoch': 10.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.5766977071762085, 'eval_rouge1': 15.384615384615385, 'eval_rouge2': 10.256410256410255, 'eval_rougeL': 15.384615384615385, 'eval_rougeLsum': 15.384615384615385, 'eval_gen_len': 3.7948717948717947, 'eval_runtime': 2.5756, 'eval_samples_per_second': 45.426, 'eval_steps_per_second': 5.824, 'epoch': 11.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.5762546062469482, 'eval_rouge1': 14.529914529914532, 'eval_rouge2': 12.82051282051282, 'eval_rougeL': 14.529914529914532, 'eval_rougeLsum': 14.529914529914532, 'eval_gen_len': 3.7264957264957266, 'eval_runtime': 2.5545, 'eval_samples_per_second': 45.802, 'eval_steps_per_second': 5.872, 'epoch': 12.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.5749817490577698, 'eval_rouge1': 16.23931623931624, 'eval_rouge2': 14.529914529914532, 'eval_rougeL': 16.23931623931624, 'eval_rougeLsum': 16.23931623931624, 'eval_gen_len': 3.6666666666666665, 'eval_runtime': 2.9544, 'eval_samples_per_second': 39.602, 'eval_steps_per_second': 5.077, 'epoch': 13.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.5760995745658875, 'eval_rouge1': 13.675213675213676, 'eval_rouge2': 11.11111111111111, 'eval_rougeL': 13.675213675213676, 'eval_rougeLsum': 13.675213675213676, 'eval_gen_len': 3.5811965811965814, 'eval_runtime': 3.0002, 'eval_samples_per_second': 38.998, 'eval_steps_per_second': 5.0, 'epoch': 14.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.5762879848480225, 'eval_rouge1': 15.384615384615385, 'eval_rouge2': 10.256410256410255, 'eval_rougeL': 15.384615384615385, 'eval_rougeLsum': 15.384615384615385, 'eval_gen_len': 3.52991452991453, 'eval_runtime': 2.8587, 'eval_samples_per_second': 40.928, 'eval_steps_per_second': 5.247, 'epoch': 15.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.5727871656417847, 'eval_rouge1': 15.384615384615385, 'eval_rouge2': 12.82051282051282, 'eval_rougeL': 15.384615384615385, 'eval_rougeLsum': 15.384615384615385, 'eval_gen_len': 3.58974358974359, 'eval_runtime': 2.671, 'eval_samples_per_second': 43.804, 'eval_steps_per_second': 5.616, 'epoch': 16.0}


 85%|████████▍ | 1001/1180 [04:35<00:38,  4.68it/s]

{'loss': 0.479, 'grad_norm': 2.594365358352661, 'learning_rate': 0.0, 'epoch': 16.95}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.5727871656417847, 'eval_rouge1': 15.384615384615385, 'eval_rouge2': 12.82051282051282, 'eval_rougeL': 15.384615384615385, 'eval_rougeLsum': 15.384615384615385, 'eval_gen_len': 3.58974358974359, 'eval_runtime': 2.533, 'eval_samples_per_second': 46.19, 'eval_steps_per_second': 5.922, 'epoch': 17.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.5727871656417847, 'eval_rouge1': 15.384615384615385, 'eval_rouge2': 12.82051282051282, 'eval_rougeL': 15.384615384615385, 'eval_rougeLsum': 15.384615384615385, 'eval_gen_len': 3.58974358974359, 'eval_runtime': 2.4717, 'eval_samples_per_second': 47.336, 'eval_steps_per_second': 6.069, 'epoch': 18.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.5727871656417847, 'eval_rouge1': 15.384615384615385, 'eval_rouge2': 12.82051282051282, 'eval_rougeL': 15.384615384615385, 'eval_rougeLsum': 15.384615384615385, 'eval_gen_len': 3.58974358974359, 'eval_runtime': 3.0171, 'eval_samples_per_second': 38.779, 'eval_steps_per_second': 4.972, 'epoch': 19.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.5727871656417847, 'eval_rouge1': 15.384615384615385, 'eval_rouge2': 12.82051282051282, 'eval_rougeL': 15.384615384615385, 'eval_rougeLsum': 15.384615384615385, 'eval_gen_len': 3.58974358974359, 'eval_runtime': 4.5846, 'eval_samples_per_second': 25.52, 'eval_steps_per_second': 3.272, 'epoch': 20.0}


100%|██████████| 1180/1180 [05:32<00:00,  3.55it/s]

{'train_runtime': 332.7569, 'train_samples_per_second': 28.069, 'train_steps_per_second': 3.546, 'train_loss': 1.511996388839463, 'epoch': 20.0}





TrainOutput(global_step=1180, training_loss=1.511996388839463, metrics={'train_runtime': 332.7569, 'train_samples_per_second': 28.069, 'train_steps_per_second': 3.546, 'total_flos': 1736217313935360.0, 'train_loss': 1.511996388839463, 'epoch': 20.0})

In [16]:
# Save the trained model
model.save_pretrained(constants.LLM_FINE_TUNED_SAVE_PATH.split('/')[-1])

In [17]:
# Save the tokenizer (vocab, special tokens, etc.)
tokenizer.save_pretrained(constants.LLM_FINE_TUNED_TOKENIZER_PATH.split('/')[-1])

('fine-tuned-tokenizer\\tokenizer_config.json',
 'fine-tuned-tokenizer\\special_tokens_map.json',
 'fine-tuned-tokenizer\\spiece.model',
 'fine-tuned-tokenizer\\added_tokens.json',
 'fine-tuned-tokenizer\\tokenizer.json')

In [18]:
%%time
# Try predictions on validation set for confirmation
predictions = trainer.predict(tokenized_datasets["eval"])

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  0%|          | 0/15 [00:00<?, ?it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
 13%|█▎        | 2/15 [00:00<00:01,  9.49it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
 20%|██        | 3/15 [00:00<00:01,  6.88it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
 27%|██▋       | 4/15 [00:00<00:01,  6.29it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should us

CPU times: total: 516 ms
Wall time: 3.14 s





In [19]:
# Convert tokens from data to text
def translate(tokens):
  my_list = tokenizer.convert_ids_to_tokens(tokens)
  new_list = [token for token in my_list if ('<' not in token)] # token != '' and token != ''
  new_string = ''.join(new_list)
  new_string = new_string.replace("▁", " ")
  # new_string = new_string.replace("_", " ")
  new_string = new_string.strip()
  return new_string
     

# Print sample predicted output
index = 31
print(tokenized_datasets["eval"]["input"][index])
print("Target product: ", tokenized_datasets["eval"]["output"][index])
print("Recommended product: ", translate(predictions.predictions[index]))

index = 22
print(tokenized_datasets["eval"]["input"][index])
print("Target product: ", tokenized_datasets["eval"]["output"][index])
print("Recommended product: ", translate(predictions.predictions[index]))
     


Given the following customer profile and search history, suggest the most relevant product from the provided list of candidates. Return only the name of the product that best matches the customer's needs based on their profile and search activity.
Customer Profile: 
Search History: Suitcase,Dumbbells,Running Shoes,Sunglasses,Headphones,Phone Case,Protein Powder,Camera,Cooking Tools,Cooking Tools,Travel Adapter,Wireless Earbuds,Microwave
Products for Recommendation: ['Smartphone', 'Laptop', 'Smartwatch', 'Wireless Earbuds', 'Tablet', 'Dumbbells', 'Treadmill', 'Yoga Mat', 'Resistance Bands', 'Protein Powder', 'Novel Book', 'Textbook', 'Cookbook', 'Biography', "Children's Book", 'Guitar', 'Piano', 'Drums', 'Violin', 'Headphones', 'Jacket', 'Sneakers', 'Watch', 'Handbag', 'Sunglasses', 'PlayStation 5', 'Xbox Series X', 'Gaming Chair', 'VR Headset', 'Game Controller', 'Sofa', 'Dining Table', 'Lamp', 'Coffee Table', 'Wall Art', 'Backpack', 'Luggage Set', 'Travel Pillow', 'Camera', 'Travel Ad

In [20]:
# Collect generated outputs and join with prompts and targets
model_generated = []
prompt_list = []
target_list = []

for i in range(len(predictions.predictions)):
  model_generated.append(translate(predictions.predictions[i]))

  prompt_list.append(dataset_eval['input'][i])
  target_list.append(dataset_eval['output'][i])
     

df_target_and_generated = pd.DataFrame()

df_target_and_generated['input'] = prompt_list
df_target_and_generated['target'] = target_list
df_target_and_generated['model_generated'] = model_generated

df_target_and_generated.to_csv('fine-tune-result.csv', index=False)

df_target_and_generated


Unnamed: 0,input,target,model_generated
0,Given the following customer profile and searc...,Guitar,Piano
1,Given the following customer profile and searc...,Violin,Baseball Glove
2,Given the following customer profile and searc...,Game Controller,Baseball Glove
3,Given the following customer profile and searc...,Treadmill,Baseball Glove
4,Given the following customer profile and searc...,Baseball Glove,Children's Book
...,...,...,...
112,Given the following customer profile and searc...,VR Headset,Baseball Glove
113,Given the following customer profile and searc...,Children's Book,Children's Book
114,Given the following customer profile and searc...,Soccer Cleats,Laptop
115,Given the following customer profile and searc...,Dumbbells,Treadmill
