# Part 2 - Fine Tune LLM
- Fine tune LLM on our synthetically generated dataset to generate domain names from descriptions.

In [1]:
!pip install -q transformers torch peft rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m93.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m76.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.9 MB/s[0m et

In [2]:
import time
import pandas as pd
import json

In [3]:
# Load in data
df = pd.read_csv('/kaggle/input/domain-name-generator/data/domain_names_with_descriptions.csv')
df.head()

Unnamed: 0,business_description,domain_name
0,A mobile app that helps people find local farm...,freshfinds.app
1,An online subscription service for eco-friendl...,greenbundle.com
2,A platform that connects freelance graphic des...,designsbydesigners.com
3,"A subscription service that delivers fresh, or...",farm2table.com
4,A website that helps users find and book pet-f...,pawlovelovers.com


In [4]:
# Create dataset object for training
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
import torch
from sklearn.model_selection import train_test_split

2025-07-26 20:31:33.481672: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753561893.684008      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753561893.742682      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# Combine into prompt: "Business: {description} -> Domain: {domain_name}" for autoregressie model
df["text"] = df.apply(lambda row: f"Business: {row['business_description']} -> Domain: {row['domain_name']}", axis=1)

# Split data set and turn into data set objects
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df.to_csv('train.csv',index=False)
test_df.to_csv('test.csv',index=False)

train_dataset = Dataset.from_pandas(train_df[["text"]])
test_dataset = Dataset.from_pandas(test_df[["text"]])

# Create dict of both training and testing for easy access in Trainer
datasets = DatasetDict({"train": train_dataset, "test": test_dataset})

In [6]:
model_name_or_path = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [7]:
# Load base model and tokenize
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="auto" # put onto cuda automatically if available
)

def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = datasets.map(tokenize, batched=True)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [8]:
# Create LoRA (parameter-efficient fine-tuning) and wrap model in it
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, peft_config)

In [9]:
# Create data collator to patch, dynamically pad, etc.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [10]:
'''
# Create loop over hyperparameter space for a few basic params
#learning_rates = [5e-5, 2e-4, 1e-3]
learning_rates = [2e-4]
#batch_sizes = [2, 4]
batch_sizes = [4]

for lr in learning_rates:
    for bs in batch_sizes:
        training_args = TrainingArguments(
            output_dir=f"./outputs/lr{lr}_bs{bs}",
            per_device_train_batch_size=bs,
            num_train_epochs=1,
            learning_rate=lr,
            save_total_limit=2,
            save_strategy="epoch",
            logging_dir=f"./logs/lr{lr}_bs{bs}",
            report_to="none"
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset["test"],
            data_collator=data_collator,
            tokenizer=tokenizer
        )

        # Run training and save model
        trainer.train()
        #trainer.save_model(f"./outputs/lr{lr}_bs{bs}/checkpoint")

        # Do versioning/logging and record some json metat data
        output_dir = f"./checkpoints/llama-lr{lr}-bs{bs}"
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)

        meta = {
            "model": "llama-3.2-1b-instruct",
            "lr": lr,
            "batch_size": bs,
            "description": "LoRA fine-tuned on augmented business data",
        }
        with open(f"{output_dir}/metadata.json", "w") as f:
            json.dump(meta, f, indent=4)
'''

'\n# Create loop over hyperparameter space for a few basic params\n#learning_rates = [5e-5, 2e-4, 1e-3]\nlearning_rates = [2e-4]\n#batch_sizes = [2, 4]\nbatch_sizes = [4]\n\nfor lr in learning_rates:\n    for bs in batch_sizes:\n        training_args = TrainingArguments(\n            output_dir=f"./outputs/lr{lr}_bs{bs}",\n            per_device_train_batch_size=bs,\n            num_train_epochs=1,\n            learning_rate=lr,\n            save_total_limit=2,\n            save_strategy="epoch",\n            logging_dir=f"./logs/lr{lr}_bs{bs}",\n            report_to="none"\n        )\n\n        trainer = Trainer(\n            model=model,\n            args=training_args,\n            train_dataset=tokenized_dataset["train"],\n            eval_dataset=tokenized_dataset["test"],\n            data_collator=data_collator,\n            tokenizer=tokenizer\n        )\n\n        # Run training and save model\n        trainer.train()\n        #trainer.save_model(f"./outputs/lr{lr}_bs{bs}/che

In [11]:
# Create training arguments, data-collator ()
training_args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=1000,
    save_total_limit=2,
    report_to="none"
)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [12]:
trainer.train()

# Save final model
trainer.save_model("./fine-tuned-llama-domain-generator")
tokenizer.save_pretrained("./fine-tuned-llama-domain-generator")

Step,Training Loss


('./fine-tuned-llama-domain-generator/tokenizer_config.json',
 './fine-tuned-llama-domain-generator/special_tokens_map.json',
 './fine-tuned-llama-domain-generator/chat_template.jinja',
 './fine-tuned-llama-domain-generator/tokenizer.json')

## Evaluating model
- We do have some eval metrics above that computes "language model loss" which measures model's ability to predict the next token, not necessarily how relevant/quality the output is.
- Thus, we implement some rouge scoring
    - ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used to evaluate the quality of machine-generated summaries by comparing them to human-written reference summaries. 

In [13]:
from rouge_score import rouge_scorer
import matplotlib.pyplot as plt

In [14]:
# get rouge scorer
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

In [15]:
# Put pytorch model in eval mode
model.eval()

scores = []
predictions = []

# Loop over test data
for item in test_dataset:
    # Grab formatted text input to llama
    input_text = item["text"]
    if "-> Domain:" not in input_text:
        continue
    # Split the texts into the descriptions and the website names
    description = input_text.split("-> Domain:")[0].replace("Business:", "").strip()
    true_domain = input_text.split("-> Domain:")[-1].strip()
    # Now re format into a prompt without the website name and re tokenize
    prompt = f"Business: {description} -> Domain:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # Have the model output stuff given 
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=20, do_sample=True, top_k=50, temperature=0.7)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred_domain = decoded.split("-> Domain:")[-1].strip()

    # Safety filter
    banned_keywords = ["sex", "porn", "kill", "drugs", "hate", "murder"]
    if any(bad in pred_domain.lower() for bad in banned_keywords):
        pred_domain = "[REDACTED: Unsafe Output]"

    score = scorer.score(true_domain, pred_domain)["rougeL"].fmeasure
    predictions.append({"description": description, "true": true_domain, "pred": pred_domain, "rougeL": score})
    scores.append(score)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [16]:
# Save predictions
results_df = pd.DataFrame(predictions)
results_df.to_csv("predictions_with_rouge.csv", index=False)

# Plot
plt.figure(figsize=(8, 4))
plt.hist(scores, bins=20, color="skyblue", edgecolor="black")
plt.title("ROUGE-L Score Distribution")
plt.xlabel("ROUGE-L F1 Score")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.savefig("rouge_score_histogram.png")
plt.close()

print(f"Average ROUGE-L score: {sum(scores)/len(scores):.4f}")

Average ROUGE-L score: 0.1333
