# Part 2 - Fine Tune LLM
- Fine tune LLM on our synthetically generated dataset to generate domain names from descriptions.
- Do preliminary evaluation using various standard NLP metrics such as rouge, bleu, levenshtein, etc.
- Due to time and Kaggle storage constraints, we'll just run this script changing different hyperparameters by hand as opposed to running a for loop over multiple configs, which would lead to a huge amount of space being used that will exceed the output allowance.
- We will ultiamtely vary:
    - Batch Size
    - Learning Rate
    - Switch on/off LoRA and PEFT

In [1]:
VERSION = 'v2'

In [2]:
!pip install -q transformers torch peft rouge_score levenshtein

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m100.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.2 MB/s[0m 

In [3]:
import time
import pandas as pd
import json

In [4]:
# Load in data
df = pd.read_csv('/kaggle/input/domain-name-generator/data/domain_names_with_descriptions.csv')
df.head()

Unnamed: 0,business_description,domain_name,category
0,A mobile app that helps people find local farm...,freshfinds.app,
1,An online subscription service for eco-friendl...,greenbundle.com,
2,A platform that helps small businesses adverti...,smallbusiness.co,
3,A website that connects pet owners with local ...,petsitter.com,
4,A blog platform for creative writers to share ...,writeon.com,


In [5]:
# Create dataset object for training
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
import torch
from sklearn.model_selection import train_test_split

2025-07-29 19:03:14.538784: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753815794.709256      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753815794.762436      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
# Combine into prompt: "Business: {description} -> Domain: {domain_name}" for autoregressie model
df["text"] = df.apply(lambda row: f"Business: {row['business_description']} -> Domain: {row['domain_name']}", axis=1)

# Split data set and turn into data set objects
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df.to_csv('train.csv',index=False)
test_df.to_csv('test.csv',index=False)

train_dataset = Dataset.from_pandas(train_df[["text"]])
test_dataset = Dataset.from_pandas(test_df[["text"]])

# Create dict of both training and testing for easy access in Trainer
datasets = DatasetDict({"train": train_dataset, "test": test_dataset})

In [7]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

In [8]:
model_name_or_path = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"
#tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct", 
    trust_remote_code=True,
    use_auth_token=hf_token)
tokenizer.pad_token = tokenizer.eos_token



tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [9]:
# Load base model and tokenize
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    #torch_dtype=torch.bfloat16,
    torch_dtype=torch.float16,
    device_map="auto" # put onto cuda automatically if available
)

def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = datasets.map(tokenize, batched=True)

Map:   0%|          | 0/175 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [10]:
# Create LoRA (parameter-efficient fine-tuning) and wrap model in it
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Commenting out removes peft and does full fine tuning
#model = get_peft_model(model, peft_config)

In [11]:
# Create data collator to patch, dynamically pad, etc.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [12]:
# Create training arguments, data-collator ()

training_args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    #fp16=False,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=1000,
    save_total_limit=2,
    report_to="none"
)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [13]:
#trainer.train()

# Save final model
trainer.save_model(f"./fine-tuned-llama-domain-generator-{VERSION}")
tokenizer.save_pretrained(f"./fine-tuned-llama-domain-generator-{VERSION}")

('./fine-tuned-llama-domain-generator-v2/tokenizer_config.json',
 './fine-tuned-llama-domain-generator-v2/special_tokens_map.json',
 './fine-tuned-llama-domain-generator-v2/chat_template.jinja',
 './fine-tuned-llama-domain-generator-v2/tokenizer.json')

#### Checks Before Eval

In [14]:
# Generate a sample prompt from the test set
sample_prompt = datasets['test'][0]['text'].split("-> Domain:")[0].replace("Business:", "").strip()
sample_true_domain = datasets['test'][0]['text'].split("-> Domain:")[-1].strip()
sample_prompt

'A subscription box service for unique, handmade jewelry.'

In [15]:
model.device

device(type='cuda', index=0)

In [16]:
# Put into tokenizer
inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
print("Max token ID:", inputs["input_ids"].max().item())
print("Model vocab size:", model.config.vocab_size)

Max token ID: 128000
Model vocab size: 128256


In [17]:
# Check to make sure we're not having token id mis matches
print("pad_token_id:", tokenizer.pad_token_id)
print("eos_token_id:", tokenizer.eos_token_id)
print("vocab size:", model.config.vocab_size)

pad_token_id: 128009
eos_token_id: 128009
vocab size: 128256


In [18]:
'''
model_id = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct", 
    trust_remote_code=True,
    use_auth_token=hf_token)

prompt = "Business: dog walking service -> Domain:"
inputs = tokenizer(sample_prompt, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

model.eval()
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=10,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(outputs[0]))
'''

'\nmodel_id = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"\nmodel = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)\ntokenizer = AutoTokenizer.from_pretrained(\n    "meta-llama/Llama-3.2-1B-Instruct", \n    trust_remote_code=True,\n    use_auth_token=hf_token)\n\nprompt = "Business: dog walking service -> Domain:"\ninputs = tokenizer(sample_prompt, return_tensors="pt")\ninputs = {k: v.to(model.device) for k, v in inputs.items()}\n\nmodel.eval()\nwith torch.no_grad():\n    outputs = model.generate(\n        **inputs,\n        max_new_tokens=10,\n        do_sample=False,\n        pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,\n        eos_token_id=tokenizer.eos_token_id,\n    )\nprint(tokenizer.decode(outputs[0]))\n'

## Evaluating model
- We do have some eval metrics above that computes "language model loss" which measures model's ability to predict the next token, not necessarily how relevant/quality the output is.
- Thus, we implement some scoring
    - ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used to evaluate the quality of machine-generated summaries by comparing them to human-written reference summaries.
    - In general, a ROUGE score above 0.5 is often considered good, especially for ROUGE-1, while scores below 0.2 are often considered poor.
    - A good BLEU (bilingual evaluation understudy) score is above 0.3...however, this isn't the best use case for BLEU since its primarily used for tranlsation.
    - A good levenshtein distance is not a fixed thing, but 0 represents identical strings.

In [19]:
from rouge_score import rouge_scorer
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import Levenshtein
import re

In [20]:
# Create function to check if domain name is valid domain name
VALID_TLDS = ['.com', '.ai', '.io', '.net', '.org', '.co', '.app', '.tech', '.dev']

def is_valid_domain(domain: str) -> bool:
    domain = domain.strip().lower()

    # Basic structure check: must contain a valid TLD
    if not any(domain.endswith(tld) for tld in VALID_TLDS):
        return False

    # No whitespace
    if ' ' in domain:
        return False

    # Length check
    if len(domain) > 63 or len(domain) == 0:
        return False

    # Valid characters (alphanumeric, hyphen, and dot)
    pattern = r'^[a-z0-9\-\.]+$'
    if not re.match(pattern, domain):
        return False

    # Should not contain multiple dots (e.g., no sentences)
    if domain.count('.') > 1:
        return False

    return True

In [21]:
# get rouge scorer
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
scores = []
bleu_scores = []
levenshtein_distances = []
brandability_scores = []
predictions = []
valid_domains = []

In [22]:
# Put pytorch model in eval mode
model.eval()

# Loop over test data
for item in test_dataset:
    input_text = item["text"]
    if "-> Domain:" not in input_text:
        continue
    description = input_text.split("-> Domain:")[0].replace("Business:", "").strip()
    true_domain = input_text.split("-> Domain:")[-1].strip()

    prompt = f"Business: {description} -> Domain:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=20, 
            do_sample=True, 
            top_k=50, 
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred_domain = decoded.split("-> Domain:")[-1].strip()

    # Safety filter
    banned_keywords = ["sex", "porn", "kill", "drugs", "hate", "murder"]
    if any(bad in pred_domain.lower() for bad in banned_keywords):
        pred_domain = "[REDACTED: Unsafe Output]"

    # Check if valid domain       
    valid_domain = is_valid_domain(pred_domain)
    valid_domains.append(valid_domain)
    
    # ROUGE
    rouge_score = scorer.score(true_domain, pred_domain)["rougeL"].fmeasure
    scores.append(rouge_score)

    # BLEU
    bleu = sentence_bleu([true_domain.split(".")], pred_domain.split("."), smoothing_function=SmoothingFunction().method1)
    bleu_scores.append(bleu)

    # Levenshtein
    lev_distance = Levenshtein.distance(true_domain, pred_domain)
    levenshtein_distances.append(lev_distance)

    # Brandability heuristic: short, no numbers/symbols, vowel-consonant balance
    def brandability(domain):
        score = 1.0
        if len(domain) > 15: score -= 0.3
        if any(char.isdigit() or not char.isalnum() for char in domain): score -= 0.3
        vowels = sum(1 for c in domain if c in "aeiou")
        consonants = sum(1 for c in domain if c.isalpha() and c not in "aeiou")
        ratio = vowels / (consonants + 1)
        if ratio < 0.2 or ratio > 0.8: score -= 0.2
        return max(0, round(score, 2))

    brandability_score = brandability(pred_domain)
    brandability_scores.append(brandability_score)

    predictions.append({
        "description": description,
        "true": true_domain,
        "pred": pred_domain,
        "rougeL": rouge_score,
        "bleu": bleu,
        "levenshtein": lev_distance,
        "brandability": brandability_score,
        "is_valid_domain": valid_domain,
    })

# Save predictions
results_df = pd.DataFrame(predictions)
results_df.to_csv(f"predictions_eval-{VERSION}.csv", index=False)

# Report average scores
print_string = f"Pct of Predictions that are valid domains = {results_df['is_valid_domain'].mean():.2%}"
print(print_string)

Pct of Predictions that are valid domains = 0.00%
