# Part 2 - Fine-Tune LLM
- Fine-tune an LLM on our synthetically generated dataset so that it generates domain names from business descriptions.

In [1]:
!pip install -q transformers torch peft

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import time
import pandas as pd

In [6]:
# Read the business-description / domain-name CSV from the Kaggle input directory
# and preview its first rows.
csv_path = '/kaggle/input/domain-name-generator/data/domain_names_with_descriptions.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,business_description,domain_name
0,A mobile app that helps people find local farm...,freshfinds.app
1,An online subscription service for eco-friendl...,greenbundle.com
2,A platform that connects freelance graphic des...,designsbydesigners.com
3,"A subscription service that delivers fresh, or...",farm2table.com
4,A website that helps users find and book pet-f...,pawlovelovers.com


In [4]:
# Imports for building the dataset, tokenizing, LoRA wrapping and training
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
import torch

2025-07-26 13:59:36.577099: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753538376.828451      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753538376.899594      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
# Build one training string per row, shaped as
# "Business: {description} -> Domain: {domain_name}", for the autoregressive model.
def _to_prompt(row):
    """Render one dataframe row as a single description-to-domain training string."""
    return f"Business: {row['business_description']} -> Domain: {row['domain_name']}"

df["text"] = df.apply(_to_prompt, axis=1)

# Only the formatted text column is carried over into the Dataset object.
dataset = Dataset.from_pandas(df[["text"]])

In [18]:
# Preview only the first few formatted examples so the cell output stays small
# no matter how many rows the dataset holds.
dataset[:5]['text']

['Business: A mobile app that helps people find local farmers markets. -> Domain: freshfinds.app',
 'Business: An online subscription service for eco-friendly cleaning supplies. -> Domain: greenbundle.com',
 'Business: A platform that connects freelance graphic designers with small business owners. -> Domain: designsbydesigners.com',
 'Business: A subscription service that delivers fresh, organic produce to your doorstep every week. -> Domain: farm2table.com',
 'Business: A website that helps users find and book pet-friendly accommodations while traveling. -> Domain: pawlovelovers.com']

In [19]:
# Local Kaggle path of the base checkpoint (shared by the tokenizer and the model).
model_name_or_path = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"

# Load the tokenizer WITHOUT trust_remote_code: the model itself is loaded from the same
# checkpoint without that flag, so no custom repository code is needed, and leaving it on
# would permit execution of arbitrary Python shipped alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Reuse the end-of-sequence token as the padding token so fixed-length padding works.
tokenizer.pad_token = tokenizer.eos_token

In [20]:
# Load the base causal language model:
#   - weights are held in bfloat16
#   - device_map="auto" puts the model onto CUDA automatically if available
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

def tokenize(example):
    """Tokenize the "text" field, truncating and padding every sequence to 128 tokens.

    Intended for use with a batched `map`, in which case `example["text"]`
    holds several strings at once.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(example["text"], **encode_options)

# Apply the tokenizer over the whole dataset, one batch of rows at a time.
tokenized_dataset = dataset.map(function=tokenize, batched=True)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [21]:
# Configure LoRA (parameter-efficient fine-tuning) for causal language modeling.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],  # adapters are attached only to modules with these names
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the LoRA adapters.
# NOTE: this rebinds `model`; re-running the cell would pass the already-wrapped
# model back into get_peft_model, so reload the base model before re-running.
model = get_peft_model(model, peft_config)

In [None]:
# Training hyperparameters.
#
# The mixed-precision mode is chosen from the hardware instead of hard-coding fp16:
# the base model is loaded in bfloat16, so bf16 is preferred where the GPU supports it,
# fp16 is the fallback on other GPUs, and neither is enabled on a CPU-only runtime
# (where requesting fp16 is not usable).
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
use_fp16 = cuda_available and not use_bf16

training_args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 examples per optimizer step per device
    num_train_epochs=1,
    learning_rate=2e-4,
    bf16=use_bf16,
    fp16=use_fp16,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=1000,
    save_total_limit=2,  # keep at most two checkpoints in output_dir
    report_to="none"  # do not report to any external experiment tracker
)

# Batch collator for language modeling; mlm=False turns masked-LM off,
# i.e. the causal (next-token) objective is used.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Assemble the Trainer from the LoRA-wrapped model, the training arguments,
# the tokenized dataset and the collator.
# NOTE(review): recent transformers releases appear to prefer `processing_class=`
# over `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning.
trainer.train()

# Persist the trained model and the tokenizer side by side in one directory.
final_model_dir = "./fine-tuned-llama-domain-generator"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)