### Fine-tuning

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [None]:
def prepare_data(csv_path):
    df = pd.read_csv(csv_path)
    
    formatted_data = []
    for _, row in df.iterrows():
        instruction = "Generate domain name suggestions for the following business."
        input_text = f"Business description: {row['business_description']}"
        output_text = f"Suggested domains: {row['suggested_domains']}"
        
        formatted_data.append({
            "text": f"<s>[INST] {instruction}\n\n{input_text} [/INST] {output_text}</s>"
        })
    
    dataset = Dataset.from_pandas(pd.DataFrame(formatted_data))
    return dataset

In [None]:
def tokenize_function(examples, tokenizer):
    max_length = 128
    return tokenizer(
        examples["text"], 
        truncation=True, 
        padding="max_length", 
        max_length=max_length, 
        return_tensors="pt"
    )

In [None]:
def setup_and_train(dataset, output_dir="./finetuned_mistral_domain_generator"):
    train_test_split = dataset.train_test_split(test_size=0.2)
    train_dataset = train_test_split["train"]
    eval_dataset = train_test_split["test"]
    
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
    tokenizer.pad_token = tokenizer.eos_token
    
    train_tokenized = train_dataset.map(
        lambda x: tokenize_function(x, tokenizer), 
        batched=True, 
        batch_size=8
    )
    eval_tokenized = eval_dataset.map(
        lambda x: tokenize_function(x, tokenizer), 
        batched=True, 
        batch_size=8
    )
    
    compute_dtype = getattr(torch, "float16")
    
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True, 
        llm_int8_enable_fp32_cpu_offload=True 
    )

    max_memory = {0: "8GiB", "cpu": "16GiB"}
    
    model = AutoModelForCausalLM.from_pretrained(
        "mistralai/Mistral-7B-Instruct-v0.3",
        quantization_config=bnb_config,
        device_map="auto",
        max_memory=max_memory,
        low_cpu_mem_usage=True,
        torch_dtype=compute_dtype,
    )
    
    model.config.use_cache = False 
    
    model = prepare_model_for_kbit_training(model)
    
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    
    model = get_peft_model(model, lora_config)
    
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable parameters: {trainable_params}")
    
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        eval_strategy="steps",
        eval_steps=200,
        logging_steps=50,
        gradient_checkpointing=True,
        num_train_epochs=2,
        weight_decay=0.01,
        warmup_steps=50,
        lr_scheduler_type="cosine",
        learning_rate=1e-4,
        save_steps=400,
        fp16=True,
        push_to_hub=False,
        save_total_limit=1,
        ddp_find_unused_parameters=False,
        optim="adamw_torch",
        report_to="none",
    )
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=eval_tokenized,
        data_collator=data_collator,
    )
    
    trainer.train()
    
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    
    return model, tokenizer

In [4]:
def generate_domain_suggestions(model, tokenizer, business_description, max_length=100):
    instruction = "Generate domain name suggestions for the following business."
    input_text = f"Business description: {business_description}"
    
    prompt = f"<s>[INST] {instruction}\n\n{input_text} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt")
    
    if torch.cuda.is_available():
        inputs = {k: v.to("cuda") for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True
        )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    if "[/INST]" in generated_text:
        response = generated_text.split("[/INST]")[1].strip()
    else:
        response = generated_text
    
    return response

In [None]:
def train_in_chunks(csv_path, chunk_size=50, output_dir="./finetuned_mistral_domain_generator"):
    df = pd.read_csv(csv_path)
    
    num_chunks = max(1, len(df) // chunk_size)
    print(f"Training in {num_chunks} chunks with {chunk_size} examples per chunk")
    
    model = None
    tokenizer = None
    
    for i in range(num_chunks):
        print(f"\n--- Processing chunk {i+1}/{num_chunks} ---")
        
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(df))
        chunk_df = df.iloc[start_idx:end_idx].copy()
        
        temp_csv = f"temp_chunk_{i}.csv"
        chunk_df.to_csv(temp_csv, index=False)
        
        dataset = prepare_data(temp_csv)
        
        model, tokenizer = setup_and_train(dataset, output_dir=f"{output_dir}_chunk_{i}")
        
        if os.path.exists(temp_csv):
            os.remove(temp_csv)
            
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    if model is not None and tokenizer is not None:
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
    
    return model, tokenizer

In [None]:
if __name__ == "__main__":
    csv_path = "data/business_descriptions.csv"
    
    use_chunked_training = True
    
    if use_chunked_training:
        print("Using chunked training to minimize memory usage...")
        model, tokenizer = train_in_chunks(csv_path, chunk_size=500)
    else:
        dataset = prepare_data(csv_path)
        print(f"Dataset created with {len(dataset)} examples")
        
        print("Starting fine-tuning...")
        model, tokenizer = setup_and_train(dataset)
    
    print("Fine-tuning completed!")
    
    test_description = "A business that specializes in handmade pottery."
    suggestions = generate_domain_suggestions(model, tokenizer, test_description)
    print(f"Generated domain suggestions for '{test_description}':")
    print(suggestions)

Using chunked training to minimize memory usage...
Training in 2 chunks with 500 examples per chunk

--- Processing chunk 1/2 ---


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainable parameters: 3407872


Step,Training Loss,Validation Loss
200,0.932,0.950343



--- Processing chunk 2/2 ---


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainable parameters: 3407872


Step,Training Loss,Validation Loss
200,0.9388,0.936948


Fine-tuning completed!
Generated domain suggestions for 'A business that specializes in handmade pottery.':
Generate domain name suggestions for the following business.

Business description: A business that specializes in handmade pottery.  Suggested domains: handmade.co, handmadepottery.com, handmadepottery.biz, handmadepottery.org, handmadepottery.shop, handmade.shop, handmadepotteryworld.co, handmade.net, handmadepottery.online, handmade.io, handmade.net, handmadepotterycentral.com, handmadepottery.io, handmade


In [8]:
test_description = "A business that specializes in baking pies."
suggestions = generate_domain_suggestions(model, tokenizer, test_description, 100)
print(f"Generated domain suggestions for : '{test_description}'")
print(suggestions)

Generated domain suggestions for : 'A business that specializes in baking pies.'
Generate domain name suggestions for the following business.

Business description: A business that specializes in baking pies.  Suggested domains: pie.online, piesbaking.com, pieexpert.online, pie.biz, pie.io, pies.biz, piehub.site, piecentral.online, pie.co, pies.shop, pie.com, piehub.tech, pieexpert.io, piebaking.tech, piebaking.online, piehub.net, pie.co, pie.tech, pies.
