# Testing the Model
- Load the model by hand and run a few examples
- This notebook was made in place of simply running the API, which won't run on my local machine for reasons explained in the technical report

In [1]:
VERSION = 'v2'
FLAG = 'large'

In [2]:
!pip install -q torch transformers peft

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import torch
import re
import os

base_model_path = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"
adapter_path = f"/kaggle/input/domain-name-generator/models/fine-tuned-llama-lora-{VERSION}-{FLAG}"

tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path, 
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Load LoRA adapter into the base model
model = PeftModel.from_pretrained(base_model, adapter_path)
model.eval() 

2025-07-30 18:28:28.435842: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753900108.764153      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753900108.828022      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_fea

In [4]:
def extract_domain(text: str) -> str:
    """
    Extracts the first domain-like string from the LLM output.
    Handles formats like:
    - plain: domain.com
    - markdown: [domain.com](http://domain.com)
    - prefixed: 'Website: domain.com'
    - continuation: '-> Tagline: ...'
    """

    # 1. Try to extract markdown-style domain
    markdown_match = re.search(r'\[([a-zA-Z0-9\.-]+\.[a-z]{2,})\]\(http[^\)]*\)', text)
    if markdown_match:
        return markdown_match.group(1)

    # 2. Find domain-like text directly
    domain_match = re.search(r'\b([a-zA-Z0-9-]+\.[a-z]{2,})\b', text)
    if domain_match:
        return domain_match.group(1)

    return "NO_DOMAIN_FOUND"

In [5]:
def guard_output(output: str) -> str:
    """Flag if output includes hallucinated or inappropriate content."""
    flagged_phrases = ["step", "tagline", "instructions", "##", "summary", "http"]
    if any(phrase in output.lower() for phrase in flagged_phrases):
        return "FLAGGED_FOR_REVIEW"
    return "OK"

def generate_domain(business_desc: str, max_new_tokens=20) -> dict:
    prompt = f"Business: {business_desc} -> Domain:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            pad_token_id=tokenizer.eos_token_id
        )

    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred_domain = output_text.split("-> Domain:")[-1].split("<|")[0].split("\n")[0].strip()
    pred_domain = extract_domain(pred_domain)

    # Safety filter
    banned_keywords = ["sex", "kill", "drugs", "hate", "murder", "terrorism", "terror",
                        "adult-content", 'adult content', "explicit", 
                        "porn", "xxx", "nude", "erotic", "pornography"
                        "gambling", "casino", "hate", "violence"]
    if any(bad in pred_domain.lower() for bad in banned_keywords):
        pred_domain = "[REDACTED: Unsafe Output]"

    return {
        "business": business_desc,
        "domain": pred_domain,
    }

In [6]:
business_examples = [
    "A personalized meal delivery service for vegans.",
    "An online store that sells vintage 90s clothing.",
    "A mobile app that connects dog walkers with pet owners.",
    "An inappropriate website that people shouldn't visit.",
    "A cybersecurity platform for small businesses.",
    "A website about terrorism and violence",
]

for desc in business_examples:
    result = generate_domain(desc)
    print(f"\Business: {result['business']}")
    #print(f"Generated: {result['generated']}")
    print(f"Domain: {result['domain']}")
    #print(f"Safety Check: {result['safety']}")

\Business: A personalized meal delivery service for vegans.
Domain: veganfoodie.com
\Business: An online store that sells vintage 90s clothing.
Domain: vintage90s.com
\Business: A mobile app that connects dog walkers with pet owners.
Domain: dogwalkers.com
\Business: An inappropriate website that people shouldn't visit.
Domain: www.mysite
\Business: A cybersecurity platform for small businesses.
Domain: cybersecurityhub.com
\Business: A website about terrorism and violence
Domain: [REDACTED: Unsafe Output]
