# Testing the Model
- Load the model by hand and run a few examples
- This notebook was made in place of simply running the API, which won't run on my local machine for reasons explained in the technical report

In [28]:
!pip install -q torch transformers peft

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [29]:
import os
print(os.listdir('/kaggle/input/domain-name-generator/models/fine-tuned-llama-lora-v2-small'))

['adapter_model.safetensors', 'training_args.bin', 'adapter_config.json', 'README.md', 'tokenizer.json', 'tokenizer_config.json', 'chat_template.jinja', 'special_tokens_map.json']


In [30]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import torch
import re

base_model_path = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"
adapter_path = "/kaggle/input/domain-name-generator/models/fine-tuned-llama-lora-v2-small"

tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path, 
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Load LoRA adapter into the base model
model = PeftModel.from_pretrained(base_model, adapter_path)
model.eval() 

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_fea

In [31]:
def extract_domain(text: str) -> str:
    """
    Extracts the first domain-like string from the LLM output.
    Handles formats like:
    - plain: domain.com
    - markdown: [domain.com](http://domain.com)
    - prefixed: 'Website: domain.com'
    - continuation: '-> Tagline: ...'
    """

    # 1. Try to extract markdown-style domain
    markdown_match = re.search(r'\[([a-zA-Z0-9\.-]+\.[a-z]{2,})\]\(http[^\)]*\)', text)
    if markdown_match:
        return markdown_match.group(1)

    # 2. Find domain-like text directly
    domain_match = re.search(r'\b([a-zA-Z0-9-]+\.[a-z]{2,})\b', text)
    if domain_match:
        return domain_match.group(1)

    return "NO_DOMAIN_FOUND"

In [32]:
def guard_output(output: str) -> str:
    """Flag if output includes hallucinated or inappropriate content."""
    flagged_phrases = ["step", "tagline", "instructions", "##", "summary", "http"]
    if any(phrase in output.lower() for phrase in flagged_phrases):
        return "FLAGGED_FOR_REVIEW"
    return "OK"

def generate_domain(business_desc: str, max_new_tokens=20) -> dict:
    prompt = f"Business: {business_desc} -> Domain:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            pad_token_id=tokenizer.eos_token_id
        )

    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred_domain = output_text.split("-> Domain:")[-1].split("<|")[0].split("\n")[0].strip()
    pred_domain = extract_domain(pred_domain)

    # Safety filter
    banned_keywords = ["sex", "kill", "drugs", "hate", "murder", "terrorism", "terror",
                        "adult-content", 'adult content', "explicit", 
                        "porn", "xxx", "nude", "erotic", "pornography"
                        "gambling", "casino", "hate", "violence"]
    if any(bad in pred_domain.lower() for bad in banned_keywords):
        pred_domain = "[REDACTED: Unsafe Output]"

    return {
        "business": business_desc,
        "domain": pred_domain,
    }

In [33]:
business_examples = [
    "A personalized meal delivery service for vegans.",
    "An online store that sells vintage 90s clothing.",
    "A mobile app that connects dog walkers with pet owners.",
    "A subscription box for handmade artisanal chocolates.",
    "A cybersecurity platform for small businesses.",
    "A website about terrorism and violence",
]

for desc in business_examples:
    result = generate_domain(desc)
    print(f"\Business: {result['business']}")
    #print(f"Generated: {result['generated']}")
    print(f"Domain: {result['domain']}")
    #print(f"Safety Check: {result['safety']}")

\Business: A personalized meal delivery service for vegans.
Domain: veganfood.com
\Business: An online store that sells vintage 90s clothing.
Domain: Vintage90s.com
\Business: A mobile app that connects dog walkers with pet owners.
Domain: dogwalker.com
\Business: A subscription box for handmade artisanal chocolates.
Domain: NO_DOMAIN_FOUND
\Business: A cybersecurity platform for small businesses.
Domain: example.com
\Business: A website about terrorism and violence
Domain: [REDACTED: Unsafe Output]
