In [None]:
# using python 3.9
#!pip install kagglehub==0.3.13 pandas==2.0.3 openai==2.2.0 tqdm==4.66.4

In [None]:
import kagglehub
import os
import pandas as pd
from tqdm import tqdm
from tqdm.asyncio import tqdm as async_tqdm
from openai import OpenAI, AsyncOpenAI
import random
import asyncio
import json

## Intro

In this notebook, we will create our own finetuning dataset of (`business_description`, `domain_name`).

- We will start of with an open-source dataset which lists 7M global business. We will sample this datast to get initial list of `domain_name`.

- Then, we will use an LLM to filter the initial sampled dataset for quality examples only

- As a final step, we will use AI web search tools to fetch business descriptions from our domain names

This whole process is fully automated

### Initial open source company dataset

In [None]:
path = kagglehub.dataset_download("peopledatalabssf/free-7-million-company-dataset")

In [6]:
df = pd.read_csv(os.path.join(path, 'companies_sorted.csv'))

In [7]:
df.shape

(7173426, 11)

In [8]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,name,domain,year founded,industry,size range,locality,country,linkedin url,current employee estimate,total employee estimate
0,5872184,ibm,ibm.com,1911.0,information technology and services,10001+,"new york, new york, united states",united states,linkedin.com/company/ibm,274047,716906
1,4425416,tata consultancy services,tcs.com,1968.0,information technology and services,10001+,"bombay, maharashtra, india",india,linkedin.com/company/tata-consultancy-services,190771,341369
2,21074,accenture,accenture.com,1989.0,information technology and services,10001+,"dublin, dublin, ireland",ireland,linkedin.com/company/accenture,190689,455768


### Filtering and sampling initial data

Our filtering is simple - take companies which have an existing domain name in english, and are sufficiently big (atleast 10 emplyees) to increase the likelihood of AI web search finding correct business descriptions about these companies

In [9]:
def filter_dataframe(df):
    allowed_country = set(['united sates', 'united kingdom', 'canada', 'australia'])
    disallowed_size_range = set(['1 - 10'])
    df = df.dropna(axis=0, subset=['domain', 'industry'])
    df = df[df['country'].isin(allowed_country) & ~df['size range'].isin(disallowed_size_range)]
    return df

In [10]:
df = filter_dataframe(df)

In [11]:
df.shape

(140166, 11)

Next, group by industry and sample some companies from each for a balanced initial dataset

In [12]:
sample = df.groupby('industry').sample(50, random_state=42, replace=True)

In [13]:
sample.shape

(7350, 11)

In [14]:
# a few examples
for i in range(0, 1000, 100):
    print(f'Domain: {sample.iloc[i]["domain"]}, Industry: {sample.iloc[i]["industry"]}')

Domain: krestongta.com, Industry: accounting
Domain: odsc.vic.gov.au, Industry: alternative dispute resolution
Domain: noghost.co.uk, Industry: animation
Domain: woodshardwick.com, Industry: architecture & planning
Domain: continental-automotive.com, Industry: automotive
Domain: tandia.com, Industry: banking
Domain: jumpstudios.tv, Industry: broadcast media
Domain: sharpsav.com, Industry: business supplies and equipment
Domain: pelgar.co.uk, Industry: chemicals
Domain: ejtaylor.co.uk, Industry: civil engineering


### Filtering out bad quality examples

For this experiment, I set out a couple of criteria for a finetuning dataset - I want domain names to reflect the industry of the business. For example: a company about farming should include an industry specific keyword in its domain: harvesting, farming, crops, etc. Another criteria - only whole, clean keywords should be used in the domain name, no abbreviations, obscure terms, names, locations etc. It should be clear. Domain names like: name-surname-clinic.com, abc-cuts.com should not be allowed. 

To achieve this I will use an llm. I used gpt-5-mini here, but any llm could be used. I will supply a prompt with domain name and industry. The llm is tasked with extracting relevant industry keyword from the domain, as well as classifying whether the domain name uses appropriate vocabulary. If both checks pass, I will use that example in the next dataset creation stage.

In [15]:
CLASSIFY_DOMAIN_RELEVANCE_PROMPT = """You work in a web domain creation service. You will be given a domain name and an industry the business is based in. Your task is to determine whether the following domain name is acceptable based on the following criteria:

- If a domain describes the industry, it will contain an industry related term. Extract that term if it exists. 
- Domain should use generic vocabulary or keywords only. Exclude domain names that contain abbreviations, names, locations, etc.

Inputs:
Domain: {domain}
Industry: {industry}

Output format:
{{
    "industry_term": "extracted term or blank",
    "uses_generic_vocabulary": "yes/no"
}}
"""

In [None]:
# number of starting examples we will use, change to whatever value is needed
NUM_EXAMPLES = 2000
random.seed(42)
random_idx =  random.sample(range(len(sample)), NUM_EXAMPLES)

In [17]:
client = AsyncOpenAI(api_key=os.environ['OPENAI_API_KEY'])

In [18]:
async def generate(prompt, **prompt_kwargs):
    response = await client.responses.create(
        model="gpt-5-mini", # can replace with another openai-compatible model
        input=prompt.format_map(prompt_kwargs)
    )
    return response.output_text

In [20]:
# use asyncio to process quicker
NUM_CONCURRENT = 10
semaphore = asyncio.Semaphore(NUM_CONCURRENT)

async def generate_concurrent(semaphore, prompt, **prompt_kwargs):
    async with semaphore:
        return await generate(prompt, **prompt_kwargs)

tasks = []

for idx in random_idx:
    row = sample.iloc[idx]
    domain = row['domain']
    industry = row['industry']
    task = generate_concurrent(semaphore, CLASSIFY_DOMAIN_RELEVANCE_PROMPT, domain=domain, industry=industry)
    tasks.append(task)

responses = await async_tqdm.gather(*tasks)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [18:50<00:00,  1.77it/s]


In [38]:
def filter_responses(responses: list[str]):
    filtered_idx = []

    for i, response_str in enumerate(responses):
        try:
            response = json.loads(response_str)
            if response['industry_term'] and response['uses_generic_vocabulary'].lower().strip() == 'yes':
                filtered_idx.append(i)
        except:
            pass

    return filtered_idx

filtered_idx = [random_idx[idx] for idx in filter_responses(responses)]

sample_filtered = sample.iloc[filtered_idx]

In [40]:
sample_filtered.shape

(383, 11)

In [41]:
# some examples
for i in range(0, 200, 20):
    print(f'Domain: {sample_filtered.iloc[i]["domain"]}, Industry: {sample_filtered.iloc[i]["industry"]}')

Domain: coffeetime.ca, Industry: food & beverages
Domain: mediawise.eu, Industry: public relations and communications
Domain: bottle-shop.co.uk, Industry: retail
Domain: circulogene.com, Industry: biotechnology
Domain: dairypartners.co.uk, Industry: dairy
Domain: idealmagazine.co.uk, Industry: writing and editing
Domain: new-covenant.ca, Industry: religious institutions
Domain: idolight.it, Industry: design
Domain: legion.ca, Industry: military
Domain: premierenergy.co.uk, Industry: utilities


### Getting business descriptions

In [43]:
BUSINESS_DESCRIPTION_RETRIEVAL_PROMPT = """You will be given a real existing domain name of some brand. Your task is to find what that brand is about and generate a short business description. 

Output instruction:
- Do not mention the brand name or domain itself. It should be generated as if someone could come up with the brand domain after reading the description. 
- Output plain text. DO NOT INSERT references or any urls.
- Keep it concise, up to 30 words.

If you are unable to find anything, output "not found"

Brand domain: {brand_domain}
"""

In [51]:
async def generate_web_search(prompt, **prompt_kwargs):
    response = await client.responses.create(
        model="gpt-5-mini",
        tools=[{"type": "web_search"}],
        input=prompt.format_map(prompt_kwargs),
        reasoning={"effort": "low"},
    )
    return response.output_text

In [52]:
# as before use asyncio to process faster
NUM_CONCURRENT = 10
semaphore = asyncio.Semaphore(NUM_CONCURRENT)

async def generate_concurrent_web(semaphore, prompt, **prompt_kwargs):
    async with semaphore:
        return await generate_web_search(prompt, **prompt_kwargs)

tasks = [generate_concurrent_web(semaphore, BUSINESS_DESCRIPTION_RETRIEVAL_PROMPT, brand_domain=domain_name) 
         for domain_name in sample_filtered['domain'].tolist()]

responses = await async_tqdm.gather(*tasks)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 383/383 [04:58<00:00,  1.29it/s]


In [58]:
invalid_resp_idx = [i for i, resp in enumerate(responses) if 'not found' in resp.lower()]

In [59]:
len(invalid_resp_idx)

15

In [60]:
responses[invalid_resp_idx[0]]

'not found'

#### Combining results into final dataset

In [61]:
dataset_items = []

domains = sample_filtered['domain'].tolist()

assert len(domains) == len(responses)

for i, (domain, business_description) in enumerate(zip(domains, responses)):
    if i in invalid_resp_idx:
        continue
    dataset_items.append({'domain': domain, 'business_description': business_description})

In [62]:
len(dataset_items)

368

In [64]:
dataset_items[:5]

[{'domain': 'coffeetime.ca',
  'business_description': 'Canadian coffee-and-snack chain serving brewed coffee, specialty drinks, fresh baked goods, all-day breakfast, sandwiches and soups, with franchising opportunities across Canada and select international locations.'},
 {'domain': 'jeuxspin.com',
  'business_description': 'Rental and event company providing giant games, team-building activities, virtual experiences and branded game activations for corporate, school and public events across Quebec/Canada.'},
 {'domain': 'christiansurfers.net',
  'business_description': 'International Christian surf ministry: faith-centered chapters connecting surfers through outreach, camps, events, discipleship, chaplaincy and community service to share Jesus within global surf culture.'},
 {'domain': 'axisanimation.com',
  'business_description': 'Award-winning Glasgow-based animation and VFX studio creating cinematic CG, trailers, and character-driven content for video games, film, television, and

Converting to chat format

In [74]:
dataset = []

for item in dataset_items:
    dataset.append({'messages': [{'role': 'user', 'content': item['business_description']}, {'role': 'assistant', 'content': item['domain']}]})

In [75]:
with open('dataset.jsonl', 'w') as f:
    for example in dataset:
        f.write(json.dumps(example, ensure_ascii=False)+'\n')