In [7]:
import json
import random
from pathlib import Path

# ---------- Config ----------
# How many synthetic utterances to generate for each split.
NUM_TRAIN = 1000
NUM_DEV = 200

# Output JSONL files, one per split.
TRAIN_PATH = Path("data", "train_synth.jsonl")
DEV_PATH = Path("data", "dev_synth.jsonl")

# ---------- Static data ----------

# Lower-case "first last" person names.
names = [
    "dhruv joshi",
    "pooja chatterjee",
    "aarav singh",
    "kavya mehra",
    "rohan sharma",
    "priya gupta",
    "sneha patil",
    "aditya verma",
    "shreya iyer",
    "manish agarwal",
    "rahul yadav",
    "ananya desai",
]

# City names.
cities = [
    "mumbai",
    "delhi",
    "bangalore",
    "hyderabad",
    "pune",
    "kolkata",
    "chennai",
    "ahmedabad",
    "jaipur",
    "lucknow",
]

# Finer-grained localities (kept separate from `cities`).
locations = [
    "powai",
    "andheri east",
    "whitefield",
    "banjara hills",
    "hitech city",
    "salt lake",
    "velachery",
    "indiranagar",
]

# Email domains written the way they are spoken: "<provider> dot com".
domains = [
    f"{provider} dot com"
    for provider in ("gmail", "outlook", "hotmail", "rediffmail", "yahoo")
]

# Digit character -> spoken word, keyed "0" through "9" in order.
digits_map = dict(
    zip(
        "0123456789",
        (
            "zero", "one", "two", "three", "four",
            "five", "six", "seven", "eight", "nine",
        ),
    )
)

# Ordinal day-of-month words; index 0 is "first", index 30 is "thirty first".
day_words = [
    "first", "second", "third", "fourth", "fifth",
    "sixth", "seventh", "eighth", "ninth", "tenth",
    "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
    "sixteenth", "seventeenth", "eighteenth", "nineteenth", "twentieth",
]
# 21st-29th reuse the first nine ordinals with a "twenty" prefix.
day_words += [f"twenty {ordinal}" for ordinal in day_words[:9]]
day_words += ["thirtieth", "thirty first"]

# Month names; index 0 is "january".
months = [
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
]


# ---------- Helper functions ----------

def spell_number(n: int) -> str:
    """Return the digits of ``n`` as space-separated English words.

    Every character of ``str(n)`` is looked up in ``digits_map``, so
    ``98215`` becomes ``'nine eight two one five'``. A character that has
    no entry in ``digits_map`` (for example the minus sign of a negative
    number) raises ``KeyError``.
    """
    words = [digits_map[ch] for ch in str(n)]
    return " ".join(words)


def make_phone_spelled() -> str:
    """Return a random 10-digit phone number in one of three STT-like styles.

    The number is drawn with ``random.randint(7000000000, 9999999999)`` and
    one style is picked with ``random.choice``:

    * ``spelled`` -- every digit as a word ("nine eight two one five ...").
    * ``numeric`` -- raw digits with a single space inserted after the
      4th, 5th or 6th digit ("98215 70866").
    * ``mixed``   -- first five digits raw, last five digits spelled out
      ("98215 seven zero eight six six").

    Every style always represents all ten digits.
    """
    num = random.randint(7000000000, 9999999999)
    digits = str(num)
    mode = random.choice(["spelled", "numeric", "mixed"])

    if mode == "spelled":
        return spell_number(num)

    if mode == "numeric":
        # Randomly place the space near the middle of the number.
        split_idx = random.randint(4, 6)
        return f"{digits[:split_idx]} {digits[split_idx:]}"

    # mixed
    head, tail = digits[:5], digits[5:]
    # Spell the tail character by character. Converting it with int() first
    # would strip leading zeros ("07086" -> 7086) and lose digits.
    tail_words = " ".join(digits_map[d] for d in tail)
    return f"{head} {tail_words}"


def make_credit_card_spelled() -> str:
    """Return a random 16-digit card number with every digit spelled out.

    Sixteen digits are drawn independently with ``random.randint(0, 9)`` and
    each one is mapped through ``digits_map``, so the result always contains
    exactly sixteen space-separated words, including when the number starts
    with one or more zeros.
    """
    digits = "".join(str(random.randint(0, 9)) for _ in range(16))
    # Map the digit string directly. Round-tripping through int() would drop
    # leading zeros and yield fewer than 16 spelled digits.
    return " ".join(digits_map[d] for d in digits)


def make_email(name: str) -> str:
    """Build a spoken-form email address from a person name.

    Each single space in ``name`` is replaced by `` dot `` to form the
    username, and a domain is picked at random from ``domains``; e.g.
    'pooja chatterjee' -> 'pooja dot chatterjee at gmail dot com'.
    """
    spoken_user = " dot ".join(name.split(" "))
    spoken_domain = random.choice(domains)
    return spoken_user + " at " + spoken_domain


def make_date() -> str:
    """Return a random date, either numeric or fully spelled out.

    The day is drawn from 1..28, the month from 1..12 and the year from
    2023..2028. With equal probability the result is:

    * numeric: ``'14/07/2026'`` (zero-padded day and month), or
    * spelled: ``'fourteenth of july twenty twenty six'``.
    """
    mode = random.choice(["numeric", "spelled"])
    day = random.randint(1, 28)
    month_idx = random.randint(1, 12)
    year = random.randint(2023, 2028)

    if mode == "numeric":
        return f"{day:02d}/{month_idx:02d}/{year}"

    day_str = day_words[day - 1]
    month_str = months[month_idx - 1]
    # Spoken years read as two pairs: "twenty" + the last two digits as a
    # cardinal number (26 -> "twenty six"). Spelling the suffix digit by
    # digit would produce "twenty two six", which is not how 2026 is spoken.
    year_spelled = f"twenty {_spell_two_digits(year % 100)}"
    return f"{day_str} of {month_str} {year_spelled}"


def _spell_two_digits(value: int) -> str:
    """Spell an integer in 0..99 as a cardinal number, e.g. 26 -> 'twenty six'."""
    small = [
        "zero", "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    tens = [
        "", "", "twenty", "thirty", "forty", "fifty",
        "sixty", "seventy", "eighty", "ninety",
    ]
    if value < 20:
        return small[value]
    ten, unit = divmod(value, 10)
    if unit == 0:
        return tens[ten]
    return f"{tens[ten]} {small[unit]}"


# ---------- Template-based utterance builder ----------

def build_utterance_with_entities():
    """
    Produce one synthetic utterance and its labelled entity spans.

    A template is chosen at random and expressed as an ordered list of
    ``(segment, label)`` pairs, where ``label`` is ``None`` for filler text.
    Segments are joined with single spaces and no punctuation; for every
    labelled segment an entity dict ``{"start", "end", "label"}`` is
    recorded such that ``text[start:end]`` is exactly that segment.

    Returns a ``(text, entities)`` tuple.
    """
    # Every field is drawn up front, in this fixed order, whether or not the
    # chosen template ends up using it.
    name = random.choice(names)
    city = random.choice(cities)
    loc = random.choice(locations)
    phone = make_phone_spelled()
    email = make_email(name)
    cc = make_credit_card_spelled()
    date = make_date()

    template_type = random.choice([1, 2, 3, 4, 5])

    if template_type == 1:
        # name + phone
        segments = [
            ("this is", None),
            (name, "PERSON_NAME"),
            ("my phone number is", None),
            (phone, "PHONE"),
            ("please call me tomorrow", None),
        ]
    elif template_type == 2:
        # name + city + phone + email + date
        segments = [
            ("this is", None),
            (name, "PERSON_NAME"),
            ("from", None),
            (city, "CITY"),
            ("my phone is", None),
            (phone, "PHONE"),
            ("and email is", None),
            (email, "EMAIL"),
            ("we can meet on", None),
            (date, "DATE"),
        ]
    elif template_type == 3:
        # name + location + credit card
        segments = [
            ("hey this is", None),
            (name, "PERSON_NAME"),
            ("i am calling from", None),
            (loc, "LOCATION"),
            ("my credit card number is", None),
            (cc, "CREDIT_CARD"),
            ("please do not share this with anyone", None),
        ]
    elif template_type == 4:
        # phone + date only
        segments = [
            ("my phone number is", None),
            (phone, "PHONE"),
            ("you can call me on", None),
            (date, "DATE"),
        ]
    else:
        # name + email, optionally followed by the city
        segments = [
            ("hello this is", None),
            (name, "PERSON_NAME"),
            ("my email id is", None),
            (email, "EMAIL"),
        ]
        if random.random() < 0.6:
            segments.extend([
                ("i am currently in", None),
                (city, "CITY"),
            ])

    # Occasionally close with a generic trailing phrase.
    if random.random() < 0.3:
        segments.append(("let me know if you got these details", None))

    # Assemble the text left to right, recording offsets as we go.
    text = ""
    entities = []
    for seg, label in segments:
        # A separating space is only needed once some text already exists.
        prefix = f"{text} " if text else text
        start = len(prefix)
        text = prefix + seg
        if label is not None:
            entities.append({"start": start, "end": len(text), "label": label})

    return text, entities


def make_example(example_id: int):
    """
    Create a single dataset record for the given numeric id.

    The record is a dict with keys ``id`` (formatted as 'utt_XXXXX', the id
    zero-padded to five digits), ``text`` and ``entities``; the latter two
    come from ``build_utterance_with_entities()``.
    """
    utterance, spans = build_utterance_with_entities()
    return dict(
        id=f"utt_{example_id:05d}",
        text=utterance,
        entities=spans,
    )


# ---------- Main generation ----------

def generate_file(path: Path, start_id: int, num_examples: int):
    """
    Write ``num_examples`` synthetic examples to ``path`` as JSON Lines.

    Examples receive consecutive ids ``start_id``, ``start_id + 1``, ...;
    each is serialised with ``json.dumps(..., ensure_ascii=False)`` on its
    own line. The file is opened in write mode with UTF-8 encoding, so any
    existing content is replaced. Missing parent directories of ``path``
    are created first.
    """
    # Opening a file for writing does not create its directory; without this
    # a fresh checkout lacking e.g. "data/" fails with FileNotFoundError.
    path.parent.mkdir(parents=True, exist_ok=True)

    with path.open("w", encoding="utf-8") as f:
        for ex_id in range(start_id, start_id + num_examples):
            example = make_example(ex_id)
            f.write(json.dumps(example, ensure_ascii=False) + "\n")


def main():
    """Generate the train and dev JSONL files, then report what was written.

    Train ids start at 1 and dev ids continue right after the last train
    id (``NUM_TRAIN + 1``), so ids do not overlap between the two files.
    """
    splits = (
        (TRAIN_PATH, 1, NUM_TRAIN),            # utt_00001 ... utt_01000
        (DEV_PATH, NUM_TRAIN + 1, NUM_DEV),    # utt_01001 ... utt_01200
    )
    for out_path, first_id, count in splits:
        generate_file(out_path, start_id=first_id, num_examples=count)

    print(f"Written {NUM_TRAIN} train examples to {TRAIN_PATH}")
    print(f"Written {NUM_DEV} dev examples to {DEV_PATH}")


# Run the generator only when this module is executed directly, not on import.
if __name__ == "__main__":
    main()


Written 1000 train examples to data\train_synth.jsonl
Written 200 dev examples to data\dev_synth.jsonl


In [11]:
!type train_synth.jsonl >> data/train.jsonl

In [10]:
!type dev_synth.jsonl >> data/dev.jsonl