In [5]:
import os
import json
import glob
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from collections import Counter

BASE_DIR = "dataset/"

In [None]:
def iter_all_json_files():
    patterns = [
        os.path.join(BASE_DIR, "*", "train_*_full.json"),
        os.path.join(BASE_DIR, "*", "validate_*.json"),
    ]
    paths = []
    for p in patterns:
        paths.extend(glob.glob(p))
    return paths

def load_examples(path):
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        content = json.loads(f.read())

    if isinstance(content, list):
        return content
    if isinstance(content, dict) and "utterances" in content:
        return content["utterances"]
    if isinstance(content, dict) and len(content) == 1:
        val = next(iter(content.values()))
        if isinstance(val, list):
            return val

    raise ValueError(f"Unexpected JSON: {path}")

# Count slot frequencies
slot_counter = Counter()

for path in iter_all_json_files():
    examples = load_examples(path)
    for ex in examples:
        for seg in ex["data"]:
            if "entity" in seg:
                slot_counter[seg["entity"]] += 1

# Sort by frequency
sorted_slots = sorted(slot_counter.items(), key=lambda x: x[1], reverse=True)

print(f"Total unique slot types: {len(sorted_slots)}\n")

print("=== Slot Frequency Ranking ===")
for slot, count in sorted_slots:
    print(f"{slot:25s} {count}")

Total unique slot types: 39

=== Slot Frequency Ranking ===
object_type               3341
object_name               3087
playlist                  2201
timeRange                 2096
rating_value              2057
artist                    2020
music_item                1767
city                      1435
restaurant_type           1426
spatial_relation          1292
rating_unit               1182
playlist_owner            1167
best_rating               1101
party_size_number         1082
state                     1063
object_select             1031
country                   900
movie_name                861
service                   801
movie_type                723
year                      660
location_name             619
entity_name               612
sort                      576
condition_temperature     497
object_location_type      492
condition_description     476
restaurant_name           359
party_size_description    330
object_part_of_series_type 324
geographic_poi         

In [None]:
# Iterate SNIPS train/test JSON files

def iter_snips_files(split: str):
    if split == "train":
        pattern = os.path.join(BASE_DIR, "*", "train_*_full.json")
    elif split == "test":
        pattern = os.path.join(BASE_DIR, "*", "validate_*.json")
    else:
        raise ValueError("split must be 'train' or 'test'")

    for path in glob.glob(pattern):
        filename = os.path.basename(path)
        name_part = filename.split("_", 1)[1]
        intent = (
            name_part.replace("_full.json", "").replace(".json", "")
        )
        yield intent, path

In [None]:
# Load a SNIPS JSON file

def load_examples(path):
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        text = f.read()
    content = json.loads(text)

    if isinstance(content, list):
        return content
    if isinstance(content, dict) and "utterances" in content:
        return content["utterances"]
    if isinstance(content, dict) and len(content) == 1:
        only_value = next(iter(content.values()))
        if isinstance(only_value, list):
            return only_value

    raise ValueError(f"Unexpected JSON format: {path}")

In [9]:
# Count slot frequencies
slot_counter = Counter()

for path in iter_all_json_files():
    examples = load_examples(path)
    for ex in examples:
        for seg in ex["data"]:
            if "entity" in seg:
                slot_counter[seg["entity"]] += 1

# Sort by frequency (descending)
sorted_slots = sorted(slot_counter.items(), key=lambda x: x[1], reverse=True)

print(f"Total unique slot types: {len(sorted_slots)}\n")

print("=== Slot Frequency Ranking ===")
for slot, count in sorted_slots:
    print(f"{slot:25s} {count}")

Total unique slot types: 39

=== Slot Frequency Ranking ===
object_type               3341
object_name               3087
playlist                  2201
timeRange                 2096
rating_value              2057
artist                    2020
music_item                1767
city                      1435
restaurant_type           1426
spatial_relation          1292
rating_unit               1182
playlist_owner            1167
best_rating               1101
party_size_number         1082
state                     1063
object_select             1031
country                   900
movie_name                861
service                   801
movie_type                723
year                      660
location_name             619
entity_name               612
sort                      576
condition_temperature     497
object_location_type      492
condition_description     476
restaurant_name           359
party_size_description    330
object_part_of_series_type 324
geographic_poi         

In [10]:
def example_to_token_rows_bio(intent, example, sentence_id):
    segments = example["data"]

    utterance = ""
    entity_spans = []
    cursor = 0

    # Build full utterance and record spans
    for seg in segments:
        text = seg["text"]
        if "entity" in seg:
            slot = seg["entity"]
            start = cursor
            end = cursor + len(text)
            entity_spans.append((start, end, slot))
        utterance += text
        cursor = len(utterance)

    # Whitespace tokenization with char spans
    word_tokens = []
    i = 0
    length = len(utterance)

    while i < length:
        if utterance[i].isspace():
            i += 1
            continue

        start = i
        while i < length and not utterance[i].isspace():
            i += 1
        end = i

        raw_token = utterance[start:end]
        clean_token = raw_token.strip(string.punctuation)

        if clean_token:
            word_tokens.append((clean_token, start, end))

    # Assign BIO tags
    rows = []
    for token_id, (token, start, end) in enumerate(word_tokens):

        # find overlapping entity spans
        matching = []
        for ent_start, ent_end, slot_name in entity_spans:
            if not (end <= ent_start or start >= ent_end):
                matching.append((ent_start, ent_end, slot_name))

        if not matching:
            tag = "O"
        else:
            ent_start, ent_end, slot_name = matching[0]

            span_indices = [
                idx for idx, (_, s, e) in enumerate(word_tokens)
                if not (e <= ent_start or s >= ent_end)
            ]

            if token_id == span_indices[0]:
                tag = f"B-{slot_name}"
            else:
                tag = f"I-{slot_name}"

        rows.append({
            "sentence_id": sentence_id,
            "token_id": token_id,
            "intent": intent,
            "utterance": utterance.strip(),
            "token": token,
            "slot": tag
        })

    return rows

In [None]:
# Build stratified train/val split and convert examples into word-level BIO rows

def build_train_val():
    meta_examples = []  # list of (intent, example, sentence_id)
    sentence_id = 0

    # Load all training examples first
    for intent, path in iter_snips_files("train"):
        examples = load_examples(path)
        for ex in examples:
            meta_examples.append((intent, ex, sentence_id))
            sentence_id += 1

    intents = [x[0] for x in meta_examples]

    # Stratified split
    train_meta, val_meta = train_test_split(
        meta_examples,
        test_size=0.1,
        stratify=intents,
        random_state=42
    )

    # Convert train split to word-level BIO rows
    train_rows = []
    for new_sid, (intent, example, _) in enumerate(train_meta):
        rows = example_to_token_rows_bio(intent, example, new_sid)
        train_rows.extend(rows)

    # Convert val split
    val_rows = []
    offset = len(train_meta)
    for new_sid, (intent, example, _) in enumerate(val_meta):
        rows = example_to_token_rows_bio(intent, example, offset + new_sid)
        val_rows.extend(rows)

    return pd.DataFrame(train_rows), pd.DataFrame(val_rows)

In [None]:
# Build split (test)

def build_test():
    all_rows = []
    sentence_id = 0

    for intent, path in iter_snips_files("test"):
        examples = load_examples(path)
        for ex in examples:
            rows = example_to_token_rows_bio(intent, ex, sentence_id)
            all_rows.extend(rows)
            sentence_id += 1

    return pd.DataFrame(all_rows)

In [None]:
# Generate final CSVs

train_df, val_df = build_train_val()
test_df = build_test()

print("Train NER:", train_df.shape)
print("Val   NER:", val_df.shape)
print("Test  NER:", test_df.shape)

train_df.to_csv("dataset/snips_ner_train.csv", index=False)
val_df.to_csv("dataset/snips_ner_val.csv", index=False)
test_df.to_csv("dataset/snips_ner_test.csv", index=False)

train_df.head()

Train NER: (110572, 6)
Val   NER: (12191, 6)
Test  NER: (6318, 6)


Unnamed: 0,sentence_id,token_id,intent,utterance,token,slot
0,0,0,PlayMusic,"Play the playlist, A Mis Niños de 30.",Play,O
1,0,1,PlayMusic,"Play the playlist, A Mis Niños de 30.",the,O
2,0,2,PlayMusic,"Play the playlist, A Mis Niños de 30.",playlist,O
3,0,3,PlayMusic,"Play the playlist, A Mis Niños de 30.",A,B-playlist
4,0,4,PlayMusic,"Play the playlist, A Mis Niños de 30.",Mis,I-playlist


In [14]:
def count_entities_bio(df, slot_name):
    counts = 0
    for sid, group in df.groupby("sentence_id"):
        prev = "O"
        for tag in group["slot"]:
            if tag.startswith(f"B-{slot_name}"):
                counts += 1
    return counts

print("Album entities in test_df:", count_entities_bio(test_df, "poi"))
print("Album entities in test_df:", count_entities_bio(train_df, "track"))

Album entities in test_df: 6
Album entities in test_df: 195
