In [None]:
import os
import json
import pandas as pd
from glob import glob
import re

In [None]:
# Normalizes whitespace in an utterance

def clean_text(s):
    """Collapse whitespace runs to single spaces and trim both ends.

    Args:
        s: Utterance text to normalize.

    Returns:
        The text with every run of whitespace characters replaced by one
        space and with leading/trailing whitespace removed.
    """
    return re.sub(r"\s+", " ", s).strip()

# Loads a JSON file as UTF-8, silently dropping undecodable bytes (malformed JSON still raises)

def safe_json_load(path):
    """Read the JSON document stored at ``path`` and return the parsed value.

    The file is decoded as UTF-8 with two leniencies:

    * a leading byte-order mark is stripped (``utf-8-sig``), because
      ``json.loads`` refuses text that starts with U+FEFF;
    * bytes that are not valid UTF-8 are dropped (``errors="ignore"``)
      instead of raising ``UnicodeDecodeError``.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded JSON value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the decoded text is not valid JSON.
    """
    # "utf-8-sig" behaves exactly like "utf-8" when no BOM is present.
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        text = f.read()
    return json.loads(text)

In [None]:
# Loads SNIPS training JSON files, extracts intent + cleaned utterance text

def load_train():
    """Collect training rows from the ``train_*_full.json`` dataset files.

    Every file matching ``dataset/*/train_*_full.json`` is parsed. The intent
    name is taken from the file name (``train_<Intent>_full.json``), and each
    example's utterance is rebuilt from the ``"text"`` of its ``"data"``
    chunks and whitespace-normalized with ``clean_text``.

    Returns:
        A list of ``{"text": ..., "intent": ...}`` dicts, ordered by file
        path and then by position inside each file.
    """
    rows = []
    pattern = "dataset/*/train_*_full.json"

    # glob() yields paths in arbitrary, filesystem-dependent order; sorting
    # makes the row order (and therefore the CSV) reproducible across runs.
    for path in sorted(glob(pattern)):
        filename = os.path.basename(path)
        # "train_<Intent>_full.json" -> "<Intent>"
        intent = filename.split("_", 1)[1].replace("_full.json", "")

        content = safe_json_load(path)

        # Should be { "AddToPlaylist": [ { "data": [...] }, ... ] }
        examples = content.get(intent)
        if examples is None:
            # Key does not match the file name: fall back to the first value,
            # or to no examples at all when the JSON object is empty.
            examples = next(iter(content.values()), [])

        for ex in examples:
            # In the SNIPS format an utterance is the plain concatenation of
            # its chunk texts (chunks carry their own surrounding spaces).
            # Joining with " " would insert spurious spaces wherever a slot
            # value is directly followed by punctuation or a suffix.
            raw_text = "".join(chunk["text"] for chunk in ex["data"])
            rows.append({"text": clean_text(raw_text), "intent": intent})

    return rows

In [None]:
# Loads SNIPS test utterances and intents from the validate_*.json files (the *_full.json variants are skipped)

def load_test():
    """Collect test rows from the ``validate_*.json`` dataset files.

    Every file matching ``dataset/*/validate_*.json`` is parsed, except those
    whose name ends in ``_full.json``. The intent name is taken from the file
    name (``validate_<Intent>.json``), and each example's utterance is rebuilt
    from the ``"text"`` of its ``"data"`` chunks and whitespace-normalized
    with ``clean_text``.

    Returns:
        A list of ``{"text": ..., "intent": ...}`` dicts, ordered by file
        path and then by position inside each file.
    """
    rows = []
    pattern = "dataset/*/validate_*.json"

    # glob() yields paths in arbitrary, filesystem-dependent order; sorting
    # makes the row order (and therefore the CSV) reproducible across runs.
    for path in sorted(glob(pattern)):
        filename = os.path.basename(path)
        if filename.endswith("_full.json"):
            continue

        # "validate_<Intent>.json" -> "<Intent>"
        intent = filename.replace("validate_", "").replace(".json", "")

        content = safe_json_load(path)

        examples = content.get(intent)
        if examples is None:
            # Key does not match the file name: fall back to the first value,
            # or to no examples at all when the JSON object is empty.
            examples = next(iter(content.values()), [])

        for ex in examples:
            # In the SNIPS format an utterance is the plain concatenation of
            # its chunk texts (chunks carry their own surrounding spaces).
            # Joining with " " would insert spurious spaces wherever a slot
            # value is directly followed by punctuation or a suffix.
            raw_text = "".join(chunk["text"] for chunk in ex["data"])
            rows.append({"text": clean_text(raw_text), "intent": intent})

    return rows


In [None]:
# Build CSVs
#
# The column list is passed explicitly so that the CSV still gets its
# "text,intent" header when a loader returns no rows (e.g. the dataset
# folder is missing); a header-less empty file cannot be read back with
# pd.read_csv. For non-empty input the resulting frame is unchanged.

train_rows = load_train()
df_train = pd.DataFrame(train_rows, columns=["text", "intent"])
df_train.to_csv("dataset/snips_intent_train.csv", index=False)
print("✓ Train CSV created:", df_train.shape)

test_rows = load_test()
df_test = pd.DataFrame(test_rows, columns=["text", "intent"])
df_test.to_csv("dataset/snips_intent_test.csv", index=False)
print("✓ Test CSV created:", df_test.shape)

# Preview the first rows of both splits as the cell output.
df_train.head(), df_test.head()