In [1]:
import pandas as pd
import json
import re
import random

from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer

import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load CSVs

train_df = pd.read_csv("dataset/snips_intent_train.csv")
test_df  = pd.read_csv("dataset/snips_intent_test.csv")

print("Train:", train_df.shape)
print("Test :", test_df.shape)

Train: (13784, 2)
Test : (700, 2)


In [3]:
# Normalize whitespace and ensure all text fields are clean strings

def clean_text(s):
    s = re.sub(r"\s+", " ", str(s))
    return s.strip()

train_df['text'] = train_df['text'].apply(clean_text)
test_df['text']  = test_df['text'].apply(clean_text)

train_df.head()

Unnamed: 0,text,intent
0,Add another song to the Cita Romántica playlist.,AddToPlaylist
1,add clem burke in my playlist Pre-Party R&B Jams,AddToPlaylist
2,Add Live from Aragon Ballroom to Trapeo,AddToPlaylist
3,add Unite and Win to my night out,AddToPlaylist
4,Add track to my Digster Future Hits,AddToPlaylist


In [4]:
# Encode intent labels as integers and save the label ↔ id mappings

le = LabelEncoder()

train_df['label'] = le.fit_transform(train_df['intent'])
test_df['label']  = le.transform(test_df['intent'])

num_labels = len(le.classes_)

# Save label mappings
label2id = {label: int(i) for i, label in enumerate(le.classes_)}
id2label = {int(i): label for i, label in enumerate(le.classes_)}

with open("label2id.json", "w") as f:
    json.dump(label2id, f, indent=2)

with open("id2label.json", "w") as f:
    json.dump(id2label, f, indent=2)

In [5]:
# Load tokenizer

#MODEL_NAME = "distilbert-base-uncased" #<- Use This for Distilbert
MODEL_NAME = "google-bert/bert-base-uncased" #<- Use This for BERT

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

MAX_LEN = 44

In [6]:
# Tokenizes a list of texts into padded, truncated transformer inputs

def tokenize(df):
    return tokenizer(
        df["text"].tolist(),
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN
    )

In [7]:
# PyTorch dataset wrapper for encoded SNIPS intent samples

class SnipsDataset(Dataset):
    def __init__(self, df):
        self.encodings = tokenize(df)
        self.labels = df["label"].tolist()

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [8]:
# Split training set into TRAIN + VAL

duplicates1 = train_df[train_df.duplicated(subset=["text", "intent"], keep=False)]
print(f"Duplicates in Train: {len(duplicates1)}")

duplicates2 = test_df[test_df.duplicated(subset=["text", "intent"], keep=False)]
print(f"Duplicates in Test: {len(duplicates2)}")

# Remove duplicates within.
train_df = train_df.drop_duplicates(subset=["text", "intent"]).reset_index(drop=True)
test_df  = test_df.drop_duplicates(subset=["text", "intent"]).reset_index(drop=True)

# Remove Duplicates Across Train and Test
test_pairs = set(zip(test_df["text"], test_df["intent"]))

train_df = train_df[
    ~train_df.apply(lambda row: (row["text"], row["intent"]) in test_pairs, axis=1)
].reset_index(drop=True)

# Then split training into train + val
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    stratify=train_df["label"],
    random_state=42
)

print("New splits:")
print("Train:", train_df.shape)
print("Val  :", val_df.shape)
print("Test :", test_df.shape)

val_df["intent"].value_counts()

Duplicates in Train: 289
Duplicates in Test: 6
New splits:
Train: (12248, 3)
Val  : (1361, 3)
Test : (697, 3)


intent
GetWeather              199
PlayMusic               198
BookRestaurant          197
SearchCreativeWork      195
AddToPlaylist           194
RateBook                192
SearchScreeningEvent    186
Name: count, dtype: int64

In [9]:
# Helper function to list duplicates between two datasets

def find_overlap(df1, df2, name1="DF1", name2="DF2"):
    pairs1 = set(zip(df1["text"], df1["intent"]))
    pairs2 = set(zip(df2["text"], df2["intent"]))
    overlap = pairs1 & pairs2

    print(f"\n=== Overlap between {name1} and {name2}: {len(overlap)} ===")
    for text, intent in list(overlap):
        print(f"[{intent}] {text}")
    if len(overlap) > 25:
        print("... (truncated)")
    return overlap

# Check all combinations

overlap_train_val  = find_overlap(train_df, val_df,  "TRAIN", "VAL")
overlap_train_test = find_overlap(train_df, test_df, "TRAIN", "TEST")
overlap_val_test   = find_overlap(val_df,  test_df, "VAL",   "TEST")


=== Overlap between TRAIN and VAL: 0 ===

=== Overlap between TRAIN and TEST: 0 ===

=== Overlap between VAL and TEST: 0 ===


In [10]:
# Build datasets

train_dataset = SnipsDataset(train_df)
val_dataset   = SnipsDataset(val_df)
test_dataset  = SnipsDataset(test_df)

len(train_dataset), len(test_dataset), len(val_dataset)

(12248, 697, 1361)

In [11]:
# 9 Few Shot Generation (10, 20, 50, 70, 100 per intent)

SHOT_SIZES = [10, 20, 50, 70, 100]

def sample_few_shot(df, n_per_intent):
    return (
        df.groupby("intent", group_keys=False)
          .apply(lambda x: x.sample(n=min(n_per_intent, len(x)), random_state=42))
          .reset_index(drop=True)
    )

def build_and_save_pt(df, filename):
    dataset = SnipsDataset(df)
    torch.save(dataset, filename)
    print(f"✓ Saved {filename}  |  size={len(df)}")

import os
os.makedirs("fewshot_datasets", exist_ok=True)

for shot in SHOT_SIZES:
    print(f"\n=== Building {shot}-shot dataset ===")

    # sample from train split only
    df_small = sample_few_shot(train_df, shot)

    # Save CSV for inspection
    csv_path = f"fewshot_datasets/snips_train_{shot}.csv"
    df_small.to_csv(csv_path, index=False)

    # Save .pt file after tokenization
    pt_path = f"fewshot_datasets/snips_train_dataset_{shot}.pt"
    build_and_save_pt(df_small, pt_path)

print("\nFew-shot dataset generation complete.")


=== Building 10-shot dataset ===
✓ Saved fewshot_datasets/snips_train_dataset_10.pt  |  size=70

=== Building 20-shot dataset ===
✓ Saved fewshot_datasets/snips_train_dataset_20.pt  |  size=140

=== Building 50-shot dataset ===
✓ Saved fewshot_datasets/snips_train_dataset_50.pt  |  size=350

=== Building 70-shot dataset ===
✓ Saved fewshot_datasets/snips_train_dataset_70.pt  |  size=490

=== Building 100-shot dataset ===
✓ Saved fewshot_datasets/snips_train_dataset_100.pt  |  size=700

Few-shot dataset generation complete.


  .apply(lambda x: x.sample(n=min(n_per_intent, len(x)), random_state=42))
  .apply(lambda x: x.sample(n=min(n_per_intent, len(x)), random_state=42))
  .apply(lambda x: x.sample(n=min(n_per_intent, len(x)), random_state=42))
  .apply(lambda x: x.sample(n=min(n_per_intent, len(x)), random_state=42))
  .apply(lambda x: x.sample(n=min(n_per_intent, len(x)), random_state=42))


In [12]:
# Save tokenized datasets

torch.save(train_dataset, "dataset/snips_train_dataset.pt")
torch.save(val_dataset,   "dataset/snips_val_dataset.pt")
torch.save(test_dataset,  "dataset/snips_test_dataset.pt")

print("Saved: train / val / test .pt files")

Saved: train / val / test .pt files


In [13]:
from torch.serialization import add_safe_globals
from __main__ import SnipsDataset

add_safe_globals([SnipsDataset])

dataset = torch.load("dataset/snips_train_dataset.pt", weights_only=False)

with open("id2label.json", "r") as f:
    id2label = json.load(f)

In [14]:
seed = random.randint(0,len(train_df))
sample = dataset[seed]
sample

{'input_ids': tensor([  101,  2054,  6579,  5691,  2024,  1999,  1996, 10971,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor(6)}

In [15]:
text = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
print(text)
label_id = int(sample['labels'])
label_text = id2label[str(label_id)]
print("Label:", label_text, f"({label_id})")

what animated movies are in the neighbourhood
Label: SearchScreeningEvent (6)
