In [26]:
import os

import pandas as pd

# Location of the training CSV. The default is the original local file; set
# the SHOPPER_TRAINING_CSV environment variable to point the notebook at a
# different copy without editing this cell.
TRAINING_CSV_PATH = os.environ.get(
    "SHOPPER_TRAINING_CSV",
    "/Users/ramana/Documents/shopper_analysis/notebooks/2TrainingInput/training_2025-04-26_23-16-06.csv",
)
df_split = pd.read_csv(TRAINING_CSV_PATH)

# Fail fast with a clear message if the columns the later cells rely on
# (text, split assignment, label) are absent, instead of hitting a KeyError
# halfway through tokenization.
_required_columns = {"cleaned_product_name", "split_type", "relevant_code_binary"}
_missing_columns = _required_columns - set(df_split.columns)
if _missing_columns:
    raise KeyError(
        f"{TRAINING_CSV_PATH} is missing required columns: {sorted(_missing_columns)}"
    )

In [27]:
from transformers import BertTokenizer

# Instantiate the tokenizer for the uncased (lower-cased) BERT base checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode the first product name as a sanity check of the tokenizer output.
example = df_split["cleaned_product_name"].iat[0]
encoded = tokenizer(
    example,
    max_length=64,           # fixed sequence length used for product names
    truncation=True,         # cut off anything beyond max_length
    padding="max_length",    # pad shorter inputs up to max_length
    return_tensors="pt",     # PyTorch tensors rather than plain lists
)

# Show the raw text next to the token ids and the padding mask.
print("Original Text:", example)
print("Input IDs:", encoded["input_ids"])
print("Attention Mask:", encoded["attention_mask"])

Original Text: egg incubator digital mini 7 eggs hatching machine poultry hatcher for chicken duck goose quail birds manual egg turning and humidity control 110v
Input IDs: tensor([[  101,  8288,  4297, 19761,  4263,  3617,  7163,  1021,  6763, 11300,
          2075,  3698, 22468, 11300,  2121,  2005,  7975,  9457, 13020, 24209,
         12502,  5055,  6410,  8288,  3810,  1998, 18213,  2491,  7287,  2615,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [28]:
from datasets import Dataset, DatasetDict

# Step 1: Split DataFrame into 3 subsets using the `split_type` column
train_df = df_split[df_split['split_type'] == 'train']
val_df = df_split[df_split['split_type'] == 'val']
test_df = df_split[df_split['split_type'] == 'test']

# Step 2: Convert pandas DataFrames to Hugging Face Datasets.
# Boolean filtering leaves each subset with a non-contiguous integer index,
# which `from_pandas` would otherwise store as an extra `__index_level_0__`
# column (carried through tokenization and written to disk).
# preserve_index=False keeps only the real data columns.
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "val": Dataset.from_pandas(val_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False)
})

# Step 3: Tokenization function for all rows
def tokenize_function(example):
    """Tokenize the ``cleaned_product_name`` field with the module-level tokenizer.

    Despite the parameter name, this is applied with ``batched=True``, so
    ``example`` is a batch of rows rather than a single row.

    Every sequence is truncated and padded to exactly 64 tokens. Returns
    whatever the tokenizer call produces for that input.
    """
    product_names = example["cleaned_product_name"]
    return tokenizer(
        product_names,
        max_length=64,
        truncation=True,
        padding="max_length",
    )

# Steps 4-5: tokenize every split in batches, then expose the target column
# under the name 'labels'.
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column("relevant_code_binary", "labels")
)

# Step 6: return PyTorch tensors, restricted to the three columns listed here
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Sanity check: inspect the first formatted training row
print(tokenized_dataset["train"][0])

Map: 100%|████████████████████████████████████████████████████████████████████| 87997/87997 [00:15<00:00, 5519.50 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████| 11000/11000 [00:01<00:00, 5563.89 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████| 11000/11000 [00:01<00:00, 5692.16 examples/s]

{'labels': tensor(0), 'input_ids': tensor([  101,  8288,  4297, 19761,  4263,  3617,  7163,  1021,  6763, 11300,
         2075,  3698, 22468, 11300,  2121,  2005,  7975,  9457, 13020, 24209,
        12502,  5055,  6410,  8288,  3810,  1998, 18213,  2491,  7287,  2615,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}





In [29]:
# Persist the tokenized DatasetDict (train/val/test) to the relative directory
# "tokenized_dataset/", i.e. relative to the kernel's current working directory.
tokenized_dataset.save_to_disk("tokenized_dataset/")

Saving the dataset (1/1 shards): 100%|█████████████████████████████████████| 87997/87997 [00:00<00:00, 1441557.02 examples/s]
Saving the dataset (1/1 shards): 100%|█████████████████████████████████████| 11000/11000 [00:00<00:00, 1345739.82 examples/s]
Saving the dataset (1/1 shards): 100%|█████████████████████████████████████| 11000/11000 [00:00<00:00, 1570312.24 examples/s]
