<a href="https://colab.research.google.com/github/srnarasim/TAOExperiment/blob/main/TAOExperiment_fixed_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TAO Experiment - Text Classification with Test-time Adaptation

In [1]:
# Create a CSV file with sample content

import csv

data = [
    ['Product', 'Product Description', 'Category'],
    ['Wireless Bluetooth headphones with noise cancellation', 'Headphones', 'Electronics'],
    ['Smartphone with OLED display and 128GB storage', 'Smartphone', 'Electronics'],
    ['Gaming laptop with high refresh rate screen', 'Laptop', 'Electronics'],
    ['Smart home security camera with night vision', 'Smart Home Device', 'Electronics'],
    ['Cotton t-shirt with graphic print design', 'T-shirt', 'Clothing'],
    ['Wooden dining table with six matching chairs', 'Dining Table', 'Furniture'],
    ['Genuine leather wallet with multiple card slots', 'Wallet', 'Accessories'],
    ['Insulated stainless steel water bottle', 'Water Bottle', 'Kitchen']
]

with open('balanced_data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data)

!cat balanced_data.csv

Product,Product Description,Category
Wireless Bluetooth headphones with noise cancellation,Headphones,Electronics
Smartphone with OLED display and 128GB storage,Smartphone,Electronics
Gaming laptop with high refresh rate screen,Laptop,Electronics
Smart home security camera with night vision,Smart Home Device,Electronics
Cotton t-shirt with graphic print design,T-shirt,Clothing
Wooden dining table with six matching chairs,Dining Table,Furniture
Genuine leather wallet with multiple card slots,Wallet,Accessories
Insulated stainless steel water bottle,Water Bottle,Kitchen


In [2]:
!pip install datasets transformers accelerate bitsandbytes

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_c

In [3]:
import pandas as pd
from transformers import AutoTokenizer

# Load dataset
df = pd.read_csv("balanced_data.csv")

# Load tokenizer (using BERT model)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create category mapping
category_mapping = {category: idx for idx, category in enumerate(df["Category"].unique())}
df["Label"] = df["Category"].map(category_mapping)

# Tokenize product descriptions
max_length = 128
encoded_data = tokenizer(
    df["Product Description"].tolist(),
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt"
)

# Store tokenized data in DataFrame
df["input_ids"] = encoded_data["input_ids"].tolist()
df["attention_mask"] = encoded_data["attention_mask"].tolist()

print("Tokenization completed. DataFrame columns:", df.columns.tolist())

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenization completed. DataFrame columns: ['Product', 'Product Description', 'Category', 'Label', 'input_ids', 'attention_mask']


In [13]:
import torch
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, default_data_collator

# ... (rest of your code) ...

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # Ensure this matches the metric returned by compute_metrics
    # ... (rest of your training arguments) ...
)

# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate and return the accuracy
    return {"accuracy": (preds == labels).mean()}

# Initialize trainer, include compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test["train"],
    eval_dataset=train_test["test"],
    compute_metrics=compute_metrics # Pass the function to the Trainer
)

# Train the model
print("Starting training...")
trainer.train()



Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.711812,0.5
2,No log,1.777699,0.5
3,No log,1.784575,0.5
4,No log,1.796745,0.5
5,No log,1.818792,0.5


TrainOutput(global_step=10, training_loss=1.3477643013000489, metrics={'train_runtime': 106.661, 'train_samples_per_second': 0.281, 'train_steps_per_second': 0.094, 'total_flos': 77085393300.0, 'train_loss': 1.3477643013000489, 'epoch': 5.0})

## Test-time Adaptation for New Categories
This section implements test-time adaptation to detect products from new, unseen categories.

In [14]:
import torch.nn.functional as F
import numpy as np

def predict_product_category(text, entropy_threshold=1.5, confidence_threshold=0.4):
    # Tokenize input
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

        # Convert to probabilities using softmax
        probabilities = F.softmax(logits, dim=-1).cpu().numpy()[0]

        # Calculate entropy
        entropy = -np.sum(probabilities * np.log(probabilities + 1e-9))

        # Calculate confidence
        confidence = np.max(probabilities)

        # Make prediction
        if entropy > entropy_threshold or confidence < confidence_threshold:
            print(f"Product '{text}' might be a NEW category!")
            print(f"Entropy: {entropy:.3f}, Confidence: {confidence:.3f}")
            return 'New Category'
        else:
            predicted_idx = np.argmax(probabilities)
            category = list(category_mapping.keys())[predicted_idx]
            print(f"Product '{text}' classified as: {category}")
            print(f"Confidence: {confidence:.3f}, Entropy: {entropy:.3f}")
            return category

# Test with various product descriptions
test_products = [
    # Known categories
    "Wireless gaming headphones with RGB lighting",
    "Wooden dining table with extendable leaf",
    "Classic leather wallet with coin pocket",
    "Cotton polo shirt with embroidered logo",

    # Potentially new categories
    "Smart fitness tracker with heart rate monitor",
    "Electric scooter with foldable design",
    "Organic green tea from Japan",
    "Professional oil painting set with easel",
    "Garden tools set with pruning shears",
    "Yoga mat with alignment lines"
]

print("Testing product classification with test-time adaptation:\n")
results = []
for product in test_products:
    category = predict_product_category(product)
    results.append({'Product': product, 'Predicted Category': category})
    print("-" * 80 + "\n")

# Display results in a DataFrame
results_df = pd.DataFrame(results)
print("\nSummary of Results:")
print(results_df)

Testing product classification with test-time adaptation:

Product 'Wireless gaming headphones with RGB lighting' might be a NEW category!
Entropy: 1.541, Confidence: 0.290
--------------------------------------------------------------------------------

Product 'Wooden dining table with extendable leaf' might be a NEW category!
Entropy: 1.545, Confidence: 0.278
--------------------------------------------------------------------------------

Product 'Classic leather wallet with coin pocket' might be a NEW category!
Entropy: 1.553, Confidence: 0.272
--------------------------------------------------------------------------------

Product 'Cotton polo shirt with embroidered logo' might be a NEW category!
Entropy: 1.557, Confidence: 0.277
--------------------------------------------------------------------------------

Product 'Smart fitness tracker with heart rate monitor' might be a NEW category!
Entropy: 1.540, Confidence: 0.287
--------------------------------------------------------

In [None]:
# Create a proper dataset from the tokenized data
from datasets import Dataset

# Convert lists stored as strings back to actual lists if needed
import ast

# Create a function to safely convert string representations of lists to actual lists
def safe_eval(x):
    if isinstance(x, list):
        return x
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return x

# Prepare the dataset
dataset_dict = {
    'input_ids': [safe_eval(ids) for ids in df['input_ids']],
    'attention_mask': [safe_eval(mask) for mask in df['attention_mask']],
    'labels': df['Label'].tolist()
}

# Create the dataset
new_dataset = Dataset.from_dict(dataset_dict)
print(f"Created dataset with {len(new_dataset)} examples")
print(f"Dataset features: {new_dataset.features}")


In [16]:
from peft import get_peft_model, LoraConfig
from transformers import default_data_collator, Trainer, TrainingArguments

# ... (rest of your code) ...

# Reinitialize trainer with updated training_args and data_collator, pass compute_metrics to the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=new_dataset,  # Use new_dataset for fine-tuning
    eval_dataset=new_dataset,   # Use new_dataset for evaluation as well
    data_collator=default_data_collator,  # Ensure proper data collation
    compute_metrics=lambda pred: {'accuracy': (pred.predictions.argmax(-1) == pred.label_ids).mean()} # Pass compute_metrics here
)

# Modify the compute_loss method to accept num_items_in_batch
def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): # Add num_items_in_batch as an argument
    """
    How the loss is computed by Trainer. By default, all models return the loss in the first element.
    Subclass and override for custom behavior.
    """
    if self.label_smoother is not None and "labels" in inputs:
        labels = inputs.pop("labels")
    else:
        labels = None

    # Remove num_items_in_batch from inputs if it exists to avoid issues with the model
    inputs.pop("num_items_in_batch", None)

    outputs = model(**inputs)

    # Save past state if it exists
    # TODO: this needs to be fixed and made cleaner later.
    if self.args.past_index >= 0:
        self._past_state = outputs[self.args.past_index]

    if labels is not None:
        loss = self.label_smoother(outputs, labels)
    else:
        # We don't use .loss here since the model may return tuples instead of ModelOutput.
        loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

    return (loss, outputs) if return_outputs else loss

# Assign the modified compute_loss method to the trainer
trainer.compute_loss = compute_loss.__get__(trainer) # type: ignore

trainer.train()

ValueError: You have to specify either input_ids or inputs_embeds