# TAO Experiment - Text Classification with Test-time Adaptation

In [None]:
# Create a CSV file with sample content

import csv

# Sample catalogue written to disk for the rest of the notebook.
# The header order must match the row layout: the long free-text description
# comes first, then the short product name, then the category label.
# (Previously the first two header names were swapped, so the
# "Product Description" column actually contained the short names.)
data = [
    ['Product Description', 'Product', 'Category'],
    ['Wireless Bluetooth headphones with noise cancellation', 'Headphones', 'Electronics'],
    ['Smartphone with OLED display and 128GB storage', 'Smartphone', 'Electronics'],
    ['Gaming laptop with high refresh rate screen', 'Laptop', 'Electronics'],
    ['Smart home security camera with night vision', 'Smart Home Device', 'Electronics'],
    ['Cotton t-shirt with graphic print design', 'T-shirt', 'Clothing'],
    ['Wooden dining table with six matching chairs', 'Dining Table', 'Furniture'],
    ['Genuine leather wallet with multiple card slots', 'Wallet', 'Accessories'],
    ['Insulated stainless steel water bottle', 'Water Bottle', 'Kitchen']
]

# newline='' hands line-ending control to the csv module, as its docs recommend.
with open('balanced_data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data)

# Echo the file back to confirm what was written. Pure Python is used instead
# of `!cat` so the cell also works where no `cat` binary is available.
with open('balanced_data.csv', newline='') as csvfile:
    print(csvfile.read(), end='')

In [None]:
!pip install datasets transformers accelerate bitsandbytes

In [None]:
import pandas as pd
from transformers import AutoTokenizer

# ---- Data ----------------------------------------------------------------
# Read back the CSV written by the first cell.
df = pd.read_csv("balanced_data.csv")

# ---- Tokenizer -----------------------------------------------------------
# The same checkpoint name is reused later when the classifier is created.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ---- Labels --------------------------------------------------------------
# Assign each distinct category an integer id, in order of first appearance.
category_mapping = dict(
    (category, idx) for idx, category in enumerate(df["Category"].unique())
)
df["Label"] = df["Category"].map(category_mapping)

# ---- Tokenization --------------------------------------------------------
# Encode every description in a single batch call and get PyTorch tensors back.
max_length = 128
descriptions = df["Product Description"].tolist()
encoded_data = tokenizer(
    descriptions,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt",
)

# Keep the encoded sequences next to their source rows as plain Python lists.
for column in ("input_ids", "attention_mask"):
    df[column] = encoded_data[column].tolist()

print("Tokenization completed. DataFrame columns:", df.columns.tolist())

In [None]:
import torch
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

import inspect

import numpy as np

# Create dataset from the tensors produced by the tokenization cell.
dataset_dict = {
    'input_ids': encoded_data['input_ids'].numpy(),
    'attention_mask': encoded_data['attention_mask'].numpy(),
    'labels': df['Label'].values
}

dataset = Dataset.from_dict(dataset_dict)
# A fixed seed keeps the train/test split identical across re-runs.
train_test = dataset.train_test_split(test_size=0.2, seed=42)

# Initialize model with one output logit per known category.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(category_mapping)
)


def compute_metrics(eval_pred):
    """Return the accuracy of the argmax predictions against the true labels.

    The Trainer only reports the evaluation loss unless a metrics function is
    supplied. Because `metric_for_best_model="accuracy"` is requested below,
    an "accuracy" entry must be produced here so the best checkpoint can be
    selected.
    """
    predicted_labels = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float(np.mean(predicted_labels == eval_pred.label_ids))}


# The evaluation-schedule argument is named `eval_strategy` in recent
# transformers releases and `evaluation_strategy` in older ones; pick whichever
# the installed version accepts so the cell runs on both.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    **{eval_strategy_arg: "epoch"},
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test["train"],
    eval_dataset=train_test["test"],
    compute_metrics=compute_metrics,
)

# Train the model
print("Starting training...")
trainer.train()

## Test-time Adaptation for New Categories
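This section flags products that may belong to new, unseen categories by thresholding the entropy and the maximum softmax probability of the classifier's prediction. Note that the model's weights are not updated at inference time; inputs the model is uncertain about are simply reported as a potential new category.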

In [None]:
import torch.nn.functional as F
import numpy as np

def predict_product_category(text, entropy_threshold=1.5, confidence_threshold=0.4):
    """Classify a product description, or flag it as a possible new category.

    The description is tokenized with the notebook-level ``tokenizer`` and
    scored with the notebook-level ``model``. The softmax distribution over
    the known categories is summarised by its Shannon entropy (natural log)
    and its largest probability. If the entropy is above
    ``entropy_threshold`` or the largest probability is below
    ``confidence_threshold``, the input is reported as a new category.

    Args:
        text: Product description to classify. Only the first sequence of the
            model output is used.
        entropy_threshold: Entropy above which the prediction is treated as
            too uncertain. A uniform distribution over K classes has entropy
            ln(K), so pick this relative to the number of categories.
        confidence_threshold: Minimum top-class probability required to
            accept the prediction.

    Returns:
        The predicted category name from ``category_mapping``, or the string
        ``'New Category'`` when the prediction is too uncertain.

    Side effects:
        Prints the decision together with its entropy and confidence, and
        switches ``model`` to evaluation mode.
    """
    # The Trainer may have moved the model to an accelerator; the inputs must
    # live on the same device or the forward pass raises a device mismatch.
    device = next(model.parameters()).device

    # Tokenize input
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Disable dropout so repeated calls on the same text give the same scores.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Convert to probabilities using softmax
    probabilities = F.softmax(logits, dim=-1).cpu().numpy()[0]

    # The small epsilon guards against log(0) for probabilities that underflow.
    entropy = -np.sum(probabilities * np.log(probabilities + 1e-9))
    confidence = np.max(probabilities)

    if entropy > entropy_threshold or confidence < confidence_threshold:
        print(f"Product '{text}' might be a NEW category!")
        print(f"Entropy: {entropy:.3f}, Confidence: {confidence:.3f}")
        return 'New Category'

    # category_mapping assigns ids in insertion order, so the position of a
    # key in the dict equals its label id.
    predicted_idx = int(np.argmax(probabilities))
    category = list(category_mapping.keys())[predicted_idx]
    print(f"Product '{text}' classified as: {category}")
    print(f"Confidence: {confidence:.3f}, Entropy: {entropy:.3f}")
    return category

# Descriptions that resemble the categories present in the training data.
known_category_examples = [
    "Wireless gaming headphones with RGB lighting",
    "Wooden dining table with extendable leaf",
    "Classic leather wallet with coin pocket",
    "Cotton polo shirt with embroidered logo",
]

# Descriptions that may not fit any of the training categories.
new_category_examples = [
    "Smart fitness tracker with heart rate monitor",
    "Electric scooter with foldable design",
    "Organic green tea from Japan",
    "Professional oil painting set with easel",
    "Garden tools set with pruning shears",
    "Yoga mat with alignment lines",
]

test_products = known_category_examples + new_category_examples

print("Testing product classification with test-time adaptation:\n")

# Run every example through the classifier, collecting one record per product
# and printing a divider after each prediction's own output.
divider = "-" * 80 + "\n"
results = []
for product in test_products:
    category = predict_product_category(product)
    record = {'Product': product, 'Predicted Category': category}
    results.append(record)
    print(divider)

# Tabulate the collected records for a compact overview.
results_df = pd.DataFrame(results)
print("\nSummary of Results:")
print(results_df)