# TAO Experiment - Text Classification

In [None]:
# Create a CSV file with sample content

import csv

data = [
    ['Product', 'Product Description', 'Category'],
    ['Wireless Bluetooth headphones with noise cancellation', 'Headphones', 'Electronics'],
    ['Smartphone with OLED display and 128GB storage', 'Smartphone', 'Electronics'],
    ['Gaming laptop with high refresh rate screen', 'Laptop', 'Electronics'],
    ['Smart home security camera with night vision', 'Smart Home Device', 'Electronics']
]

with open('output.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data)

!cat output.csv

In [None]:
!pip install datasets
!pip install transformers accelerate bitsandbytes

In [None]:
import pandas as pd
from transformers import AutoTokenizer

# Read the sample CSV written earlier in the notebook
df = pd.read_csv("output.csv")

# Tokenizer for the publicly available BERT checkpoint
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Give each distinct category an integer id and store it per row
unique_categories = df["Category"].unique()
category_mapping = dict(zip(unique_categories, range(len(unique_categories))))
df["Label"] = df["Category"].map(category_mapping)

# Upper bound on tokens per text; choose a value appropriate for your texts
max_length = 128

# Tokenize every product description in a single batched call
descriptions = df["Product Description"].tolist()
encoded_data = tokenizer(
    descriptions,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt",
)

# Attach the tokenizer outputs to the DataFrame as plain Python lists
for field in ("input_ids", "attention_mask"):
    df[field] = encoded_data[field].tolist()

print("Tokenization completed. DataFrame columns:", df.columns.tolist())

In [None]:
# Show each description next to its category, numeric label and tokenizer outputs
display_columns = ["Product Description", "Category", "Label", "input_ids", "attention_mask"]
df[display_columns]

In [None]:
import torch
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Create the dataset. Trainer only forwards columns that match the model's
# forward() arguments (plus "label"/"labels"), so the capitalised "Label"
# column would be dropped and no loss could be computed; expose it as "labels".
dataset = Dataset.from_pandas(df).rename_column("Label", "labels")

# Split dataset into train and validation sets.
# A fixed seed makes the split identical on every Restart & Run All.
dataset_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

# Load pre-trained model with one output per category.
# problem_type is set explicitly because a single-category mapping
# (num_labels == 1) would otherwise make the model default to a regression
# (MSE) loss, which does not fit integer class labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(category_mapping),
    problem_type="single_label_classification",
)

# Metric callback used during evaluation
def compute_metrics(eval_pred):
    """Compute accuracy from a (predictions, labels) pair.

    Args:
        eval_pred: An iterable that unpacks into ``(predictions, labels)``.
            ``predictions`` must support ``.argmax(axis=1)`` (e.g. a 2-D array
            of per-class scores) and ``labels`` must be comparable
            element-wise with the resulting class indices.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the mean of the
        element-wise matches between predicted class indices and labels.
    """
    scores, references = eval_pred
    predicted_classes = scores.argmax(axis=1)
    matches = predicted_classes == references
    return {"accuracy": matches.mean()}

# Training arguments.
# The evaluation-schedule argument is called `evaluation_strategy` in older
# transformers releases and `eval_strategy` in newer ones. The install cell
# does not pin a version, so use whichever name the installed release accepts.
import inspect

_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_arg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",  # must match the evaluation schedule for load_best_model_at_end
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    **{eval_strategy_arg: "epoch"},
)

# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

trainer.train()