<a href="https://colab.research.google.com/github/srnarasim/TAOExperiment/blob/main/TAOExperiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# prompt: Create a CSV file with give content

import csv

data = [
    ['Product', 'Product Description', 'Category'],
    ['Wireless Bluetooth headphones with noise cancellation', 'Headphones', 'Electronics'],
    ['Smartphone with OLED display and 128GB storage', 'Smartphone', 'Electronics'],
    ['Gaming laptop with high refresh rate screen', 'Laptop', 'Electronics'],
    ['Smart home security camera with night vision', 'Smart Home Device', 'Electronics']
]

with open('output.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data)

!cat output.csv


Product,Product Description,Category
Wireless Bluetooth headphones with noise cancellation,Headphones,Electronics
Smartphone with OLED display and 128GB storage,Smartphone,Electronics
Gaming laptop with high refresh rate screen,Laptop,Electronics
Smart home security camera with night vision,Smart Home Device,Electronics


In [2]:
!pip install datasets
!pip install transformers accelerate bitsandbytes



In [7]:
import pandas as pd
from transformers import AutoTokenizer

# Load dataset
df = pd.read_csv("output.csv")

# Load tokenizer (using LLaMA model)
model_name = "meta-llama/Llama-2-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Convert category labels to numeric values
category_mapping = {category: idx for idx, category in enumerate(df["Category"].unique())}
df["Label"] = df["Category"].map(category_mapping)

# Explicitly set a max_length value
max_length = 128  # Choose an appropriate value for your texts

# Tokenize product descriptions
encoded_data = tokenizer(
    df["Product Description"].tolist(),
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt"
)

# Now you can access encoded_data["input_ids"] and encoded_data["attention_mask"]

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [4]:
df[["tokenized", "Label"]]

Unnamed: 0,tokenized,Label
0,"[input_ids, attention_mask]",0
1,"[input_ids, attention_mask]",0
2,"[input_ids, attention_mask]",0
3,"[input_ids, attention_mask]",0


In [5]:

import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# First, we need to modify how we handle the tokenized data
# Instead of keeping BatchEncoding objects in the DataFrame, let's expand them

# Assuming df contains your data with the tokenized column

# Extract and expand the tokenized features
input_ids_list = []
attention_mask_list = []

for item in df['tokenized']:
    input_ids_list.append(item['input_ids'])
    attention_mask_list.append(item['attention_mask'])

# Add these as separate columns in the DataFrame
df['input_ids'] = input_ids_list
df['attention_mask'] = attention_mask_list

# Now create the dataset without the problematic tokenized column
dataset = Dataset.from_pandas(df[['input_ids', 'attention_mask', 'Label', 'Product', 'Product Description', 'Category']])

# Split dataset into train and validation sets
dataset_split = dataset.train_test_split(test_size=0.2)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(category_mapping))

# Define compute_metrics function for evaluation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

OSError: meta-llama/Llama-2-7b does not appear to have a file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt or flax_model.msgpack.