In [1]:
pip install transformers accelerate


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
def classify_stage(row):
    """Assign a customer-journey stage label to a single dataset row.

    ``row`` is indexed by key and must provide "topics_overlap",
    "topic_label", "sentiment" and "final_consolidated_emotional_tone".

    Rules are tried in order and the first match wins:

    1. overlap >= 0.2 -> "Post-Purchase".
    2. 0.1 <= overlap < 0.2 -> the topic decides. The "Consideration" topics
       are checked before the "Decision" topics, so "Quality & Reviews"
       (present in both groups) always resolves to "Consideration" here.
    3. overlap < 0.1 -> the topic decides between "Awareness" and "Decision".
    4. If no topic rule matched (or the overlap satisfies none of the
       comparisons, e.g. NaN), sentiment and emotional tone decide.
    5. Anything left over is "Awareness".

    Returns one of "Post-Purchase", "Consideration", "Decision", "Awareness".
    """
    overlap = row["topics_overlap"]
    topic = row["topic_label"]
    sentiment = row["sentiment"]
    emotional_tone = row["final_consolidated_emotional_tone"]

    decision_topics = ("Performance & Specifications", "Quality & Reviews")

    # Strong overlap short-circuits every other rule.
    if overlap >= 0.2:
        return "Post-Purchase"

    # Pick the ordered (topics, stage) rules for the overlap band.
    if overlap >= 0.1:
        topic_rules = (
            (("Connectivity & Portability", "Quality & Reviews"), "Consideration"),
            (decision_topics, "Decision"),
        )
    elif overlap < 0.1:
        topic_rules = (
            (("Design & Usability", "Protection & Packaging"), "Awareness"),
            (decision_topics, "Decision"),
        )
    else:
        # Overlap compared False against every threshold; no topic rule applies.
        topic_rules = ()

    for topics, stage in topic_rules:
        if topic in topics:
            return stage

    # Sentiment / tone fallback, in priority order.
    if emotional_tone == "Positive" and sentiment == "Positive":
        return "Post-Purchase"
    if emotional_tone == "Neutral" or sentiment == "Neutral":
        return "Consideration"
    if emotional_tone == "Negative" or sentiment == "Negative":
        return "Decision"

    # A "Mixed" tone and every remaining combination share the default stage.
    return "Awareness"


In [3]:
import pandas as pd

# Load dataset
file_path = "dataset_with_topic_labels.csv"  # Replace with the actual path to your dataset
df = pd.read_csv(file_path)

# Fail fast when a column that later cells read is absent, instead of hitting a
# KeyError deep inside df.apply(...) or the train/validation split.
required_columns = [
    "topics_overlap",
    "topic_label",
    "sentiment",
    "final_consolidated_emotional_tone",
    "processed_text",
]
missing_columns = [column for column in required_columns if column not in df.columns]
if missing_columns:
    raise KeyError(f"{file_path} is missing required columns: {missing_columns}")

# Display basic information
print(df.head())  # Inspect the first few rows of the dataset
print(df.columns)  # View column names


   product_id                                      product_title  \
0  B07CZDXDG8  INIU Portable Charger, Slimmest 10000mAh 5V/3A...   
1  B07CZDXDG8  INIU Portable Charger, Slimmest 10000mAh 5V/3A...   
2  B07CZDXDG8  INIU Portable Charger, Slimmest 10000mAh 5V/3A...   
3  B07CZDXDG8  INIU Portable Charger, Slimmest 10000mAh 5V/3A...   
4  B07CZDXDG8  INIU Portable Charger, Slimmest 10000mAh 5V/3A...   

                                 product_description product_price  \
0  From INIU--the SAFE Fast Charge Pro: Experienc...        $19.99   
1  From INIU--the SAFE Fast Charge Pro: Experienc...        $19.99   
2  From INIU--the SAFE Fast Charge Pro: Experienc...        $19.99   
3  From INIU--the SAFE Fast Charge Pro: Experienc...        $19.99   
4  From INIU--the SAFE Fast Charge Pro: Experienc...        $19.99   

                                         product_url  \
0  https://www.amazon.com/INIU-High-Speed-Flashli...   
1  https://www.amazon.com/INIU-High-Speed-Flashli...   
2 

In [4]:
# Label every row with its journey stage; classify_stage receives one row at a
# time because the function is applied along the column axis.
df["stage"] = df.apply(classify_stage, axis="columns")

# Preview the labelled frame, then show how many rows landed in each stage.
print(df.head())
print(df["stage"].value_counts())


   product_id                                      product_title  \
0  B07CZDXDG8  INIU Portable Charger, Slimmest 10000mAh 5V/3A...   
1  B07CZDXDG8  INIU Portable Charger, Slimmest 10000mAh 5V/3A...   
2  B07CZDXDG8  INIU Portable Charger, Slimmest 10000mAh 5V/3A...   
3  B07CZDXDG8  INIU Portable Charger, Slimmest 10000mAh 5V/3A...   
4  B07CZDXDG8  INIU Portable Charger, Slimmest 10000mAh 5V/3A...   

                                 product_description product_price  \
0  From INIU--the SAFE Fast Charge Pro: Experienc...        $19.99   
1  From INIU--the SAFE Fast Charge Pro: Experienc...        $19.99   
2  From INIU--the SAFE Fast Charge Pro: Experienc...        $19.99   
3  From INIU--the SAFE Fast Charge Pro: Experienc...        $19.99   
4  From INIU--the SAFE Fast Charge Pro: Experienc...        $19.99   

                                         product_url  \
0  https://www.amazon.com/INIU-High-Speed-Flashli...   
1  https://www.amazon.com/INIU-High-Speed-Flashli...   
2 

In [5]:
from sklearn.model_selection import train_test_split

# Model inputs come from 'processed_text'; targets are the derived 'stage' labels.
texts, labels = df["processed_text"], df["stage"]

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.2,
)

# Report how many examples ended up in each split.
print(f"Training Samples: {len(train_texts)}")
print(f"Validation Samples: {len(val_texts)}")


Training Samples: 622
Validation Samples: 156


In [6]:
# Give each distinct stage an integer id, numbered in order of first appearance
# in the 'stage' column.
label_map = dict(
    (stage_name, stage_id)
    for stage_id, stage_name in enumerate(pd.unique(df["stage"]))
)

# Show the mapping so it can be checked by eye.
print("Label Map:", label_map)


Label Map: {'Decision': 0, 'Post-Purchase': 1, 'Awareness': 2, 'Consideration': 3}


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint used for both the tokenizer and the model
model_name = "meta-llama/Llama-3.2-3B"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add a padding token only if the tokenizer does not already define one
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the model with one output per stage label
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_map))

# Grow the embedding matrix only when the tokenizer gained tokens; never shrink it
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# The sequence-classification head needs to know the padding id: it uses it to
# locate the last real token of each sequence and refuses batches larger than
# one when config.pad_token_id is unset.
model.config.pad_token_id = tokenizer.pad_token_id

# Enable gradient checkpointing
model.gradient_checkpointing_enable()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [8]:
import torch

# Tokenization function
def tokenize_data(texts, labels, tokenizer, max_length=256, label_to_id=None):
    """Tokenize ``texts`` and attach integer-encoded ``labels``.

    Args:
        texts: Iterable of input strings (materialised with ``list``).
        labels: Iterable of label names, one per text.
        tokenizer: Callable invoked as
            ``tokenizer(list_of_texts, padding=True, truncation=True,
            max_length=..., return_tensors="pt")``; its result must support
            item assignment.
        max_length: Truncation length forwarded to the tokenizer.
        label_to_id: Optional mapping from label name to integer id. When
            omitted, the notebook-level ``label_map`` is used.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding a
        ``torch.long`` tensor of label ids.

    Raises:
        ValueError: If the number of texts and labels differ.
        KeyError: If a label is missing from the mapping.
    """
    if label_to_id is None:
        # Fall back to the mapping built earlier in the notebook.
        label_to_id = label_map

    text_list = list(texts)
    label_list = list(labels)
    if len(text_list) != len(label_list):
        raise ValueError(
            f"Got {len(text_list)} texts but {len(label_list)} labels"
        )

    # Validate labels before the (potentially slow) tokenizer call.
    unknown_labels = sorted({str(label) for label in label_list if label not in label_to_id})
    if unknown_labels:
        raise KeyError(f"Labels missing from the label mapping: {unknown_labels}")

    tokenized = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    # An explicit dtype keeps the tensor integer-typed even for an empty input.
    tokenized["labels"] = torch.tensor(
        [label_to_id[label] for label in label_list], dtype=torch.long
    )
    return tokenized

# Encode each split separately; padding is therefore computed per split.
train_encodings = tokenize_data(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_encodings = tokenize_data(texts=val_texts, labels=val_labels, tokenizer=tokenizer)


In [9]:
from datasets import Dataset

# Function to convert tokenized data to Hugging Face Dataset
def create_hf_dataset(encodings):
    """Build a ``Dataset`` from tokenizer output.

    Every entry of ``encodings`` except "labels" is copied first, then
    "labels" is added, and the resulting dict is handed to
    ``Dataset.from_dict``.
    """
    columns = {}
    for field_name in encodings:
        if field_name == "labels":
            continue
        columns[field_name] = encodings[field_name]
    columns["labels"] = encodings["labels"]
    return Dataset.from_dict(columns)

# Wrap both encoded splits as Hugging Face datasets (training first, then validation).
train_dataset, val_dataset = (
    create_hf_dataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings)
)

# Show the first example of each split as a sanity check.
print("Training Dataset Sample:", train_dataset[0])
print("Validation Dataset Sample:", val_dataset[0])


Training Dataset Sample: {'input_ids': [128000, 55397, 22658, 2853, 53421, 304, 23960, 18991, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': 2}
Validation Dataset Sample: {'input_ids': [128000, 9533, 18991, 1579, 51842, 1579, 22867, 2612, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': 2}


In [10]:
pip install deepspeed


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where run artefacts and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Evaluate once per epoch, log every 10 steps, never write checkpoints
    evaluation_strategy='epoch',
    logging_steps=10,
    save_strategy='no',
    # No external experiment tracker
    report_to="none",
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
import torch
# Query CUDA once and reuse the answer for both report lines.
cuda_available = torch.cuda.is_available()
print(f"CUDA Available: {cuda_available}")

# Only ask for a device name when a CUDA device exists; otherwise report CPU.
device_name = torch.cuda.get_device_name(0) if cuda_available else 'CPU'
print(f"Device Name: {device_name}")


CUDA Available: True
Device Name: NVIDIA A100-SXM4-40GB


In [None]:
# bert  (stray marker kept as a comment: a bare name here raises NameError on Run All)

In [18]:
import os
import torch
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification

# Ensure CUDA paths are set.
# Keep a CUDA_HOME that the environment already exports; otherwise fall back to
# the default toolkit location. The variables are only touched when that
# directory really exists, so a missing toolkit is reported here rather than
# surfacing later as a confusing build error.
cuda_home = os.environ.get("CUDA_HOME") or "/usr/local/cuda-12.3"
if os.path.isdir(cuda_home):
    os.environ["CUDA_HOME"] = cuda_home
    for env_var, subdir in (("PATH", "bin"), ("LD_LIBRARY_PATH", "lib64")):
        entry = f"{cuda_home}/{subdir}"
        current = os.environ.get(env_var, "")
        # Prepend once only, so re-running the cell does not keep growing the
        # variable, and never leave an empty trailing entry behind.
        if entry not in current.split(os.pathsep):
            os.environ[env_var] = f"{entry}{os.pathsep}{current}" if current else entry
else:
    print(
        f"Warning: CUDA toolkit directory not found at {cuda_home}; "
        "CUDA_HOME, PATH and LD_LIBRARY_PATH were left unchanged"
    )

# Verify CUDA availability (the device name is only queried when a GPU is present)
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)
print("Device Name:", torch.cuda.get_device_name(0) if cuda_available else "CPU")

# Load model and tokenizer
model_name = "meta-llama/Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

# Tokenizer update for padding: add a pad token only when none is defined, and
# grow (never shrink) the embedding matrix to cover any newly added token.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# The sequence-classification head needs the padding id to locate the last real
# token of each sequence; with config.pad_token_id unset it refuses batches
# larger than one.
model.config.pad_token_id = tokenizer.pad_token_id

# Training Arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation schedule
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    fp16=True,  # Enable mixed precision
    # Evaluate and checkpoint once per epoch; log every 10 steps
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    report_to="none",  # Avoid external logging
)

# Train and evaluate on the real tokenized splits built by create_hf_dataset
# earlier in the notebook. train_dataset is deliberately NOT reassigned here,
# so the training split stays intact; eval_dataset is an alias for the
# validation split. A NameError at this point means the dataset-creation cell
# has not been run yet.
from datasets import Dataset

eval_dataset = val_dataset
print(f"Training examples: {len(train_dataset)} | Evaluation examples: {len(eval_dataset)}")

# Define compute metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 / precision / recall for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is
            expected to expose ``argmax(axis=1)`` (presumably a
            ``(num_examples, num_labels)`` logits array - confirm against the
            caller); if it is a tuple, its first element is used.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    predictions, labels = eval_pred
    # Some models return extra outputs alongside the logits; keep only the first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    preds = predictions.argmax(axis=1)

    # zero_division=0 scores a class that is never predicted (or never present)
    # as 0 without emitting an undefined-metric warning on every evaluation.
    weighted = {"average": "weighted", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, **weighted),
        "precision": precision_score(labels, preds, **weighted),
        "recall": recall_score(labels, preds, **weighted),
    }

# Trainer setup: wire the model, its tokenizer, the run configuration, the
# metric callback and the two datasets together.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune, then score the evaluation dataset and report the metrics.
trainer.train()
results = trainer.evaluate()
print("Evaluation Results:", results)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Label Map: {'Decision': 0, 'Post-Purchase': 1, 'Awareness': 2, 'Consideration': 3}


  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
/software/slurm/spackages/linux-rocky8-x86_64/gcc-12.2.0/anaconda3-2023.09-0-3mhml42fa64byxqyd5fig5tbih625dp2/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


MissingCUDAException: CUDA_HOME does not exist, unable to compile CUDA op(s)