bert

In [1]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from torch.utils.data import Dataset

# Disable DeepSpeed and CUDA if necessary
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Force CPU if needed

# Load dataset
file_path = "dataset_with_topic_labels.csv"
df = pd.read_csv(file_path)

def classify_stage(row):
    overlap = row["topics_overlap"]
    topic = row["topic_label"]
    sentiment = row["sentiment"]
    emotional_tone = row["final_consolidated_emotional_tone"]

    if overlap >= 0.2:
        return "Post-Purchase"
    if 0.1 <= overlap < 0.2:
        if topic in ["Connectivity & Portability", "Quality & Reviews"]:
            return "Consideration"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"
    if overlap < 0.1:
        if topic in ["Design & Usability", "Protection & Packaging"]:
            return "Awareness"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"

    if emotional_tone == "Positive" and sentiment == "Positive":
        return "Post-Purchase"
    elif emotional_tone == "Neutral" or sentiment == "Neutral":
        return "Consideration"
    elif emotional_tone == "Negative" or sentiment == "Negative":
        return "Decision"
    elif emotional_tone == "Mixed":
        return "Awareness"

    return "Awareness"

# Apply classify_stage
df["stage"] = df.apply(classify_stage, axis=1)

# Create label map
label_map = {label: i for i, label in enumerate(df["stage"].unique())}
print("Label Map:", label_map)

# Define custom dataset
class CommentsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_length)
        self.labels = [label_map[label] for label in labels]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["processed_text"], df["stage"], test_size=0.2, random_state=42
)

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_map))

train_dataset = CommentsDataset(train_texts, train_labels, tokenizer)
val_dataset = CommentsDataset(val_texts, val_labels, tokenizer)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
        "precision": precision_score(labels, preds, average="weighted"),
        "recall": recall_score(labels, preds, average="weighted"),
    }

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_dir="./logs",
    learning_rate=2e-5,
    fp16=False,  # Avoid mixed precision
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Train
print("Starting training...")
trainer.train()

# Evaluate
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

# Extract true and predicted labels
predictions, labels, _ = trainer.predict(val_dataset)
y_pred = predictions.argmax(-1)
y_true = labels

# Classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=label_map.keys()))

# Calculate metrics for "Awareness"
awareness_label = label_map["Awareness"]

# Total number of true "Awareness" samples
total_awareness = sum(y_true == awareness_label)

# Number of correct "Awareness" predictions
correct_awareness = sum((y_true == awareness_label) & (y_pred == awareness_label))

# Calculate precision, recall, and accuracy for "Awareness"
precision_awareness = correct_awareness / sum(y_pred == awareness_label) if sum(y_pred == awareness_label) > 0 else 0
recall_awareness = correct_awareness / total_awareness if total_awareness > 0 else 0
accuracy_awareness = correct_awareness / len(y_true)

print(f"\nMetrics for 'Awareness':")
print(f"Total 'Awareness' Samples: {total_awareness}")
print(f"Correct 'Awareness' Predictions: {correct_awareness}")
print(f"Precision: {precision_awareness:.2f}")
print(f"Recall: {recall_awareness:.2f}")
print(f"Accuracy: {accuracy_awareness:.2f}")


Label Map: {'Decision': 0, 'Post-Purchase': 1, 'Awareness': 2, 'Consideration': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.730952,0.769231,0.688025,0.626233,0.769231
2,No log,0.539022,0.801282,0.738543,0.780596,0.801282
3,No log,0.496934,0.833333,0.795813,0.816901,0.833333


  _warn_prf(average, modifier, msg_start, len(result))


Evaluation Metrics: {'eval_loss': 0.4969337582588196, 'eval_accuracy': 0.8333333333333334, 'eval_f1': 0.7958129611355418, 'eval_precision': 0.8169008929878495, 'eval_recall': 0.8333333333333334, 'eval_runtime': 4.0235, 'eval_samples_per_second': 38.773, 'eval_steps_per_second': 1.243, 'epoch': 3.0}

Classification Report:
               precision    recall  f1-score   support

     Decision       0.88      0.97      0.93        70
Post-Purchase       0.78      0.25      0.38        28
    Awareness       0.80      1.00      0.89        55
Consideration       0.00      0.00      0.00         3

     accuracy                           0.83       156
    macro avg       0.61      0.56      0.55       156
 weighted avg       0.82      0.83      0.80       156


Metrics for 'Awareness':
Total 'Awareness' Samples: 55
Correct 'Awareness' Predictions: 55
Precision: 0.80
Recall: 1.00
Accuracy: 0.35


Roberta


In [2]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report
from torch.utils.data import Dataset

# Load dataset
file_path = "dataset_with_topic_labels.csv"
df = pd.read_csv(file_path)

def classify_stage(row):
    overlap = row["topics_overlap"]
    topic = row["topic_label"]
    sentiment = row["sentiment"]
    emotional_tone = row["final_consolidated_emotional_tone"]

    if overlap >= 0.2:
        return "Post-Purchase"
    if 0.1 <= overlap < 0.2:
        if topic in ["Connectivity & Portability", "Quality & Reviews"]:
            return "Consideration"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"
    if overlap < 0.1:
        if topic in ["Design & Usability", "Protection & Packaging"]:
            return "Awareness"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"

    if emotional_tone == "Positive" and sentiment == "Positive":
        return "Post-Purchase"
    elif emotional_tone == "Neutral" or sentiment == "Neutral":
        return "Consideration"
    elif emotional_tone == "Negative" or sentiment == "Negative":
        return "Decision"
    elif emotional_tone == "Mixed":
        return "Awareness"

    return "Awareness"

# Apply classify_stage
df["stage"] = df.apply(classify_stage, axis=1)

# Create label map
label_map = {label: i for i, label in enumerate(df["stage"].unique())}
print("Label Map:", label_map)

# Define custom dataset
class CommentsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_length)
        self.labels = [label_map[label] for label in labels]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["processed_text"], df["stage"], test_size=0.2, random_state=42
)

# Switch to RoBERTa model
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_map))

train_dataset = CommentsDataset(train_texts, train_labels, tokenizer)
val_dataset = CommentsDataset(val_texts, val_labels, tokenizer)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
        "precision": precision_score(labels, preds, average="weighted"),
        "recall": recall_score(labels, preds, average="weighted"),
    }

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_dir="./logs",
    learning_rate=2e-5,
    fp16=False,  # Avoid mixed precision
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Train
print("Starting training...")
trainer.train()

# Evaluate
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

# Extract true and predicted labels
predictions, labels, _ = trainer.predict(val_dataset)
y_pred = predictions.argmax(-1)
y_true = labels

# Classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=label_map.keys()))

# Calculate metrics for "Awareness"
awareness_label = label_map["Awareness"]

# Total number of true "Awareness" samples
total_awareness = sum(y_true == awareness_label)

# Number of correct "Awareness" predictions
correct_awareness = sum((y_true == awareness_label) & (y_pred == awareness_label))

# Calculate precision, recall, and accuracy for "Awareness"
precision_awareness = correct_awareness / sum(y_pred == awareness_label) if sum(y_pred == awareness_label) > 0 else 0
recall_awareness = correct_awareness / total_awareness if total_awareness > 0 else 0
accuracy_awareness = correct_awareness / len(y_true)

print(f"\nMetrics for 'Awareness':")
print(f"Total 'Awareness' Samples: {total_awareness}")
print(f"Correct 'Awareness' Predictions: {correct_awareness}")
print(f"Precision: {precision_awareness:.2f}")
print(f"Recall: {recall_awareness:.2f}")
print(f"Accuracy: {accuracy_awareness:.2f}")


Label Map: {'Decision': 0, 'Post-Purchase': 1, 'Awareness': 2, 'Consideration': 3}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.606414,0.775641,0.694812,0.635058,0.775641
2,No log,0.483426,0.826923,0.793005,0.802831,0.826923
3,No log,0.465179,0.852564,0.829585,0.82708,0.852564


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Evaluation Metrics: {'eval_loss': 0.465178519487381, 'eval_accuracy': 0.8525641025641025, 'eval_f1': 0.8295851722141933, 'eval_precision': 0.8270800410767208, 'eval_recall': 0.8525641025641025, 'eval_runtime': 3.8464, 'eval_samples_per_second': 40.558, 'eval_steps_per_second': 1.3, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:
               precision    recall  f1-score   support

     Decision       0.87      0.99      0.93        70
Post-Purchase       0.75      0.43      0.55        28
    Awareness       0.85      0.95      0.90        55
Consideration       0.00      0.00      0.00         3

     accuracy                           0.85       156
    macro avg       0.62      0.59      0.59       156
 weighted avg       0.83      0.85      0.83       156


Metrics for 'Awareness':
Total 'Awareness' Samples: 55
Correct 'Awareness' Predictions: 52
Precision: 0.85
Recall: 0.95
Accuracy: 0.33


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load dataset
file_path = "dataset_with_topic_labels.csv"
df = pd.read_csv(file_path)

# Classification logic
def classify_stage(row):
    overlap = row["topics_overlap"]
    topic = row["topic_label"]
    sentiment = row["sentiment"]
    emotional_tone = row["final_consolidated_emotional_tone"]

    if overlap >= 0.2:
        return "Post-Purchase"
    if 0.1 <= overlap < 0.2:
        if topic in ["Connectivity & Portability", "Quality & Reviews"]:
            return "Consideration"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"
    if overlap < 0.1:
        if topic in ["Design & Usability", "Protection & Packaging"]:
            return "Awareness"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"

    if emotional_tone == "Positive" and sentiment == "Positive":
        return "Post-Purchase"
    elif emotional_tone == "Neutral" or sentiment == "Neutral":
        return "Consideration"
    elif emotional_tone == "Negative" or sentiment == "Negative":
        return "Decision"
    elif emotional_tone == "Mixed":
        return "Awareness"

    return "Awareness"

# Apply classify_stage
df["stage"] = df.apply(classify_stage, axis=1)

# Create label map
label_map = {label: i for i, label in enumerate(df["stage"].unique())}
print("Label Map:", label_map)

# Convert labels to numeric
df["stage_label"] = df["stage"].map(label_map)

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["processed_text"], df["stage_label"], test_size=0.2, random_state=42
)

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)

# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, train_labels)

# Make predictions
y_pred = rf_model.predict(X_val)

# Evaluation metrics
print("\nClassification Report:")
print(classification_report(val_labels, y_pred, target_names=label_map.keys()))

print("\nConfusion Matrix:")
print(confusion_matrix(val_labels, y_pred))

# Overall metrics
overall_accuracy = accuracy_score(val_labels, y_pred)
overall_precision = precision_score(val_labels, y_pred, average="weighted")
overall_recall = recall_score(val_labels, y_pred, average="weighted")
overall_f1 = f1_score(val_labels, y_pred, average="weighted")

print("\nOverall Metrics:")
print(f"Accuracy: {overall_accuracy:.2f}")
print(f"Precision: {overall_precision:.2f}")
print(f"Recall: {overall_recall:.2f}")
print(f"F1-Score: {overall_f1:.2f}")


Label Map: {'Decision': 0, 'Post-Purchase': 1, 'Awareness': 2, 'Consideration': 3}

Classification Report:
               precision    recall  f1-score   support

     Decision       0.94      0.97      0.96        70
Post-Purchase       0.85      0.61      0.71        28
    Awareness       0.92      1.00      0.96        55
Consideration       0.25      0.33      0.29         3

     accuracy                           0.90       156
    macro avg       0.74      0.73      0.73       156
 weighted avg       0.90      0.90      0.90       156


Confusion Matrix:
[[68  1  1  0]
 [ 4 17  4  3]
 [ 0  0 55  0]
 [ 0  2  0  1]]

Overall Metrics:
Accuracy: 0.90
Precision: 0.90
Recall: 0.90
F1-Score: 0.90


In [4]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load dataset
file_path = "dataset_with_topic_labels.csv"
df = pd.read_csv(file_path)

# Classification logic
def classify_stage(row):
    overlap = row["topics_overlap"]
    topic = row["topic_label"]
    sentiment = row["sentiment"]
    emotional_tone = row["final_consolidated_emotional_tone"]

    if overlap >= 0.2:
        return "Post-Purchase"
    if 0.1 <= overlap < 0.2:
        if topic in ["Connectivity & Portability", "Quality & Reviews"]:
            return "Consideration"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"
    if overlap < 0.1:
        if topic in ["Design & Usability", "Protection & Packaging"]:
            return "Awareness"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"

    if emotional_tone == "Positive" and sentiment == "Positive":
        return "Post-Purchase"
    elif emotional_tone == "Neutral" or sentiment == "Neutral":
        return "Consideration"
    elif emotional_tone == "Negative" or sentiment == "Negative":
        return "Decision"
    elif emotional_tone == "Mixed":
        return "Awareness"

    return "Awareness"

# Apply classify_stage
df["stage"] = df.apply(classify_stage, axis=1)

# Create label map
label_map = {label: i for i, label in enumerate(df["stage"].unique())}
print("Label Map:", label_map)

# Convert labels to numeric
df["stage_label"] = df["stage"].map(label_map)

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["processed_text"], df["stage_label"], test_size=0.2, random_state=42
)

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)

# Train SVM Classifier
svm_model = SVC(kernel="linear", probability=True, random_state=42)
svm_model.fit(X_train, train_labels)

# Make predictions
y_pred = svm_model.predict(X_val)

# Evaluation metrics
print("\nClassification Report:")
print(classification_report(val_labels, y_pred, target_names=label_map.keys()))

print("\nConfusion Matrix:")
print(confusion_matrix(val_labels, y_pred))

# Overall metrics
overall_accuracy = accuracy_score(val_labels, y_pred)
overall_precision = precision_score(val_labels, y_pred, average="weighted")
overall_recall = recall_score(val_labels, y_pred, average="weighted")
overall_f1 = f1_score(val_labels, y_pred, average="weighted")

print("\nOverall Metrics:")
print(f"Accuracy: {overall_accuracy:.2f}")
print(f"Precision: {overall_precision:.2f}")
print(f"Recall: {overall_recall:.2f}")
print(f"F1-Score: {overall_f1:.2f}")


Label Map: {'Decision': 0, 'Post-Purchase': 1, 'Awareness': 2, 'Consideration': 3}

Classification Report:
               precision    recall  f1-score   support

     Decision       0.93      0.97      0.95        70
Post-Purchase       0.72      0.46      0.57        28
    Awareness       0.83      0.98      0.90        55
Consideration       0.00      0.00      0.00         3

     accuracy                           0.87       156
    macro avg       0.62      0.60      0.60       156
 weighted avg       0.84      0.87      0.85       156


Confusion Matrix:
[[68  1  1  0]
 [ 5 13 10  0]
 [ 0  1 54  0]
 [ 0  3  0  0]]

Overall Metrics:
Accuracy: 0.87
Precision: 0.84
Recall: 0.87
F1-Score: 0.85


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure correct device usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
file_path = "dataset_with_topic_labels.csv"  # Replace with your dataset path
df = pd.read_csv(file_path)

# Define the stage classification logic
def classify_stage(row):
    overlap = row["topics_overlap"]
    topic = row["topic_label"]
    sentiment = row["sentiment"]
    emotional_tone = row["final_consolidated_emotional_tone"]

    if overlap >= 0.2:
        return "Post-Purchase"
    if 0.1 <= overlap < 0.2:
        if topic in ["Connectivity & Portability", "Quality & Reviews"]:
            return "Consideration"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"
    if overlap < 0.1:
        if topic in ["Design & Usability", "Protection & Packaging"]:
            return "Awareness"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"

    if emotional_tone == "Positive" and sentiment == "Positive":
        return "Post-Purchase"
    elif emotional_tone == "Neutral" or sentiment == "Neutral":
        return "Consideration"
    elif emotional_tone == "Negative" or sentiment == "Negative":
        return "Decision"
    elif emotional_tone == "Mixed":
        return "Awareness"

    return "Awareness"

# Apply classification logic
df["stage"] = df.apply(classify_stage, axis=1)

# Map labels to numerical values
label_map = {label: i for i, label in enumerate(df["stage"].unique())}
print("Label Map:", label_map)

# Split the dataset
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["processed_text"], df["stage"], test_size=0.2, random_state=42
)

# Convert labels to numerical values
train_labels = train_labels.map(label_map)
val_labels = val_labels.map(label_map)

# Load tokenizer and model
model_name = "meta-llama/Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add a padding token if missing
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("Added padding token '[PAD]' to tokenizer.")

# Load model and resize token embeddings
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(label_map)
).to(device)
model.resize_token_embeddings(len(tokenizer))

# Set `pad_token_id` in the model's configuration
model.config.pad_token_id = tokenizer.pad_token_id

# Preprocess data
def preprocess_data(texts, labels, tokenizer, max_length=256):
    labels = labels.tolist()  # Ensure labels are a list
    encodings = tokenizer(
        list(texts),
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    return {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": torch.tensor(labels, dtype=torch.long),
    }

train_data = preprocess_data(train_texts, train_labels, tokenizer)
val_data = preprocess_data(val_texts, val_labels, tokenizer)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)

# Define evaluation metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
        "precision": precision_score(labels, preds, average="weighted"),
        "recall": recall_score(labels, preds, average="weighted"),
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Set batch size to 1 to avoid issues
    per_device_eval_batch_size=1,
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_dir="./logs",
    learning_rate=2e-5,
    report_to="none",
    fp16=False,  # Avoid mixed precision for simplicity
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Train the model
print("Starting training...")
trainer.train()

# Evaluate the model
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

# Analyze performance for a specific class (e.g., 'Awareness')
y_true = val_labels.tolist()
y_pred = trainer.predict(val_dataset).predictions.argmax(-1)

# Classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=label_map.keys()))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=label_map.keys(), yticklabels=label_map.keys())
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


Using device: cpu
Label Map: {'Decision': 0, 'Post-Purchase': 1, 'Awareness': 2, 'Consideration': 3}
Added padding token '[PAD]' to tokenizer.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Starting training...


Epoch,Training Loss,Validation Loss


In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load dataset
file_path = "dataset_with_topic_labels.csv"
df = pd.read_csv(file_path)

# Classification logic
def classify_stage(row):
    overlap = row["topics_overlap"]
    topic = row["topic_label"]
    sentiment = row["sentiment"]
    emotional_tone = row["final_consolidated_emotional_tone"]

    if overlap >= 0.2:
        return "Post-Purchase"
    if 0.1 <= overlap < 0.2:
        if topic in ["Connectivity & Portability", "Quality & Reviews"]:
            return "Consideration"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"
    if overlap < 0.1:
        if topic in ["Design & Usability", "Protection & Packaging"]:
            return "Awareness"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"

    if emotional_tone == "Positive" and sentiment == "Positive":
        return "Post-Purchase"
    elif emotional_tone == "Neutral" or sentiment == "Neutral":
        return "Consideration"
    elif emotional_tone == "Negative" or sentiment == "Negative":
        return "Decision"
    elif emotional_tone == "Mixed":
        return "Awareness"

    return "Awareness"

# Apply classify_stage
df["stage"] = df.apply(classify_stage, axis=1)

# Create label map
label_map = {label: i for i, label in enumerate(df["stage"].unique())}
print("Label Map:", label_map)

# Convert labels to numeric
df["stage_label"] = df["stage"].map(label_map)

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["processed_text"], df["stage_label"], test_size=0.2, random_state=42
)

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)

# Train Logistic Regression Classifier
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, train_labels)

# Make predictions
y_pred = lr_model.predict(X_val)

# Evaluation metrics
print("\nClassification Report:")
print(classification_report(val_labels, y_pred, target_names=label_map.keys()))

print("\nConfusion Matrix:")
print(confusion_matrix(val_labels, y_pred))

# Overall metrics
overall_accuracy = accuracy_score(val_labels, y_pred)
overall_precision = precision_score(val_labels, y_pred, average="weighted")
overall_recall = recall_score(val_labels, y_pred, average="weighted")
overall_f1 = f1_score(val_labels, y_pred, average="weighted")

print("\nOverall Metrics:")
print(f"Accuracy: {overall_accuracy:.2f}")
print(f"Precision: {overall_precision:.2f}")
print(f"Recall: {overall_recall:.2f}")
print(f"F1-Score: {overall_f1:.2f}")


Label Map: {'Decision': 0, 'Post-Purchase': 1, 'Awareness': 2, 'Consideration': 3}

Classification Report:
               precision    recall  f1-score   support

     Decision       0.93      0.97      0.95        70
Post-Purchase       0.75      0.43      0.55        28
    Awareness       0.82      1.00      0.90        55
Consideration       0.00      0.00      0.00         3

     accuracy                           0.87       156
    macro avg       0.63      0.60      0.60       156
 weighted avg       0.84      0.87      0.84       156


Confusion Matrix:
[[68  1  1  0]
 [ 5 12 11  0]
 [ 0  0 55  0]
 [ 0  3  0  0]]

Overall Metrics:
Accuracy: 0.87
Precision: 0.84
Recall: 0.87
F1-Score: 0.84


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [2]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure correct device usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
file_path = "dataset_with_topic_labels.csv"  # Replace with your dataset path
df = pd.read_csv(file_path)

# Define the stage classification logic
def classify_stage(row):
    overlap = row["topics_overlap"]
    topic = row["topic_label"]
    sentiment = row["sentiment"]
    emotional_tone = row["final_consolidated_emotional_tone"]

    if overlap >= 0.2:
        return "Post-Purchase"
    if 0.1 <= overlap < 0.2:
        if topic in ["Connectivity & Portability", "Quality & Reviews"]:
            return "Consideration"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"
    if overlap < 0.1:
        if topic in ["Design & Usability", "Protection & Packaging"]:
            return "Awareness"
        if topic in ["Performance & Specifications", "Quality & Reviews"]:
            return "Decision"

    if emotional_tone == "Positive" and sentiment == "Positive":
        return "Post-Purchase"
    elif emotional_tone == "Neutral" or sentiment == "Neutral":
        return "Consideration"
    elif emotional_tone == "Negative" or sentiment == "Negative":
        return "Decision"
    elif emotional_tone == "Mixed":
        return "Awareness"

    return "Awareness"

# Apply classification logic
df["stage"] = df.apply(classify_stage, axis=1)

# Map labels to numerical values
label_map = {label: i for i, label in enumerate(df["stage"].unique())}
print("Label Map:", label_map)

# Split the dataset
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["processed_text"], df["stage"], test_size=0.2, random_state=42
)

# Convert labels to numerical values
train_labels = train_labels.map(label_map)
val_labels = val_labels.map(label_map)

# Load tokenizer and model
model_name = "meta-llama/Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add a padding token if missing
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("Added padding token '[PAD]' to tokenizer.")

# Load model and resize token embeddings
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(label_map)
).to(device)
model.resize_token_embeddings(len(tokenizer))

# Set `pad_token_id` in the model's configuration
model.config.pad_token_id = tokenizer.pad_token_id

# Preprocess data
def preprocess_data(texts, labels, tokenizer, max_length=256):
    labels = labels.tolist()  # Ensure labels are a list
    encodings = tokenizer(
        list(texts),
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    return {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": torch.tensor(labels, dtype=torch.long),
    }

train_data = preprocess_data(train_texts, train_labels, tokenizer)
val_data = preprocess_data(val_texts, val_labels, tokenizer)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)

# Define evaluation metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
        "precision": precision_score(labels, preds, average="weighted"),
        "recall": recall_score(labels, preds, average="weighted"),
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Set batch size to 1 to avoid issues
    per_device_eval_batch_size=1,
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_dir="./logs",
    learning_rate=2e-5,
    report_to="none",
    fp16=False,  # Avoid mixed precision for simplicity
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Train the model
print("Starting training...")
trainer.train()

# Evaluate the model
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

# Analyze performance for a specific class (e.g., 'Awareness')
y_true = val_labels.tolist()
y_pred = trainer.predict(val_dataset).predictions.argmax(-1)

# Classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=label_map.keys()))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=label_map.keys(), yticklabels=label_map.keys())
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
operator torchvision::nms does not exist