In [4]:
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
import torch

In [5]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [7]:
# Run settings: where the reviews are stored and which column holds the text.
file_path = "data/reviews.csv"
review_column = "Review"

# Build the zero-shot classification pipeline on the selected device.
classifier = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
    device=device,
)

# Read the review dataset into a DataFrame.
df = pd.read_csv(file_path)

In [9]:
# Fail fast if the dataset does not carry the expected review-text column.
has_review_column = review_column in df.columns
if not has_review_column:
    raise ValueError(f"Column '{review_column}' not found in the dataset!")

# Topics the zero-shot classifier chooses between for each review.
candidate_labels = [
    "Talks about driving experience",
    "Talks about features",
    "Talks about value for money",
    "Talks about issues",
    "Other",
]

# Pre-create the result columns as empty strings; they are filled in later.
for new_column in ("talks_about", "sentiment"):
    df[new_column] = ""

In [10]:
# Classification logic
def classify_review(review_text, sentiment_labels=("Positive", "Negative")):
    """Assign a topic and a sentiment to a single review.

    Two independent zero-shot passes are run with the module-level
    ``classifier``: one over ``candidate_labels`` to pick the topic, and one
    over ``sentiment_labels`` to pick the polarity. The sentiment is NOT
    derived from the topic score: that score only says how confident the
    model is about the topic, so thresholding it would mark e.g. a confidently
    detected "Talks about issues" review as "Positive".

    Note that this performs two model calls per review.

    Args:
        review_text: The review to classify. Expected to be a non-blank string.
        sentiment_labels: Labels offered to the classifier for the sentiment
            pass. Defaults to ``("Positive", "Negative")``.

    Returns:
        A ``(category, sentiment)`` tuple holding the top-ranked topic label
        and the top-ranked sentiment label. When ``review_text`` is not a
        string or is blank (for example NaN from an empty CSV cell),
        ``("Other", "")`` is returned without calling the model.
    """
    # Blank or missing text would make the pipeline raise and abort the whole
    # run; report it as unclassifiable instead.
    if not isinstance(review_text, str) or not review_text.strip():
        return "Other", ""

    # Labels come back ranked by score, so index 0 is the best match.
    topic_result = classifier(review_text, candidate_labels, multi_label=False)
    category = topic_result["labels"][0]

    sentiment_result = classifier(
        review_text, list(sentiment_labels), multi_label=False
    )
    sentiment = sentiment_result["labels"][0]

    return category, sentiment

In [11]:
# Apply classification with progress bar
print("Classifying reviews...")
# Iterate over the review column only: `iterrows()` would build a full Series
# for every row (and may coerce dtypes) although a single value is needed.
# Results are written back per row so partial progress stays on `df` if the
# run is interrupted.
for idx, review_text in tqdm(
    df[review_column].items(), total=len(df), desc="Processing"
):
    category, sentiment = classify_review(review_text)
    df.at[idx, "talks_about"] = category
    df.at[idx, "sentiment"] = sentiment

Classifying reviews...


Processing:   0%|          | 0/5925 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing:   0%|          | 10/5925 [00:02<15:10,  6.50it/s] You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing: 100%|██████████| 5925/5925 [11:44<00:00,  8.41it/s]


In [12]:
# Persist the annotated reviews as CSV, leaving the DataFrame index out.
output_file = "data/reviews_with_sentiments_deberta.csv"
df.to_csv(path_or_buf=output_file, index=False)

completion_message = f"Classification complete. Results saved to '{output_file}'."
print(completion_message)

Classification complete. Results saved to 'data/reviews_with_sentiments_deberta.csv'.
