In [33]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

# Load the data
file_path = "data/reviews.csv"
df = pd.read_csv(file_path)

# Correct column name
review_column = "Review"

In [34]:
# Load the model and tokenizer
bart_model_name = "facebook/bart-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(bart_model_name)
model = AutoModelForSequenceClassification.from_pretrained(bart_model_name)

# Define categories and sentiment
categories = [
    "Talks about driving experience", 
    "Talks about features", 
    "Talks about value for money", 
    "Talks about issues", 
    "Other"
]
sentiments = ["Positive", "Negative"]

In [40]:
# Define the device to CUDA & display it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')

# Print out all the available gpus from the system
print(f'Available GPUs: {torch.cuda.device_count()}')

Device: cuda
Available GPUs: 1


In [35]:
# Load the model and tokenizer
bart_model_name = "facebook/bart-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(bart_model_name)
model = AutoModelForSequenceClassification.from_pretrained(bart_model_name)

# Define categories and sentiment
categories = [
    "Talks about driving experience", 
    "Talks about features", 
    "Talks about value for money", 
    "Talks about issues", 
    "Other"
]
sentiments = ["Positive", "Negative"]

# Function to classify a review
def classify_review(review_text):
    inputs = tokenizer(review_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    category_idx = logits.softmax(dim=1).argmax().item()
    sentiment_idx = category_idx % len(sentiments)  # Alternate positive/negative
    return categories[category_idx], sentiments[sentiment_idx]

In [37]:
print(df.columns)

Index(['Review'], dtype='object')


In [38]:
# Add predictions to the DataFrame
df["talks_about"] = ""
df["sentiment"] = ""

# Add a progress bar using tqdm
print("Classifying reviews...")
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
    category, sentiment = classify_review(row[review_column])
    df.at[idx, "talks_about"] = category
    df.at[idx, "sentiment"] = sentiment

Classifying reviews...


Processing: 100%|██████████| 5925/5925 [58:16<00:00,  1.69it/s]  


In [39]:
# Save the updated DataFrame
output_file = "data/reviews_with_sentiments.csv"
df.to_csv(output_file, index=False)
print(f"Classification complete. Results saved to '{output_file}'.")

Classification complete. Results saved to 'data/reviews_with_sentiments.csv'.
