In [8]:
import torch
from concurrent.futures import ThreadPoolExecutor
from transformers import pipeline
import pandas as pd

In [9]:
# Ask PyTorch whether a CUDA device can be used; later cells read `available`
# to decide where the models run.
available = torch.cuda.is_available()
print(available)  # True when a CUDA GPU is usable
if available:
    # Device index 0 is the first (here: only inspected) GPU.
    gpu_name = torch.cuda.get_device_name(0)
    print(gpu_name)

True
NVIDIA GeForce RTX 3050 Ti Laptop GPU


## Zero Shot Classification
Load the dataset

In [10]:
# Load the raw reviews from the CSV in the working directory.
# If the file is missing, print the friendly message and then re-raise:
# without the re-raise `reviews_df` would stay undefined and the next cell
# would fail with an unrelated NameError instead of the real cause.
try:
    reviews_df = pd.read_csv("car_reviews.csv")
except FileNotFoundError:
    print("The file 'car_reviews.csv' was not found.")
    raise

Verify column names

In [11]:
# Print the column index, then the frame itself, each on its own line.
print(reviews_df.columns, reviews_df, sep="\n")

Index(['Review'], dtype='object')
                                                 Review
0     Bought 2017 Optima Hybrid in November 17. It w...
1      You get a lot for your money and great perfor...
2      This car is amazing and have no complaints. Y...
3     At 11k now in a lease for 39 months and it onl...
4     I've owned BMW, Lexus, Mercedes-Benz in the la...
...                                                 ...
5954  Kia did a great job with this all new car, buy...
5955   When purchasing the car, I read that the newl...
5956  The new designed Kia Rio is awesome. Much room...
5957  Everyone seems so hyped on having the latest &...
5958   Kia Rio is an excellent car to drive, it's co...

[5959 rows x 1 columns]


Clean column names

In [12]:
# Normalise the headers: trim surrounding whitespace, then lower-case them,
# so the review column can be addressed as 'review' from here on.
normalized_columns = reviews_df.columns.str.strip().str.lower()
reviews_df.columns = normalized_columns
print(reviews_df.columns)

# Plain Python list of the review texts, used as classifier input below.
reviews = reviews_df['review'].tolist()

Index(['review'], dtype='object')


Initialize the zero-shot-classification pipeline and define the categories

Note: This model has 406 million parameters and is very slow on the CPU, so we run the classification on the GPU if one is available.

In [13]:
# Device index handed to the pipeline: 0 selects the GPU, -1 selects the CPU.
if available:
    device = 0
else:
    device = -1

classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device,
)

# Confirm which device type the pipeline actually ended up on.
print(classifier.device.type)

# Candidate labels offered to the zero-shot classifier for every review.
categories = [
    "talks about driving experience",
    "talks about features",
    "talks about value for money",
    "talks about issues",
    "other",
]

cuda


Apply the classification to each review

In [14]:
# Number of reviews handed to the classifier per call: up to 2048 on the GPU,
# up to 128 on the CPU, and never more than the number of reviews available.
# Clamp to at least 1 so an empty review list does not yield a zero step,
# which would make range() below raise ValueError.
max_chunk = 2048 if device == 0 else 128
batch_size = max(1, min(len(reviews), max_chunk))


def process_batch(batch):
    """Classify one chunk of reviews against the candidate categories.

    Args:
        batch: list of review strings.

    Returns:
        Whatever the zero-shot `classifier` returns for the chunk; the loop
        below expects an iterable of per-review results, each exposing a
        'labels' sequence.
    """
    return classifier(batch, candidate_labels=categories)


# Chunks are processed one after another. Submitting each chunk to a thread
# pool and waiting on its result straight away is just as sequential, so the
# classifier is called directly.
results = []
for start in range(0, len(reviews), batch_size):
    results.extend(process_batch(reviews[start:start + batch_size]))

# Keep the first label of each result (the pipeline's top-ranked category).
reviews_df['talks_about'] = [result['labels'][0] for result in results]

## Sentiment Analysis


Load the sentiment analysis model

In [15]:
# Re-derive the device index for this section: 0 for the GPU, -1 for the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=device,
)

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To handle reviews longer than 512 tokens, enable truncation when calling the pipeline

In [16]:
def analyze_sentiment(batch):
    """Run the sentiment pipeline on one chunk of reviews.

    Args:
        batch: list of review strings.

    Returns:
        Whatever `sentiment_classifier` returns for the chunk; the loop below
        expects an iterable of per-review results, each exposing a 'label'.

    `truncation=True` is passed so that over-long reviews are cut down
    instead of being rejected by the model.
    """
    return sentiment_classifier(batch, truncation=True)


# Convert reviews to a list
reviews = reviews_df['review'].tolist()

# Reviews handed to the pipeline per call: smaller chunks on the CPU (-1).
batch_size = 128 if device == -1 else 512

# Chunks are processed one after another. Submitting each chunk to a thread
# pool and waiting on its result straight away is just as sequential, so the
# pipeline is called directly (no mid-notebook re-import needed either).
sentiments = []
for start in range(0, len(reviews), batch_size):
    sentiments.extend(analyze_sentiment(reviews[start:start + batch_size]))

# Add sentiment labels to the DataFrame
reviews_df['sentiment'] = [result['label'] for result in sentiments]


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Save the classified reviews to a new CSV file

In [17]:
# Write the frame, including the columns added above, to a new CSV file
# without the pandas index column.
output_path = 'classified_sentiment_reviews.csv'
reviews_df.to_csv(output_path, index=False)