In [None]:
import torch
from concurrent.futures import ThreadPoolExecutor
from transformers import pipeline
import pandas as pd

In [None]:
# Report whether CUDA can be used and, if so, the name of the device at index 0.
available = torch.cuda.is_available()
print(available)  # True when a CUDA device is usable
if available:
    gpu_name = torch.cuda.get_device_name(0)
    print(gpu_name)

## Zero Shot Classification
Load the dataset

In [None]:
# Read the raw reviews from the CSV in the working directory.
try:
    reviews_df = pd.read_csv("car_reviews.csv")
except FileNotFoundError:
    print("The file 'car_reviews.csv' was not found.")
    # Stop here: without the data `reviews_df` is never bound, and every
    # later cell would otherwise fail with an unrelated NameError.
    raise

Verify column names

In [None]:
# Show the raw column labels, then a short preview of the data.
# Only the first rows are printed: the goal is to check the columns,
# not to dump the whole frame into the notebook output.
print(reviews_df.columns)
print(reviews_df.head())

Clean column names

In [None]:
# Normalise the headers: trim surrounding whitespace first, then lower-case,
# so the 'review' lookup below does not depend on how the CSV spelled it.
stripped_columns = reviews_df.columns.str.strip()
reviews_df.columns = stripped_columns.str.lower()
print(reviews_df.columns)

# Plain Python list of the review texts, in row order.
reviews = reviews_df['review'].to_list()

Initialize the zero-shot-classification pipeline and define the categories

Note: This model has 406 million parameters and is very slow when run on the CPU, so we run the classification on the GPU if one is available.

In [None]:
# Candidate labels the zero-shot classifier chooses between.
categories = [
    "talks about driving experience",
    "talks about features",
    "talks about value for money",
    "talks about issues",
    "other",
]

# Pipeline device index: 0 for GPU, -1 for CPU.
if available:
    device = 0
else:
    device = -1

classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device,
)

# Confirm which device type the pipeline actually ended up on.
print(classifier.device.type)

Apply the classification to each review

In [None]:
# Chunk size: how many reviews are handed to the classifier per call
# (up to 2048 on GPU, up to 128 on CPU). The max(1, ...) floor keeps the
# range() step below valid when there are no reviews; a step of 0 would
# raise ValueError.
if device == 0:  # GPU available
    batch_size = max(1, min(len(reviews), 2048))
else:  # CPU
    batch_size = max(1, min(len(reviews), 128))


def process_batch(batch):
    """Classify one chunk of reviews against ``categories``.

    Args:
        batch: List of review texts to classify.

    Returns:
        A list of classifier results for ``batch``. If the classifier hands
        back a single dict, it is wrapped in a one-element list so callers
        can always ``extend`` with the return value.
    """
    output = classifier(batch, candidate_labels=categories)
    # NOTE(review): some transformers releases return a bare dict instead of
    # a list when given exactly one sequence (e.g. a trailing chunk of size 1).
    # Extending a list with a dict would add its keys, so normalise here.
    if isinstance(output, dict):
        return [output]
    return output


# Process the chunks one after another. The earlier thread pool waited on
# each future immediately after submitting it, so the chunks already ran
# sequentially; the plain loop does the same work without the extra machinery.
results = []
for start in range(0, len(reviews), batch_size):
    results.extend(process_batch(reviews[start:start + batch_size]))

# Keep the first label of each result as the review's category
# (presumably the highest-scoring one - confirm against the pipeline docs).
reviews_df['talks_about'] = [result['labels'][0] for result in results]

Save the classified reviews to a new CSV file

In [None]:
# Write the labelled reviews to disk; index=False leaves out the row index.
output_path = 'intermediate_reviews_classified.csv'
reviews_df.to_csv(output_path, index=False)