In [None]:
import torch
from concurrent.futures import ThreadPoolExecutor
from transformers import pipeline
import pandas as pd
import matplotlib.pyplot as plt

Verify GPU Availability

In [None]:
# Probe for a CUDA device once and reuse the answer below.
available = torch.cuda.is_available()
print(available)  # True when a GPU can be used
if available:
    # Report the model name of the first GPU (index 0).
    print(torch.cuda.get_device_name(0))
# Device index handed to the pipelines: 0 selects the GPU, -1 selects the CPU.
device = -1 if not available else 0

Load the dataset

In [None]:
# Load the raw car reviews; every later cell depends on `reviews_df`.
try:
    reviews_df = pd.read_csv("car_reviews.csv")
except FileNotFoundError:
    print("The file 'car_reviews.csv' was not found.")
    # Re-raise so the run stops here with the real cause instead of failing
    # in a later cell with a NameError on `reviews_df`.
    raise

Verify column names

In [None]:
# Show the column index as loaded (before any clean-up of the names).
print(reviews_df.columns)
# Preview only the first few rows instead of printing the entire frame.
print(reviews_df.head())

Clean column names

In [None]:
# Normalise the headers (trim surrounding whitespace, lowercase) so the
# 'review' column can be looked up under a predictable name.
cleaned_columns = reviews_df.columns.str.strip().str.lower()
reviews_df.columns = cleaned_columns
print(reviews_df.columns)

# Pull the review texts out as a plain Python list for the pipelines.
reviews = reviews_df['review'].tolist()

## Zero Shot Classification

Initialize the zero-shot-classification pipeline and define the categories

Note: This model has 406 million parameters and is very slow when running on the CPU, so we run the classification on the GPU if one is available.

In [None]:
# Build the zero-shot classifier on the device selected earlier
# (`device` is 0 for the GPU, -1 for the CPU).
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device,
)

# Confirm which device type the pipeline ended up on.
print(classifier.device.type)

# Candidate labels the classifier chooses between for every review.
categories = [
    "talks about driving experience",
    "talks about features",
    "talks about value for money",
    "talks about issues",
    "other",
]

Apply the classification to each review

In [None]:
# Number of reviews handed to the classifier per call: up to 1024 on the GPU,
# up to 32 on the CPU. A fixed positive size is used because a size derived
# from len(reviews) becomes 0 for an empty list, and range() rejects a zero
# step with ValueError. Slicing already copes with a short final chunk.
batch_size = 1024 if device == 0 else 32


def process_batch(batch):
    """Classify one chunk of reviews against `categories`.

    Args:
        batch: List of review texts taken from `reviews`.

    Returns:
        A list with one classifier result per review in `batch`.
    """
    output = classifier(batch, candidate_labels=categories)
    # Defensive: if the pipeline hands back a bare dict for a single input,
    # wrap it so callers can always extend a list with the result.
    return [output] if isinstance(output, dict) else output


# Run the chunks one after another. A thread pool adds nothing here: every
# chunk has to finish before its results are appended, so submitting to an
# executor and waiting on the future immediately is sequential anyway.
results = []
for start in range(0, len(reviews), batch_size):
    results.extend(process_batch(reviews[start:start + batch_size]))

# Keep the first label of each result as the review's category.
reviews_df['talks_about'] = [result['labels'][0] for result in results]

## Sentiment Analysis


Load the sentiment analysis model

In [None]:
# Sentiment pipeline, placed on the same device as the zero-shot classifier.
sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=device,
)

To handle reviews longer than 512 tokens, enable truncation when calling the model

In [None]:
def analyze_sentiment(batch):
    """Run the sentiment classifier on one chunk of reviews.

    `truncation=True` is passed so that reviews longer than the model's
    input limit are truncated.

    Args:
        batch: List of review texts.

    Returns:
        The classifier output for `batch`; each entry carries a 'label' key
        that is read below.
    """
    return sentiment_classifier(batch, truncation=True)


# Convert reviews to a list
reviews = reviews_df['review'].tolist()

# Chunk size depends on the device: 128 reviews per call on the CPU, 512 otherwise.
batch_size = 128 if device == -1 else 512

# Run the chunks sequentially. No thread pool is used: each chunk's result is
# needed before moving on, so an executor would only add overhead.
sentiments = []
for start in range(0, len(reviews), batch_size):
    sentiments.extend(analyze_sentiment(reviews[start:start + batch_size]))

# Add sentiment labels to the DataFrame
reviews_df['sentiment'] = [result['label'] for result in sentiments]


Save the classified reviews to a new CSV file

In [None]:
# Persist the frame with its added columns; the row index is left out of the file.
reviews_df.to_csv(
    'classified_sentiment_reviews.csv',
    index=False,
)

# Visualization
Visualization of sentiment spread

In [None]:
# Read the saved results back so this section works from the CSV on disk.
reviews_df = pd.read_csv('classified_sentiment_reviews.csv')

# Bail out with a message when the expected column is missing.
if 'sentiment' not in reviews_df.columns:
    print("The 'sentiment' column does not exist in the CSV file.")
else:
    # Number of reviews per sentiment label.
    sentiment_counts = reviews_df['sentiment'].value_counts()

    # Draw the counts as a bar chart on an explicitly created axes.
    fig, ax = plt.subplots(figsize=(8, 6))
    sentiment_counts.plot(kind='bar', color=['skyblue', 'orange'], ax=ax)
    ax.set_title('Sentiment Spread', fontsize=16)
    ax.set_xlabel('Sentiment', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.tick_params(axis='x', labelrotation=0)  # keep the labels horizontal
    fig.tight_layout()
    plt.show()

Visualize the spread of the review categories

In [None]:
# Plot how many reviews fall into each 'talks_about' category, provided the
# column is present in the loaded frame.
if 'talks_about' in reviews_df.columns:
    # Number of reviews per category.
    talks_about_counts = reviews_df['talks_about'].value_counts()

    # Plot 'talks_about' categories as a bar chart
    plt.figure(figsize=(10, 6))
    # The colours must be passed as one list: writing them as
    # color='purple','skyblue' puts a positional argument after a keyword
    # argument, which is a SyntaxError.
    talks_about_counts.plot(kind='bar', color=['purple', 'skyblue'])
    plt.title('Talks About Categories Spread', fontsize=16)
    plt.xlabel('Talks About Category', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.xticks(rotation=45)  # tilt the tick labels by 45 degrees
    plt.tight_layout()
    plt.show()
else:
    print("The 'talks_about' column does not exist in the CSV file.")