In [58]:
import bz2
import os
import shutil

import kagglehub
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# 1. Load the tokenizer and the classifier from one shared checkpoint name
#    (this can take a moment).
checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Loading sentiment model...
Model loaded.


In [60]:
# Download the dataset through kagglehub, then derive the plain-text and
# bz2-compressed training file paths inside the returned directory.
path = kagglehub.dataset_download("bittlingmayer/amazonreviews")
train_file, compressed_file = (
    os.path.join(path, name) for name in ('train.ft.txt', 'train.ft.txt.bz2')
)


In [None]:
# Decompress the training archive once; later runs reuse the extracted file.
if not os.path.exists(train_file):
    # Stream the data in chunks instead of f_in.read(), which would hold the
    # entire decompressed payload in memory at once.
    partial_file = train_file + '.part'
    with bz2.open(compressed_file, 'rb') as f_in, open(partial_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    # Publish under the final name only after a complete write, so an
    # interrupted run cannot leave a truncated file that the exists() guard
    # above would accept on the next run.
    os.replace(partial_file, train_file)

In [None]:
# Sample the first 1000 lines of the training file.
# The line format is: __label__X Review text...
with open(train_file, 'r', encoding='utf-8') as f:
    reviews_data = [
        # Index 9 is the digit right after "__label__"; the text starts at 11.
        {'rating': int(row[9]), 'review': row[11:].strip()}
        # zip() against range(1000) stops after 1000 reviews.
        for _, row in zip(range(1000), f)
    ]

Reading a sample of reviews...


In [63]:
# One row per sampled review, with the columns 'rating' and 'review'.
df = pd.DataFrame(data=reviews_data)

In [64]:
def get_sentiment(text):
    """Predict a 1-5 star rating for ``text`` with the loaded sentiment model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Review text to score. Input is truncated to at most 512 tokens.

    Returns:
        int: Predicted star rating, i.e. the argmax class index plus one.
    """
    # Truncate text to fit the model's max length
    tokens = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: disable autograd so no gradient graph is tracked for
    # each review. The predicted class is unaffected.
    with torch.no_grad():
        result = model(tokens)
    # The model outputs 0-4, so we add 1 for a 1-5 star rating
    return torch.argmax(result.logits).item() + 1

In [65]:
def map_to_binary(score):
    """Collapse a 1-5 star score into the binary label scheme.

    Returns 1 (negative) for scores of 2 or below and 2 (positive) for scores
    of 4 or above. Anything in between (3 stars, neutral) returns 3, a value
    that belongs to neither class, so it always counts as wrong in an
    accuracy check.
    """
    negative, positive, neither = 1, 2, 3
    if score <= 2:
        return negative
    if score >= 4:
        return positive
    return neither

In [None]:
# Score every sampled review and store the predictions in a new
# 'sentiment' column.
review_texts = df['review']
df['sentiment'] = review_texts.apply(get_sentiment)

Analyzing sentiment... (this may take a minute)


In [None]:
# Preview the first five rows (head()'s default count, spelled out).
print(df.head(n=5))


Analysis complete. Here are the first few results:
   rating                                             review  sentiment
0       2  Stuning even for the non-gamer: This sound tra...          5
1       2  The best soundtrack ever to anything.: I'm rea...          5
2       2  Amazing!: This soundtrack is my favorite music...          5
3       2  Excellent Soundtrack: I truly like this soundt...          5
4       2  Remember, Pull Your Jaw Off The Floor After He...          5


In [68]:
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


In [69]:
# Plain-text and bz2-compressed test file paths inside the dataset directory.
test_file, compressed_test_file = [
    os.path.join(path, fname) for fname in ('test.ft.txt', 'test.ft.txt.bz2')
]

In [70]:
# Decompress the test archive once; later runs reuse the extracted file.
if not os.path.exists(test_file):
    print("Decompressing test dataset...")
    # Stream the data in chunks instead of f_in.read(), which would hold the
    # entire decompressed payload in memory at once.
    partial_test_file = test_file + '.part'
    with bz2.open(compressed_test_file, 'rb') as f_in, open(partial_test_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    # Publish under the final name only after a complete write, so an
    # interrupted run cannot leave a truncated file that the exists() guard
    # above would accept on the next run.
    os.replace(partial_test_file, test_file)

In [71]:
print("Reading a sample of test reviews...")
# Using a sample of the first 1000 lines for speed.
with open(test_file, 'r', encoding='utf-8') as f:
    test_data = [
        # True label is 1 (negative) or 2 (positive): the digit at index 9,
        # right after "__label__". The review text starts at index 11.
        {'true_label': int(row[9]), 'review': row[11:].strip()}
        for _, row in zip(range(1000), f)
    ]

test_df = pd.DataFrame(data=test_data)


Reading a sample of test reviews...


In [None]:
print("Running model on test data...")
# Score each test review with get_sentiment and keep the result as its own
# column.
predicted_stars = test_df['review'].apply(get_sentiment)
test_df['predicted_5_star'] = predicted_stars

Running model on test data...


In [73]:
def map_to_binary(score):
    """Map a 1-5 star score to 1 (negative), 2 (positive) or 3 (neither).

    Scores of 2 or below are negative and scores of 4 or above are positive.
    Every other score returns 3. This re-declares the mapping already defined
    earlier in the notebook.
    """
    return 1 if score <= 2 else (2 if score >= 4 else 3)

In [None]:
# Collapse each 5-star prediction into the binary label scheme, element by
# element.
test_df['predicted_binary'] = test_df['predicted_5_star'].map(map_to_binary)

In [None]:
# Share of test reviews whose binary prediction equals the true label.
# Predictions mapped to 3 match neither label, so they count as errors.
accuracy = accuracy_score(
    y_true=test_df['true_label'],
    y_pred=test_df['predicted_binary'],
)
print(f"Model Accuracy on Binary Task: {accuracy:.2%}")


--- Model Evaluation (Corrected) ---
Model Accuracy on Binary Task: 88.50%
