
Product Review Classification for E-commerce
---



In [None]:
from datasets import load_dataset

# Load the Amazon product-review dataset by its Hugging Face Hub id.
# `ds` is indexed by split name in the cells below (e.g. ds['train']).
ds = load_dataset("m-ric/amazon_product_reviews_datafiniti")


In [None]:
# Show the split layout, the column schema, and a small sample of the training split.
for header, value in (
    ("Dataset structure:\n", ds),
    ("\nFeatures of the training split:\n", ds['train'].features),
    ("\nFirst 5 rows of the training split:\n", ds['train'][:5]),
):
    print(header, value)

Missing values

In [None]:
import pandas as pd

# Work on a pandas copy of the training split.
df_train = ds['train'].to_pandas()

# Null tally per column, and the share of all rows that tally represents.
total_rows = df_train.shape[0]
missing_values_count = df_train.isnull().sum()
missing_values_percentage = missing_values_count.div(total_rows).mul(100)

# One summary row per column, restricted to columns that actually contain nulls.
missing_info = pd.DataFrame(
    {
        'Missing Count': missing_values_count,
        'Missing Percentage': missing_values_percentage,
    }
).loc[lambda summary: summary['Missing Count'] > 0]

print("Missing Values Information (Count and Percentage):\n", missing_info)

In [None]:
# Treat a missing helpful-vote count as zero votes, then confirm no nulls remain.
helpful_col = 'reviews.numHelpful'
df_train[helpful_col] = df_train[helpful_col].fillna(0)

remaining_nulls = df_train[helpful_col].isnull().sum()
print("Missing values in 'reviews.numHelpful' after imputation:", remaining_nulls)

In [None]:
# Column-level overview: non-null counts and memory via info(), then the dtypes alone.
print("DataFrame Info:")
df_train.info()

print("\nDataFrame dtypes:", df_train.dtypes, sep="\n")

In [None]:
# Numeric summary of the helpful-vote counts.
print("Descriptive statistics for 'reviews.numHelpful':\n", df_train['reviews.numHelpful'].describe())

# Frequency table for each categorical column of interest.
for column in ('brand', 'primaryCategories', 'reviews.rating'):
    print(f"\nUnique values and counts for '{column}':\n", df_train[column].value_counts())

In [None]:
category_counts = df_train['primaryCategories'].value_counts()
unique_categories = list(category_counts.index)

# Add one 0/1 indicator column per distinct 'primaryCategories' value.
# The check is a literal substring match (regex=False), so a value such as "A"
# also flags rows whose value is "A,B" -- the same semantics as Python's `in`.
# na=False maps a missing category to 0 instead of raising a TypeError, and the
# vectorized .str accessor avoids a Python-level lambda call per row.
for category in unique_categories:
    df_train[category] = (
        df_train['primaryCategories']
        .str.contains(category, regex=False, na=False)
        .astype(int)
    )

print("First 5 rows with new category columns:")
print(df_train[['primaryCategories'] + unique_categories].head())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many reviews carry each star rating.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(
    data=df_train,
    x='reviews.rating',
    hue='reviews.rating',
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_title('Distribution of Review Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Column-wise sum of the 0/1 indicator columns: how many rows (reviews) carry each
# category. Despite its name, this is a Series indexed by category, largest first.
category_counts_df = df_train[unique_categories].sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(
    x=category_counts_df.index,
    y=category_counts_df.values,
    hue=category_counts_df.index,
    palette='coolwarm',
    legend=False,
    ax=ax,
)
ax.set_title('Distribution of Product Categories')
ax.set_xlabel('Category')
# Each row of df_train is one review, so the bars count reviews, not distinct products.
ax.set_ylabel('Number of Reviews')
plt.xticks(rotation=45, ha='right')  # long category names overlap unless rotated
plt.tight_layout()
plt.show()

In [None]:
from transformers import pipeline

# Build the sentiment pipeline. The labels this checkpoint emits are star ratings
# ('1 star' ... '5 stars'), which a later cell collapses to three sentiment classes.
sentiment_analyzer = pipeline(
    task='sentiment-analysis',
    model='nlptown/bert-base-multilingual-uncased-sentiment',
)

print("Pre-trained multilingual sentiment analysis model loaded successfully.")

In [None]:
from datasets import load_dataset

# Same load call as at the top of the notebook: rebinds `ds` to a freshly loaded
# dataset before the 'cleaned_text' and 'sentiment' columns are added in the next cell.
ds = load_dataset("m-ric/amazon_product_reviews_datafiniti")

In [None]:
import pandas as pd
import re

def clean_text(text):
    """Normalize a review text before sentiment scoring.

    The value is converted to a string, lowercased, stripped of every character
    that is not an ASCII letter, a digit, or whitespace, and has runs of
    whitespace collapsed to a single space with the ends trimmed.

    Args:
        text: The raw review text. Non-string values are converted with str().

    Returns:
        The cleaned string. Missing input (None or a float NaN) yields '' rather
        than the literal words "none" / "nan" that str() would produce, so a
        missing review is never scored as if it were real text.
    """
    # `text != text` is only true for NaN; the isinstance guard keeps the
    # comparison away from types that do not define a plain boolean inequality.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()  # Convert to string and lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse whitespace runs and trim the ends
    return text

def map_rating_to_sentiment(rating):
    """Collapse a 1-5 star rating into a three-class sentiment label.

    Ratings 1 and 2 map to 'negative', 3 to 'neutral', 4 and 5 to 'positive'.
    Any other value (including None) maps to None.
    """
    star_groups = (
        ('negative', (1, 2)),
        ('neutral', (3,)),
        ('positive', (4, 5)),
    )
    for sentiment, stars in star_groups:
        if rating in stars:
            return sentiment
    return None

# Add 'cleaned_text' and 'sentiment' to every split of `ds`. Both columns are
# computed in a single map call per split, so each split is traversed once
# rather than once per column.
for split in ds.keys():
    ds[split] = ds[split].map(
        lambda x: {
            'cleaned_text': clean_text(x['reviews.text']),
            'sentiment': map_rating_to_sentiment(x['reviews.rating']),
        }
    )

# Recreate df_train from the updated ds['train'] to include 'cleaned_text' and 'sentiment'
df_train = ds['train'].to_pandas()

batch_size = 32  # Adjust batch size based on available memory
predicted_sentiments = []

# Score the cleaned reviews in fixed-size batches. range() never produces an
# empty slice and clean_text always returns a str, so no per-batch emptiness
# or None checks are needed here.
for i in range(0, len(df_train), batch_size):
    # .iloc makes the positional (not label-based) slicing explicit.
    batch_texts = df_train['cleaned_text'].iloc[i:i + batch_size].tolist()

    # truncation=True keeps over-long reviews from failing on input length.
    predictions = sentiment_analyzer(batch_texts, truncation=True)
    predicted_sentiments.extend(pred['label'] for pred in predictions)

# Map the model's star ratings to our sentiment labels
def map_model_output_to_sentiment(label):
    """Collapse a star-rating label string into a three-class sentiment label.

    '1 star' and '2 stars' map to 'negative', '3 stars' to 'neutral',
    '4 stars' and '5 stars' to 'positive'. Any other label maps to None.
    """
    star_groups = (
        ('negative', ('1 star', '2 stars')),
        ('neutral', ('3 stars',)),
        ('positive', ('4 stars', '5 stars')),
    )
    return next(
        (sentiment for sentiment, star_labels in star_groups if label in star_labels),
        None,
    )

# Translate every raw model label into a sentiment class, preserving row order.
df_train['predicted_sentiment'] = list(map(map_model_output_to_sentiment, predicted_sentiments))

print("Sentiment predictions generated and mapped to 'predicted_sentiment' column.")
print("First 5 predicted sentiments:")
print(df_train['predicted_sentiment'].head())

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Score only the rows that have both a rating-derived label and a model prediction.
df_eval_actual = df_train.dropna(subset=['sentiment', 'predicted_sentiment'])
y_true = df_eval_actual['sentiment']
y_pred = df_eval_actual['predicted_sentiment']

# Overall fraction of rows whose prediction matches the rating-derived label.
accuracy_actual = accuracy_score(y_true, y_pred)

# Per-class F1 averaged with weights proportional to each class's support.
f1_actual = f1_score(y_true, y_pred, average='weighted')

print(f"Actual Model Accuracy: {accuracy_actual:.4f}")
print(f"Actual Model F1-score (weighted): {f1_actual:.4f}")

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Fixed label order so the matrix rows/columns and the tick labels always agree.
labels = ['positive', 'neutral', 'negative']

# Rows are the rating-derived labels, columns are the model predictions.
cm_actual = confusion_matrix(
    y_true=df_eval_actual['sentiment'],
    y_pred=df_eval_actual['predicted_sentiment'],
    labels=labels,
)

print("Actual Confusion Matrix:")
print(cm_actual)

# Annotated heatmap of the raw counts.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_actual,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_title('Actual Confusion Matrix')
ax.set_xlabel('Predicted Sentiment')
ax.set_ylabel('True Sentiment')
plt.show()



### Data Analysis Key Findings
*   The pre-trained sentiment analysis model generated predictions once the initial dependency and long-sequence handling issues were resolved (over-long inputs are now truncated via `truncation=True`).
*   The model achieved an accuracy of **0.8133** on the `df_train` dataset.
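*   The weighted F1-score for the model was **0.8255**. Because this average is weighted by class support, it mainly reflects performance on the dominant positive class; it does not by itself show good performance on every sentiment class (only 282 of the 627 neutral reviews counted below were classified correctly).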
*   The confusion matrix revealed the following:
    *   **3917** positive reviews were correctly classified as positive.
    *   **282** neutral reviews were correctly classified as neutral.
    *   **681** negative reviews were correctly classified as negative.
*   Significant misclassifications occurred, particularly with positive reviews being misclassified as neutral (475 instances) or negative (173 instances).
*   Neutral reviews showed substantial misclassification, with 170 instances predicted as positive and 175 instances predicted as negative, suggesting the model struggles to distinguish neutral sentiment from positive or negative.

### Insights or Next Steps
*   Further investigation into the misclassified neutral reviews could reveal patterns or specific linguistic cues that the model struggles to interpret, potentially informing fine-tuning strategies or feature engineering.
*   Analyzing the examples where positive sentiment was misclassified as neutral or negative could help refine the sentiment mapping rules or improve the model's ability to discern nuanced positive expressions.
