## Notes and next steps

This notebook performs sentiment analysis on Amazon product reviews from `Reviews.csv`.

- It provides both a rule-based sentiment score (VADER) and a transformer-based score (DistilBERT SST-2).
- By default the notebook processes a random sample of `SAMPLE_SIZE` reviews to stay fast. Set `PROCESS_FULL = True` to run on the whole file (this requires more time and memory).
- It writes a CSV file, `reviews_sentiment_results.csv`, containing the original columns plus the sentiment columns.

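To use a different transformer model, change the `HF_MODEL` variable in the setup cell. To use GPU acceleration, change the `device` argument passed to `pipeline(...)` in the transformer cell (`device=-1` selects the CPU).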

In [1]:
# Setup: install required packages (uncomment to run in notebook environment)
# Note: for local execution on Windows PowerShell, run these commands in a terminal if the notebook cannot install packages directly.
# !pip install pandas numpy matplotlib seaborn tqdm nltk scikit-learn transformers sentencepiece torch vaderSentiment

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# NLP imports
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline

# Download NLTK data.
# The outcome of the download is recorded instead of assuming success, so
# `nltk_available` is only True when the data could actually be fetched.
# NOTE(review): relies on nltk.download() returning a falsy value on failure;
# confirm against the installed NLTK version.
try:
    nltk_available = bool(nltk.download('punkt'))
except Exception as exc:  # best-effort: e.g. no network access
    print(f'NLTK data download failed: {exc}')
    nltk_available = False

# Config
# DATA_PATH can be overridden through the REVIEWS_CSV environment variable so
# the notebook can run on other machines; the original location stays the default.
DATA_PATH = os.environ.get(
    'REVIEWS_CSV',
    r"c:\Users\asiqi\OneDrive\Desktop\Collage\SEM_6\AI_Analysics\archive (3)\Reviews.csv",
)
OUTPUT_CSV = 'reviews_sentiment_results.csv'
PROCESS_FULL = False  # Set True to run on entire file (may be slow/high memory)
SAMPLE_SIZE = 2000  # when PROCESS_FULL=False, use this many random reviews
HF_MODEL = 'distilbert-base-uncased-finetuned-sst-2-english'

print('Notebook setup complete')

ModuleNotFoundError: No module named 'nltk'

In [None]:
# 1) Data loading (handles large files by reservoir sampling when PROCESS_FULL=False)
import random

def sample_reviews(path, sample_size=2000, usecols=None, chunksize=10000, seed=42):
    """Reservoir-sample rows from a potentially large CSV without loading it all.

    The file is streamed in chunks of ``chunksize`` rows. The first
    ``sample_size`` rows fill the reservoir; each later row (the ``total``-th
    row seen) draws ``j = randrange(total)`` and replaces slot ``j`` when
    ``j < sample_size``.

    Parameters
    ----------
    path : str
        Location of the CSV file (read as UTF-8).
    sample_size : int
        Maximum number of rows to return. If the file has fewer rows, all of
        them are returned in file order.
    usecols : list of str, optional
        Columns to read; ``None`` reads every column.
    chunksize : int
        Number of rows parsed per chunk.
    seed : int
        Seed for the private random generator, making the sample repeatable.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping the row labels they had in the file. Values
        are read with ``dtype=str``. An empty frame with ``usecols`` as columns
        is returned when no row was kept.
    """
    rng = random.Random(seed)
    reservoir = []  # (row_label, value, value, ...) tuples
    columns = None
    total = 0
    # The context manager closes the file handle even if parsing fails midway.
    with pd.read_csv(path, usecols=usecols, chunksize=chunksize, encoding='utf-8',
                     dtype=str, low_memory=False) as reader:
        for chunk in reader:
            if columns is None:
                columns = chunk.columns
            # itertuples yields plain tuples, avoiding the per-row Series that
            # iterrows would build for every line of a large file.
            for record in chunk.itertuples(index=True, name=None):
                total += 1
                if len(reservoir) < sample_size:
                    reservoir.append(record)
                else:
                    j = rng.randrange(total)
                    if j < sample_size:
                        reservoir[j] = record
    if not reservoir:
        return pd.DataFrame(columns=usecols)
    row_labels = [record[0] for record in reservoir]
    row_values = [record[1:] for record in reservoir]
    return pd.DataFrame(row_values, index=row_labels, columns=columns)

# Columns read from the CSV; any other column in the file is ignored.
USECOLS = [
    'Id',
    'ProductId',
    'UserId',
    'ProfileName',
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'Score',
    'Time',
    'Summary',
    'Text',
]

# Sampling is the default; the full file is only read when PROCESS_FULL is set.
if not PROCESS_FULL:
    print(f'Sampling {SAMPLE_SIZE} reviews from the dataset...')
    df = sample_reviews(DATA_PATH, sample_size=SAMPLE_SIZE, usecols=USECOLS)
else:
    print('Loading full dataset (this may take a long time)...')
    df = pd.read_csv(DATA_PATH, usecols=USECOLS, encoding='utf-8', low_memory=False)

print('Loaded rows:', len(df))

# Show the first rows, or point at the likely misconfiguration when nothing loaded.
if len(df) == 0:
    print('No rows loaded. Check DATA_PATH and USECOLS.')
else:
    display(df.head())

In [None]:
# 2) Preprocessing
import re

def clean_text(s):
    """Return ``s`` as a string with whitespace normalised.

    Values that ``pd.isna`` reports as missing become the empty string. Any
    other value is converted with ``str``; every run of whitespace (newlines
    included) is collapsed to a single space and the ends are stripped.
    """
    if not pd.isna(s):
        collapsed = re.sub(r"\s+", ' ', str(s))
        return collapsed.strip()
    return ''

# create a unified 'review_text' field by combining Summary and Text
df['Summary'] = df['Summary'].fillna('').astype(str)
df['Text'] = df['Text'].fillna('').astype(str)
# Each part is cleaned first and only the non-empty parts are joined with '. '.
# An unconditional separator would leave a lone '.' for reviews with neither a
# summary nor a body, so the empty-row filter below could never remove them,
# and it would also prefix summary-less reviews with '. '.
df['review_text'] = [
    '. '.join(part for part in (clean_text(summary), clean_text(body)) if part)
    for summary, body in zip(df['Summary'], df['Text'])
]

# drop empty text rows
before = len(df)
df = df[df['review_text'].str.len() > 0].reset_index(drop=True)
after = len(df)
print(f'Removed {before-after} empty reviews. Remaining {after} reviews.')

# show sample
if len(df) > 0:
    display(df[['Id','Score','review_text']].head())

# short sanity checks
print('Score value counts:')
print(df['Score'].value_counts().sort_index())

In [None]:
# 3) VADER sentiment (rule-based)
analyzer = SentimentIntensityAnalyzer()

def vader_scores(text):
    """Return the VADER polarity scores for ``text`` as produced by ``analyzer``."""
    return analyzer.polarity_scores(text)

vader_out = df['review_text'].map(vader_scores)

# Naming the score columns explicitly keeps them present even when `df` is empty.
vader_df = pd.DataFrame(list(vader_out), columns=['neg', 'neu', 'pos', 'compound'])
# Remove score columns left by a previous run of this cell; otherwise re-running
# it would append duplicate 'compound' (etc.) columns and break later lookups.
df = df.drop(columns=vader_df.columns, errors='ignore')
df = pd.concat([df.reset_index(drop=True), vader_df.reset_index(drop=True)], axis=1)

# label from compound score

def compound_to_label(c):
    """Translate a compound score into 'positive', 'negative' or 'neutral'.

    Scores of 0.05 or more are positive, scores of -0.05 or less are negative,
    and anything in between is neutral.
    """
    if c >= 0.05:
        return 'positive'
    return 'negative' if c <= -0.05 else 'neutral'

# Attach the categorical label derived from each review's compound score.
df = df.assign(vader_label=df['compound'].map(compound_to_label))
print('VADER labeling complete')
display(df.loc[:, ['Id', 'Score', 'compound', 'vader_label']].head())

In [None]:
# 4) Transformer-based sentiment (Hugging Face pipeline)
print('Loading HF pipeline (this will download model the first time)')
classifier = pipeline('sentiment-analysis', model=HF_MODEL, device=-1)

# batch predictions
batch_size = 32
labels = []
for i in tqdm(range(0, len(df), batch_size)):
    batch_texts = df['review_text'].iloc[i:i+batch_size].tolist()
    # truncation=True clips reviews that exceed the model's maximum input
    # length; without it a single long review makes the whole batch fail.
    out = classifier(batch_texts, truncation=True)
    labels.extend(out)

# classifier returns list of dicts with 'label' and 'score';
# naming the columns keeps them present even when there was nothing to classify
label_df = pd.DataFrame(labels, columns=['label', 'score'])
# map label names to 'positive'/'negative' if necessary
label_df['label'] = label_df['label'].str.lower().map(lambda s: 'positive' if 'pos' in s else ('negative' if 'neg' in s else s))
label_df = label_df.rename(columns={'label': 'hf_label', 'score': 'hf_score'})

# attach to df, first removing columns left by a previous run of this cell so
# that re-running it does not create duplicate 'hf_label'/'hf_score' columns
df = df.drop(columns=['hf_label', 'hf_score'], errors='ignore')
df = pd.concat([df.reset_index(drop=True), label_df.reset_index(drop=True)], axis=1)

print('Transformer sentiment complete')
display(df[['Id','Score','hf_label','hf_score']].head())

In [None]:
# 5) Evaluation & visualization
# Create a simple label from the numeric star Score (1-2 negative, 3 neutral, 4-5 positive)
def score_to_label(s):
    """Convert a star rating into 'negative', 'neutral' or 'positive'.

    Ratings of 2 or less are negative, ratings of 4 or more are positive and
    everything else is neutral. Values that cannot be interpreted as a number
    (including missing values) are treated as neutral.

    Parameters
    ----------
    s : object
        The rating; anything ``float()`` accepts, e.g. ``5``, ``'5'`` or ``5.0``.

    Returns
    -------
    str
        One of ``'negative'``, ``'neutral'`` or ``'positive'``.
    """
    try:
        s = float(s)
    except (TypeError, ValueError, OverflowError):
        return 'neutral'
    # NaN is the only float that differs from itself. Every comparison with it
    # is False, so without this check a missing score would end up 'positive'.
    if s != s:
        return 'neutral'
    if s <= 2:
        return 'negative'
    if s >= 4:
        return 'positive'
    return 'neutral'

if 'Score' in df.columns:
    df['star_label'] = df['Score'].map(score_to_label)

    # confusion counts (only possible when star ratings are available)
    print('VADER vs Stars:')
    print(pd.crosstab(df['star_label'], df['vader_label']))
    print('\nHuggingFace vs Stars:')
    print(pd.crosstab(df['star_label'], df['hf_label']))
else:
    print('No Score column found; skipping comparison against star ratings.')

# simple distribution plots, one panel per model
fig, (ax_vader, ax_hf) = plt.subplots(1, 2, figsize=(10, 4))
sns.countplot(x='vader_label', data=df, ax=ax_vader)
ax_vader.set_title('VADER label counts')

sns.countplot(x='hf_label', data=df, ax=ax_hf)
ax_hf.set_title('HF label counts')
fig.tight_layout()
plt.show()

# show some disagreement examples
# The filter is computed once, and the sample is seeded so the same examples
# are shown on every run.
disagreements = df[df['vader_label'] != df['hf_label']]
mismatch = disagreements.sample(n=min(5, len(disagreements)), random_state=42)
if len(mismatch) > 0:
    display(mismatch[['Id','Score','review_text','vader_label','hf_label','compound','hf_score']])
else:
    print('No disagreements found in sample')

# Save results
save_path = OUTPUT_CSV
print('Saving results to', save_path)
df.to_csv(save_path, index=False)
print('Saved')

In [None]:
# Install VADER Sentiment if not already installed
!pip install vaderSentiment

# Import necessary libraries
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt

# Read the review data for this standalone VADER walkthrough.
# The feedback text is expected in a column named 'review'; if your file uses
# a different column name, substitute it wherever 'review' appears.
# Note: this rebinds `df`, replacing the frame built by the earlier cells.
df = pd.read_csv(filepath_or_buffer='reviews.csv')

# Preview the head of the frame as a sanity check on the load.
print("First few rows of the dataset:", df.head(), sep="\n")

# VADER analyzer instance used by the sentiment helper below.
sia = SentimentIntensityAnalyzer()

# Function to get sentiment scores and classify sentiment
def get_sentiment(text):
    """Score ``text`` with VADER and classify it by its compound score.

    Parameters
    ----------
    text : object
        The review text. Missing values are scored as an empty string and
        other non-string values are converted with ``str``.

    Returns
    -------
    tuple
        ``(label, scores)`` where ``label`` is 'Positive' (compound >= 0.05),
        'Negative' (compound <= -0.05) or 'Neutral', and ``scores`` is the
        dict returned by ``sia.polarity_scores``.
    """
    # Blank CSV cells are read as NaN (a float, not text), so they are scored
    # as empty text instead of being handed to the analyzer unchanged.
    text = '' if pd.isna(text) else str(text)
    scores = sia.polarity_scores(text)
    compound = scores['compound']
    if compound >= 0.05:
        return 'Positive', scores
    elif compound <= -0.05:
        return 'Negative', scores
    else:
        return 'Neutral', scores

# Apply sentiment analysis to each review.
# Each review is scored once and the (label, scores) pairs are unpacked into
# columns directly, instead of building a pandas Series per row and parking
# the score dicts in a temporary column.
sentiment_results = [get_sentiment(review) for review in df['review']]
score_table = pd.DataFrame(
    [scores for _, scores in sentiment_results],
    index=df.index,
    columns=['neg', 'neu', 'pos', 'compound'],
)

df['sentiment'] = [label for label, _ in sentiment_results]
df['negative'] = score_table['neg']
df['neutral'] = score_table['neu']
df['positive'] = score_table['pos']
df['compound'] = score_table['compound']

# Display the updated dataframe with sentiment analysis
print("\nUpdated dataframe with sentiment analysis:")
print(df.head())

# Perform basic analysis
sentiment_counts = df['sentiment'].value_counts()
average_compound = df['compound'].mean()

print("\nSentiment Counts:")
print(sentiment_counts)
print(f"\nAverage Compound Score: {average_compound:.4f}")

# Interpretation of average compound score
if average_compound >= 0.05:
    print("Overall sentiment is Positive.")
elif average_compound <= -0.05:
    print("Overall sentiment is Negative.")
else:
    print("Overall sentiment is Neutral.")

# Visualize the sentiment distribution.
# value_counts() orders the labels by frequency, so colours are looked up per
# label; a fixed colour list would paint whichever label is most common green.
sentiment_colors = {'Positive': 'green', 'Negative': 'red', 'Neutral': 'blue'}
plt.figure(figsize=(8, 6))
sentiment_counts.plot(
    kind='bar',
    color=[sentiment_colors[label] for label in sentiment_counts.index],
)
plt.title('Distribution of Customer Sentiments')
plt.xlabel('Sentiment')
plt.ylabel('Number of Reviews')
plt.xticks(rotation=0)
plt.show()

# Visualize the average scores (column order is fixed, so the colours line up)
avg_scores = df[['negative', 'neutral', 'positive']].mean()
plt.figure(figsize=(8, 6))
avg_scores.plot(kind='bar', color=['red', 'blue', 'green'])
plt.title('Average Sentiment Scores')
plt.xlabel('Score Type')
plt.ylabel('Average Score')
plt.xticks(rotation=0)
plt.ylim(0, 1)
plt.show()