In [5]:
import pandas as pd

# Load the raw reviews export, keep the columns needed for sentiment work,
# and build a single cleaned `full_review` text column.

# Path to the raw CSV, relative to the notebook's working directory.
# Adjust it if your file name is different.
FILE_PATH = "data/1429_1.csv"

# Reviews whose combined title + text is this many characters or fewer are
# dropped: that removes empty reviews as well as very short ones.
MIN_REVIEW_CHARS = 10

df = pd.read_csv(FILE_PATH, low_memory=False)

# Select only the columns we care about
keep_cols = [
    'reviews.rating',
    'reviews.text',
    'reviews.title',
    'name',               # product name
    'reviews.username',
    'reviews.date'
]
df = df[keep_cols].copy()

# Rename for cleaner code
df = df.rename(columns={
    'reviews.rating': 'rating',
    'reviews.text': 'text',
    'reviews.title': 'title',
    'name': 'product_name'
})

print("Shape after selecting columns:", df.shape)
# Header and counts are printed separately: passing both to a single print()
# inserts a separator space that misaligns the first row of the Series.
print("\nMissing values:")
print(df.isnull().sum())

# Handle missing values:
#   - rows without a rating are dropped (the rating is needed for labels)
#   - missing title/text become '' so the two can be concatenated
# Then combine title + text (the title often contains important sentiment
# words) and strip the stray space left when either part is empty.
df = (
    df.dropna(subset=['rating'])
    .assign(
        text=lambda d: d['text'].fillna(''),
        title=lambda d: d['title'].fillna(''),
    )
    .assign(full_review=lambda d: (d['title'] + " " + d['text']).str.strip())
)

# Drop empty and very short reviews. `.copy()` makes the filtered frame
# independent of the unfiltered one, so columns added to it later cannot be
# flagged by pandas as writes to a slice.
df = df[df['full_review'].str.len() > MIN_REVIEW_CHARS].copy()

print("\nShape after cleaning:", df.shape)

# Create sentiment labels (classic 3-class)
def rating_to_sentiment(r):
    """Map a numeric star rating to a 3-class sentiment label.

    Parameters
    ----------
    r : int or float
        Star rating to classify.

    Returns
    -------
    str
        'positive' when ``r >= 4``, 'neutral' when ``3 <= r < 4``,
        'negative' otherwise.

    Notes
    -----
    NaN compares False against both thresholds and therefore falls through
    to 'negative'; drop missing ratings before calling this function.
    """
    if r >= 4:
        return 'positive'
    # A range check rather than ``r == 3``: fractional ratings such as 3.5
    # are neutral instead of falling through to 'negative'.
    if r >= 3:
        return 'neutral'
    return 'negative'

# Label every review, report the class balance, show a sample, and save.

# Where the cleaned frame is written; also used in the confirmation message.
OUTPUT_PATH = "data/cleaned_reviews.csv"

# Widest cell (in characters) shown in the sample printout below.
SAMPLE_MAX_COLWIDTH = 100

# assign() returns a new frame with the label column added instead of
# writing the column into the (previously filtered) frame in place.
df = df.assign(sentiment=df['rating'].apply(rating_to_sentiment))

# Alternative: binary labels (positive vs. everything else).
# Note: this folds 3-star reviews into 'negative'; to really ignore neutral
# reviews, filter out rating == 3 before applying it.
# df['sentiment_binary'] = df['rating'].apply(lambda r: 'positive' if r >= 4 else 'negative')

print("\nSentiment distribution:")
print(df['sentiment'].value_counts(normalize=True) * 100)

print("\nSample cleaned data:")
# Cap the cell width: without it to_string() pads every row to the length of
# the longest review, which makes the printed sample extremely wide.
print(
    df[['rating', 'sentiment', 'full_review']]
    .head(8)
    .to_string(index=False, max_colwidth=SAMPLE_MAX_COLWIDTH)
)

# Save cleaned version for later use
df.to_csv(OUTPUT_PATH, index=False)
print(f"\nCleaned data saved to {OUTPUT_PATH}")

Shape after selecting columns: (34660, 6)

Missing values:
 rating                33
text                   1
title                  6
product_name        6760
reviews.username       7
reviews.date          39
dtype: int64

Shape after cleaning: (34626, 7)

Sentiment distribution:
sentiment
positive    93.325825
neutral      4.329117
negative     2.345059
Name: proportion, dtype: float64

Sample cleaned data:
 rating sentiment                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [4]:
import os

import pandas as pd

# Sanity check: report where the kernel is running and whether the raw CSV
# is reachable from there (the path is relative to the working directory).
csv_path = "data/1429_1.csv"
print("Current working directory:", os.getcwd())
print("Does file exist?", os.path.exists(csv_path))

# Load the full export and show its dimensions and column names.
df = pd.read_csv(csv_path, low_memory=False)
print(df.shape)
print(list(df.columns))

Current working directory: c:\Users\Pranav\sentiment-review-app\notebooks
Does file exist? True
(34660, 21)
['id', 'name', 'asins', 'brand', 'categories', 'keys', 'manufacturer', 'reviews.date', 'reviews.dateAdded', 'reviews.dateSeen', 'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id', 'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs', 'reviews.text', 'reviews.title', 'reviews.userCity', 'reviews.userProvince', 'reviews.username']
