In [2]:
import os
import findspark

# Set Java home
os.environ['JAVA_HOME'] = r"C:\Program Files\Eclipse Adoptium\jdk-11.0.28.6-hotspot"
os.environ['PATH'] = r"C:\Program Files\Eclipse Adoptium\jdk-11.0.28.6-hotspot\bin" + os.pathsep + os.environ['PATH']

# Initialize findspark
findspark.init()

print("✓ Environment configured")


from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, count, avg, min, max, stddev, 
    year, month, dayofweek, length, 
    when, expr, approx_count_distinct,
    percentile_approx, to_timestamp, from_unixtime
)
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

✓ Environment configured


In [3]:
# ============================================================================
# 1. INITIALIZE SPARK SESSION
# ============================================================================

# Configure for local development (adjust memory based on your system)
spark = SparkSession.builder \
    .appName("EcommerceReviewIntelligence") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "8") \
    .getOrCreate()

print(f"✓ Spark Session Created")
print(f"  Version: {spark.version}")
print(f"  Master: {spark.sparkContext.master}")

✓ Spark Session Created
  Version: 3.5.3
  Master: local[*]


In [4]:
# ============================================================================
# 2. LOAD DATA
# ============================================================================

# Path to your sample
data_path = r"C:\Users\shafe\OneDrive\Desktop\ecommerce-intelligence\data\raw\electronics_sample_2M.jsonl.gz"

print(f"\n📂 Loading data from: {data_path}")

# Load compressed JSONL - Spark handles .gz automatically
df = spark.read.json(data_path)

print(f"✓ Data loaded successfully")
print(f"  Partitions: {df.rdd.getNumPartitions()}")



📂 Loading data from: C:\Users\shafe\OneDrive\Desktop\ecommerce-intelligence\data\raw\electronics_sample_2M.jsonl.gz
✓ Data loaded successfully
  Partitions: 1


In [5]:
# ============================================================================
# 3. SCHEMA INSPECTION
# ============================================================================

print("\n" + "="*80)
print("SCHEMA OVERVIEW")
print("="*80)

df.printSchema()

print(f"\nTotal Rows: {df.count():,}")
print(f"Total Columns: {len(df.columns)}")


SCHEMA OVERVIEW
root
 |-- asin: string (nullable = true)
 |-- helpful_vote: long (nullable = true)
 |-- images: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- attachment_type: string (nullable = true)
 |    |    |-- large_image_url: string (nullable = true)
 |    |    |-- medium_image_url: string (nullable = true)
 |    |    |-- small_image_url: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- text: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- title: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- verified_purchase: boolean (nullable = true)


Total Rows: 2,051,569
Total Columns: 10


In [6]:
# ============================================================================
# 4. COLUMN AVAILABILITY CHECK
# ============================================================================

print("\n" + "="*80)
print("COLUMN INVENTORY")
print("="*80)

# Check what columns actually exist in your data
available_columns = df.columns
print(f"Available columns: {available_columns}\n")

# Key columns for your 5 features
required_cols = {
    'rating': 'Sentiment trends',
    'text': 'Theme extraction', 
    'title': 'Additional text analysis',
    'timestamp': 'Temporal analysis',
    'parent_asin': 'Product identification',
    'asin': 'Review/product linking',
    'verified_purchase': 'Quality filtering',
    'helpful_vote': 'Review quality indicator'
}

print("Column Status:")
for col_name, purpose in required_cols.items():
    status = "✓" if col_name in available_columns else "✗"
    print(f"  {status} {col_name:20s} → {purpose}")


COLUMN INVENTORY
Available columns: ['asin', 'helpful_vote', 'images', 'parent_asin', 'rating', 'text', 'timestamp', 'title', 'user_id', 'verified_purchase']

Column Status:
  ✓ rating               → Sentiment trends
  ✓ text                 → Theme extraction
  ✓ title                → Additional text analysis
  ✓ timestamp            → Temporal analysis
  ✓ parent_asin          → Product identification
  ✓ asin                 → Review/product linking
  ✓ verified_purchase    → Quality filtering
  ✓ helpful_vote         → Review quality indicator


In [7]:
# ============================================================================
# 5. DATA QUALITY OVERVIEW
# ============================================================================

print("\n" + "="*80)
print("DATA QUALITY ASSESSMENT")
print("="*80)

# Calculate completeness for each column
print("\nColumn Completeness (% non-null):")
total_rows = df.count()

for col_name in df.columns:
    non_null_count = df.filter(col(col_name).isNotNull()).count()
    completeness = (non_null_count / total_rows) * 100
    print(f"  {col_name:20s}: {completeness:6.2f}% ({non_null_count:,} / {total_rows:,})")


DATA QUALITY ASSESSMENT

Column Completeness (% non-null):
  asin                : 100.00% (2,051,569 / 2,051,569)
  helpful_vote        : 100.00% (2,051,569 / 2,051,569)
  images              : 100.00% (2,051,569 / 2,051,569)
  parent_asin         : 100.00% (2,051,569 / 2,051,569)
  rating              : 100.00% (2,051,569 / 2,051,569)
  text                : 100.00% (2,051,569 / 2,051,569)
  timestamp           : 100.00% (2,051,569 / 2,051,569)
  title               : 100.00% (2,051,569 / 2,051,569)
  user_id             : 100.00% (2,051,569 / 2,051,569)
  verified_purchase   : 100.00% (2,051,569 / 2,051,569)


In [8]:
# ============================================================================
# 6. RATING DISTRIBUTION (Feature 1: Sentiment Trends)
# ============================================================================

print("\n" + "="*80)
print("RATING DISTRIBUTION")
print("="*80)

rating_dist = df.groupBy("rating") \
    .agg(count("*").alias("review_count")) \
    .orderBy("rating") \
    .toPandas()

print("\nRating Breakdown:")
for _, row in rating_dist.iterrows():
    pct = (row['review_count'] / total_rows) * 100
    bar = "█" * int(pct / 2)
    print(f"  {row['rating']:.1f} ⭐: {row['review_count']:8,} ({pct:5.2f}%) {bar}")

# Calculate average rating
avg_rating = df.agg(avg("rating").alias("avg_rating")).collect()[0]['avg_rating']
print(f"\nOverall Average Rating: {avg_rating:.2f} ⭐")



RATING DISTRIBUTION

Rating Breakdown:
  1.0 ⭐: 161,497.0 ( 7.87%) ███
  2.0 ⭐: 84,509.0 ( 4.12%) ██
  3.0 ⭐: 124,124.0 ( 6.05%) ███
  4.0 ⭐: 249,137.0 (12.14%) ██████
  5.0 ⭐: 1,432,302.0 (69.81%) ██████████████████████████████████

Overall Average Rating: 4.32 ⭐


In [9]:
# ============================================================================
# 7. TEXT QUALITY ANALYSIS (Feature 2: Theme Extraction)
# ============================================================================

print("\n" + "="*80)
print("TEXT QUALITY ANALYSIS")
print("="*80)

# Add text length column
df_with_length = df.withColumn("text_length", length(col("text")))

# Text length statistics
text_stats = df_with_length.select(
    count("text").alias("total_reviews"),
    avg("text_length").alias("avg_length"),
    min("text_length").alias("min_length"),
    max("text_length").alias("max_length"),
    stddev("text_length").alias("std_length"),
    percentile_approx("text_length", 0.5).alias("median_length"),
    percentile_approx("text_length", 0.25).alias("p25_length"),
    percentile_approx("text_length", 0.75).alias("p75_length")
).toPandas()

print("\nReview Text Length Statistics (characters):")
print(f"  Total Reviews:  {text_stats['total_reviews'][0]:,}")
print(f"  Average:        {text_stats['avg_length'][0]:.1f}")
print(f"  Median:         {text_stats['median_length'][0]:.0f}")
print(f"  Std Dev:        {text_stats['std_length'][0]:.1f}")
print(f"  Min:            {text_stats['min_length'][0]}")
print(f"  Max:            {text_stats['max_length'][0]:,}")
print(f"  25th %ile:      {text_stats['p25_length'][0]:.0f}")
print(f"  75th %ile:      {text_stats['p75_length'][0]:.0f}")

# Text quality categories
text_quality = df_with_length.withColumn(
    "text_category",
    when(col("text_length") < 50, "Very Short (<50)")
    .when(col("text_length") < 200, "Short (50-200)")
    .when(col("text_length") < 500, "Medium (200-500)")
    .when(col("text_length") < 1000, "Long (500-1000)")
    .otherwise("Very Long (1000+)")
)

print("\nText Length Distribution:")
text_quality.groupBy("text_category") \
    .agg(count("*").alias("count")) \
    .orderBy("count", ascending=False) \
    .show(truncate=False)


TEXT QUALITY ANALYSIS

Review Text Length Statistics (characters):
  Total Reviews:  2,051,569
  Average:        173.2
  Median:         87
  Std Dev:        292.3
  Min:            0
  Max:            22,465
  25th %ile:      35
  75th %ile:      200

Text Length Distribution:
+-----------------+------+
|text_category    |count |
+-----------------+------+
|Short (50-200)   |847789|
|Very Short (<50) |689991|
|Medium (200-500) |370025|
|Long (500-1000)  |106798|
|Very Long (1000+)|36966 |
+-----------------+------+



In [10]:
# ============================================================================
# 8. TEMPORAL ANALYSIS (Feature 5: Review Velocity)
# ============================================================================

print("\n" + "="*80)
print("TEMPORAL DISTRIBUTION")
print("="*80)

# Convert timestamp to date (assuming Unix timestamp in milliseconds)
df_temporal = df.withColumn(
    "review_date",
    to_timestamp(from_unixtime(col("timestamp") / 1000))
).withColumn(
    "review_year",
    year("review_date")
).withColumn(
    "review_month",
    month("review_date")
)

# Year distribution
print("\nReviews by Year:")
df_temporal.groupBy("review_year") \
    .agg(count("*").alias("count")) \
    .orderBy("review_year") \
    .show(20)

# Date range
date_range = df_temporal.select(
    min("review_date").alias("earliest"),
    max("review_date").alias("latest")
).collect()[0]

print(f"\nDate Range:")
print(f"  Earliest: {date_range['earliest']}")
print(f"  Latest:   {date_range['latest']}")


TEMPORAL DISTRIBUTION

Reviews by Year:
+-----------+------+
|review_year| count|
+-----------+------+
|       2007|   602|
|       2008|  4249|
|       2009|   561|
|       2010|   775|
|       2011|  1482|
|       2012|  3595|
|       2013| 10041|
|       2014| 28903|
|       2015| 70234|
|       2016|184147|
|       2017|291110|
|       2018|288758|
|       2019|338299|
|       2020|404252|
|       2021|275646|
|       2022|125247|
|       2023| 23668|
+-----------+------+


Date Range:
  Earliest: 2007-11-19 00:16:47
  Latest:   2023-09-10 18:05:53


In [11]:
# ============================================================================
# 9. PRODUCT ANALYSIS (Feature 3: Competitive Analysis)
# ============================================================================

print("\n" + "="*80)
print("PRODUCT-LEVEL ANALYSIS")
print("="*80)

# Top products by review count
print("\nTop 20 Products by Review Volume:")
product_stats = df.groupBy("parent_asin") \
    .agg(
        count("*").alias("review_count"),
        avg("rating").alias("avg_rating"),
        stddev("rating").alias("rating_stddev")
    ) \
    .orderBy(col("review_count").desc())

product_stats.show(20, truncate=False)

# Product distribution summary
product_summary = df.agg(
    approx_count_distinct("parent_asin").alias("unique_products")
).collect()[0]

print(f"\nProduct Summary:")
print(f"  Unique Products: {product_summary['unique_products']:,}")
print(f"  Avg Reviews/Product: {total_rows / product_summary['unique_products']:.0f}")



PRODUCT-LEVEL ANALYSIS

Top 20 Products by Review Volume:
+-----------+------------+------------------+------------------+
|parent_asin|review_count|avg_rating        |rating_stddev     |
+-----------+------------+------------------+------------------+
|B075X8471B |178239      |4.392057854902687 |1.183661113282277 |
|B07GZFM1ZM |140751      |4.511754801031609 |1.0944273559836395|
|B01K8B8YA8 |119051      |4.346901747990357 |1.1632273628160226|
|B010BWYDYA |103964      |4.143261128852295 |1.2850597477632528|
|B07H65KP63 |95397       |4.626235625858255 |0.9293519769296041|
|B0791TX5P5 |88798       |4.449582197797247 |1.1758164076342243|
|B08XPWDSWW |72566       |4.260438772979081 |1.2705146616240275|
|B07S764D9V |55743       |4.255404266006494 |1.238233047702127 |
|B07KTYJ769 |51867       |4.575818921472227 |1.0733212524663065|
|B0BW4PFM58 |51568       |4.3570819112627985|1.2027619433283017|
|B07HZLHPKP |46254       |4.400614000951269 |1.139376102664279 |
|B07456BG8N |44813       |4.505

In [12]:
# ============================================================================
# 10. VERIFICATION & HELPFULNESS (Quality Indicators)
# ============================================================================

print("\n" + "="*80)
print("REVIEW QUALITY INDICATORS")
print("="*80)

if 'verified_purchase' in df.columns:
    verified_dist = df.groupBy("verified_purchase") \
        .agg(count("*").alias("count")) \
        .toPandas()
    
    print("\nVerified Purchase Distribution:")
    for _, row in verified_dist.iterrows():
        pct = (row['count'] / total_rows) * 100
        print(f"  {str(row['verified_purchase']):5s}: {row['count']:8,} ({pct:5.2f}%)")

if 'helpful_vote' in df.columns:
    helpful_stats = df.select(
        count(when(col("helpful_vote") > 0, 1)).alias("reviews_with_votes"),
        avg("helpful_vote").alias("avg_helpful"),
        max("helpful_vote").alias("max_helpful")
    ).collect()[0]
    
    print("\nHelpfulness Voting:")
    print(f"  Reviews with votes: {helpful_stats['reviews_with_votes']:,}")
    print(f"  Avg helpful votes:  {helpful_stats['avg_helpful']:.2f}")
    print(f"  Max helpful votes:  {helpful_stats['max_helpful']:,}")


REVIEW QUALITY INDICATORS

Verified Purchase Distribution:
  True : 1,976,792 (96.36%)
  False:   74,777 ( 3.64%)

Helpfulness Voting:
  Reviews with votes: 172,585
  Avg helpful votes:  0.69
  Max helpful votes:  46,841


In [13]:
# ============================================================================
# 11. SAMPLE RECORDS
# ============================================================================

print("\n" + "="*80)
print("SAMPLE RECORDS")
print("="*80)

print("\nRandom Sample of Reviews:")
df.select("rating", "title", "text", "verified_purchase") \
    .sample(fraction=0.0001, seed=42) \
    .show(5, truncate=100, vertical=True)


SAMPLE RECORDS

Random Sample of Reviews:
-RECORD 0-----------------------------------------------------------------------------------------------------------------
 rating            | 3.0                                                                                                  
 title             | Good system, but with significant limitations                                                        
 text              | this is a very solid home camera system. I would recommend it to anyone looking to improve their ... 
 verified_purchase | true                                                                                                 
-RECORD 1-----------------------------------------------------------------------------------------------------------------
 rating            | 5.0                                                                                                  
 title             | Glad I bought.  Very happy with these.                                     

In [14]:
# ============================================================================
# 12. KEY INSIGHTS FOR FEATURE DEVELOPMENT
# ============================================================================

print("\n" + "="*80)
print("INSIGHTS FOR YOUR 5 FEATURES")
print("="*80)

print("""
✓ FEATURE 1: SENTIMENT TRENDS OVER TIME
  → Rich temporal data spanning multiple years
  → Rating distribution shows clear patterns
  → Can track sentiment shifts by product/time period
  
✓ FEATURE 2: THEME EXTRACTION  
  → Text lengths vary significantly (good for NLP)
  → Most reviews are substantive (>50 chars)
  → Can use PySpark ML for topic modeling or AWS Bedrock for theme extraction
  
✓ FEATURE 3: COMPETITIVE PRODUCT ANALYSIS
  → 47 high-density products perfect for comparison
  → Mix of product categories (audio, smart home, Amazon devices)
  → Can compare ratings, themes, velocity across competitors
  
✓ FEATURE 4: PRODUCT SUCCESS PREDICTION
  → Can use rating trends, review velocity, sentiment as features
  → Verified purchase flag helps with quality
  → Helpful votes indicate review impact
  
✓ FEATURE 5: REVIEW VELOCITY ANALYSIS
  → Complete timestamp data for time-series analysis
  → Can detect review spikes, seasonal patterns
  → Velocity changes may predict product lifecycle stage

NEXT STEPS:
1. Build PySpark transformation pipeline for each feature
2. Design Redshift schema for aggregated insights
3. Integrate AWS Bedrock for RAG-based natural language queries
4. Create sample visualizations/dashboard
5. Deploy to EMR for production-scale processing
""")


INSIGHTS FOR YOUR 5 FEATURES

✓ FEATURE 1: SENTIMENT TRENDS OVER TIME
  → Rich temporal data spanning multiple years
  → Rating distribution shows clear patterns
  → Can track sentiment shifts by product/time period
  
✓ FEATURE 2: THEME EXTRACTION  
  → Text lengths vary significantly (good for NLP)
  → Most reviews are substantive (>50 chars)
  → Can use PySpark ML for topic modeling or AWS Bedrock for theme extraction
  
✓ FEATURE 3: COMPETITIVE PRODUCT ANALYSIS
  → 47 high-density products perfect for comparison
  → Mix of product categories (audio, smart home, Amazon devices)
  → Can compare ratings, themes, velocity across competitors
  
✓ FEATURE 4: PRODUCT SUCCESS PREDICTION
  → Can use rating trends, review velocity, sentiment as features
  → Verified purchase flag helps with quality
  → Helpful votes indicate review impact
  
✓ FEATURE 5: REVIEW VELOCITY ANALYSIS
  → Complete timestamp data for time-series analysis
  → Can detect review spikes, seasonal patterns
  → Velocity

In [15]:
# ============================================================================
# 13. SAVE EXPLORATION SUMMARY
# ============================================================================

# Cache the DataFrame for faster subsequent operations
df.cache()

print("\n✓ DataFrame cached for subsequent analysis")
print("\n🎯 Ready to build transformation pipeline!")

# Optional: Save exploration stats to file
exploration_stats = {
    'total_rows': total_rows,
    'unique_products': product_summary['unique_products'],
    'avg_rating': avg_rating,
    'date_range': (str(date_range['earliest']), str(date_range['latest']))
}

print(f"\n📊 Exploration Complete!")
print(f"   Total Reviews: {total_rows:,}")
print(f"   Ready for feature engineering in next notebook!")


✓ DataFrame cached for subsequent analysis

🎯 Ready to build transformation pipeline!

📊 Exploration Complete!
   Total Reviews: 2,051,569
   Ready for feature engineering in next notebook!
