In [2]:
import pandas as pd
import json
import gzip
from pathlib import Path

# Raw inputs: gzip-compressed JSON Lines files, read below one JSON object per line.
# NOTE(review): these are absolute, machine-specific paths -- adjust them when
# running the notebook on another machine.
reviews_file = r"C:\Users\shafe\OneDrive\Desktop\ecommerce-intelligence\data\raw\Electronics.jsonl.gz"
meta_file = r"C:\Users\shafe\OneDrive\Desktop\ecommerce-intelligence\data\raw\meta_Electronics.jsonl.gz"


def _clip(value, limit=100):
    """Return `value` for display, shortening long strings.

    Strings longer than `limit` characters are cut to `limit` characters and
    suffixed with '...'; every other value is returned unchanged.
    """
    if isinstance(value, str) and len(value) > limit:
        return f"{value[:limit]}..."
    return value


# Verify files exist
print("Checking files...")
print(f"Reviews file exists: {Path(reviews_file).exists()}")
print(f"Metadata file exists: {Path(meta_file).exists()}")

# Sizes are the on-disk (compressed) sizes, reported in GiB.
print(f"\nReviews file size: {Path(reviews_file).stat().st_size / (1024**3):.2f} GB")
print(f"Metadata file size: {Path(meta_file).stat().st_size / (1024**3):.2f} GB")

# Load first few lines to understand structure
print("\n" + "="*60)
print("SAMPLE REVIEW:")
print("="*60)

with gzip.open(reviews_file, 'rt', encoding='utf-8') as f:
    # zip() against range(3) stops at whichever side runs out first, so a file
    # holding fewer than three records simply ends the preview early instead of
    # handing an empty string (what readline() yields at EOF) to json.loads.
    for i, line in zip(range(3), f):
        review = json.loads(line)
        print(f"\nReview {i+1}:")
        for key, value in review.items():
            print(f"  {key}: {_clip(value)}")

print("\n" + "="*60)
print("SAMPLE METADATA:")
print("="*60)

with gzip.open(meta_file, 'rt', encoding='utf-8') as f:
    # Same bounded iteration as above, limited to two records.
    for i, line in zip(range(2), f):
        meta = json.loads(line)
        print(f"\nProduct {i+1}:")
        for key, value in meta.items():
            if isinstance(value, (list, dict)):
                # Containers are summarised by type and length rather than dumped.
                print(f"  {key}: {type(value).__name__} with {len(value)} items")
            else:
                print(f"  {key}: {_clip(value)}")

Checking files...
Reviews file exists: True
Metadata file exists: True

Reviews file size: 6.03 GB
Metadata file size: 1.22 GB

SAMPLE REVIEW:

Review 1:
  rating: 3.0
  title: Smells like gasoline! Going back!
  text: First & most offensive: they reek of gasoline so if you are sensitive/allergic to petroleum products...
  images: [{'small_image_url': 'https://m.media-amazon.com/images/I/71YN+Qk3kCL._SL256_.jpg', 'medium_image_url': 'https://m.media-amazon.com/images/I/71YN+Qk3kCL._SL800_.jpg', 'large_image_url': 'https://m.media-amazon.com/images/I/71YN+Qk3kCL._SL1600_.jpg', 'attachment_type': 'IMAGE'}]
  asin: B083NRGZMM
  parent_asin: B083NRGZMM
  user_id: AFKZENTNBQ7A7V7UXW5JJI6UGRYQ
  timestamp: 1658185117948
  helpful_vote: 0
  verified_purchase: True

Review 2:
  rating: 1.0
  title: Didn’t work at all lenses loose/broken.
  text: These didn’t work. Idk if they were damaged in shipping or what, but the lenses were loose or someth...
  images: []
  asin: B07N69T6TM
  parent_asin:

In [3]:
import gzip
import json

def count_lines(path, progress_every=None, label="Records"):
    """Count the lines of a gzip-compressed text file by streaming through it.

    Args:
        path: Path of the gzip file; it is opened in text mode as UTF-8.
        progress_every: When set, a progress line is printed each time the
            running count reaches a multiple of this value. No progress is
            printed when it is None.
        label: Prefix used in the progress lines.

    Returns:
        The number of lines read (0 for an empty file).
    """
    total = 0
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for total, _ in enumerate(f, start=1):
            if progress_every and total % progress_every == 0:
                print(f"  {label}: {total / 1000000:.1f}M counted...")
    return total


print("Counting total records (this will take a few minutes)...\n")

# Count reviews (progress line every million records)
review_count = count_lines(reviews_file, progress_every=1000000, label="Reviews")

print(f"\n✓ Total reviews: {review_count:,}")

# Count metadata (no progress output)
meta_count = count_lines(meta_file)

print(f"✓ Total products: {meta_count:,}")

# Summary in millions
# NOTE(review): the original comment says these figures match the dataset
# website's stats -- confirm against the published numbers.
print(f"\nDataset: Electronics")
print(f"  - {review_count / 1000000:.1f}M reviews")
print(f"  - {meta_count / 1000000:.2f}M products")

Counting total records (this will take a few minutes)...

  Reviews: 1.0M counted...
  Reviews: 2.0M counted...
  Reviews: 3.0M counted...
  Reviews: 4.0M counted...
  Reviews: 5.0M counted...
  Reviews: 6.0M counted...
  Reviews: 7.0M counted...
  Reviews: 8.0M counted...
  Reviews: 9.0M counted...
  Reviews: 10.0M counted...
  Reviews: 11.0M counted...
  Reviews: 12.0M counted...
  Reviews: 13.0M counted...
  Reviews: 14.0M counted...
  Reviews: 15.0M counted...
  Reviews: 16.0M counted...
  Reviews: 17.0M counted...
  Reviews: 18.0M counted...
  Reviews: 19.0M counted...
  Reviews: 20.0M counted...
  Reviews: 21.0M counted...
  Reviews: 22.0M counted...
  Reviews: 23.0M counted...
  Reviews: 24.0M counted...
  Reviews: 25.0M counted...
  Reviews: 26.0M counted...
  Reviews: 27.0M counted...
  Reviews: 28.0M counted...
  Reviews: 29.0M counted...
  Reviews: 30.0M counted...
  Reviews: 31.0M counted...
  Reviews: 32.0M counted...
  Reviews: 33.0M counted...
  Reviews: 34.0M counted...

In [4]:
import gzip
import json
import pandas as pd

print("Loading metadata to analyze product distribution...\n")

# Parse every metadata line into a dict and keep them all in one list
# (about 1.6M products; expected to fit in memory).
products = []

with gzip.open(meta_file, 'rt', encoding='utf-8') as meta_stream:
    for line_no, raw_line in enumerate(meta_stream):
        # Progress heartbeat every 100,000 lines, skipping line 0.
        if line_no > 0 and line_no % 100000 == 0:
            print(f"  Loaded {line_no:,} products...")
        products.append(json.loads(raw_line))

print(f"\n✓ Loaded {len(products):,} products")

# One row per parsed record; columns come from the record keys.
df_meta = pd.DataFrame(products)

print("\nMetadata columns:")
print(df_meta.columns.tolist())

# Summary statistics over the rating_number column.
rating_numbers = df_meta['rating_number']
print("\nBasic stats:")
print(f"  Products with ratings: {rating_numbers.notna().sum():,}")
print(f"  Avg reviews per product: {rating_numbers.mean():.1f}")
print(f"  Max reviews on one product: {rating_numbers.max():,.0f}")

print("\nTop 10 most-reviewed products:")
summary_columns = ['title', 'rating_number', 'average_rating', 'store']
print(df_meta.nlargest(10, 'rating_number')[summary_columns])

Loading metadata to analyze product distribution...

  Loaded 100,000 products...
  Loaded 200,000 products...
  Loaded 300,000 products...
  Loaded 400,000 products...
  Loaded 500,000 products...
  Loaded 600,000 products...
  Loaded 700,000 products...
  Loaded 800,000 products...
  Loaded 900,000 products...
  Loaded 1,000,000 products...
  Loaded 1,100,000 products...
  Loaded 1,200,000 products...
  Loaded 1,300,000 products...
  Loaded 1,400,000 products...
  Loaded 1,500,000 products...
  Loaded 1,600,000 products...

✓ Loaded 1,610,012 products

Metadata columns:
['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author']

Basic stats:
  Products with ratings: 1,610,012
  Avg reviews per product: 180.5
  Max reviews on one product: 1,034,896

Top 10 most-reviewed products:
                                                     title 

In [5]:
# Analyze category distribution
print("Analyzing categories...\n")


def _first_category(cats):
    """Return the first element of a non-empty list, or 'Unknown' for anything else."""
    if isinstance(cats, list) and len(cats) > 0:
        return cats[0]
    return 'Unknown'


# High-density subset: rows whose rating_number is at least 10,000.
# Copied so it is independent of df_meta.
is_high_density = df_meta['rating_number'] >= 10000
high_density = df_meta.loc[is_high_density].copy()

print(f"Products with 10K+ reviews: {len(high_density):,}")
print(f"Total reviews in these products: {high_density['rating_number'].sum():,.0f}")

# Tally the first entry of each product's categories list.
print("\nTop categories by number of high-density products:")
category_counts = high_density['categories'].apply(_first_category).value_counts()
print(category_counts.head(20))

# Tally products per store value.
print("\nTop stores/brands by number of high-density products:")
store_counts = high_density['store'].value_counts()
print(store_counts.head(20))

# Drop rows whose store contains "Amazon" (case-insensitive); rows with a
# missing store count as non-matching and are therefore kept.
print("\nTop non-Amazon products (competitors):")
is_amazon_store = high_density['store'].str.contains('Amazon', case=False, na=False)
non_amazon = high_density[~is_amazon_store]
display_columns = ['title', 'rating_number', 'average_rating', 'store']
print(non_amazon.nlargest(15, 'rating_number')[display_columns])

Analyzing categories...

Products with 10K+ reviews: 3,365
Total reviews in these products: 100,403,779

Top categories by number of high-density products:
categories
Electronics                     2890
Unknown                          461
Amazon Devices & Accessories      11
Car & Vehicle Electronics          3
Name: count, dtype: int64

Top stores/brands by number of high-density products:
store
Amazon            258
Amazon Basics     217
Apple              87
Logitech           87
SAMSUNG            80
SanDisk            77
TP-Link            48
ASURION            44
Ring               41
UGREEN             40
Amazon Renewed     40
Anker              32
JBL                32
JETech             31
Kingston           30
Sony               28
Welltin            26
CTYBB              26
ProCase            23
Rankie             22
Name: count, dtype: int64

Top non-Amazon products (competitors):
                                                     title  rating_number  \
1090452  Apple 

In [6]:
# Build the candidate product list for the demo sample
# (Option B: multi-category sample).

print("Building target product list...\n")

# Keep rows whose rating_number is at least 50,000; copy so that columns added
# to target_products later do not touch df_meta.
has_50k_ratings = df_meta['rating_number'] >= 50000
target_products = df_meta.loc[has_50k_ratings].copy()

n_targets = len(target_products)
total_ratings = target_products['rating_number'].sum()
print(f"Products with 50K+ reviews: {n_targets:,}")
print(f"Total reviews: {total_ratings:,.0f}")

# Let's focus on specific competitive categories
# Extract category information better

def extract_main_category(cats):
    """Return the last entry of a category list, or 'Unknown'.

    Args:
        cats: Expected to be a list of category names. Any value that is not
            a list, and any empty list, is treated as having no category.

    Returns:
        The final element of `cats` (also the only element of a one-item
        list), or the string 'Unknown' when there is nothing to pick.
    """
    # Guard clause: reject non-lists and empty lists first.
    if not isinstance(cats, list) or not cats:
        return 'Unknown'
    # The last entry is taken as the most specific category.
    return cats[-1]

# Label each target product with the category chosen by extract_main_category.
target_products['main_cat'] = target_products['categories'].apply(extract_main_category)

# Number of target products per label, most frequent first.
main_cat_counts = target_products['main_cat'].value_counts()

print("\nCategory breakdown:")
print(main_cat_counts.head(20))

# Show top products by category
banner = "="*60
print("\n" + banner)
print("TOP PRODUCTS BY CATEGORY:")
print(banner)

# For each of the five most frequent labels, list its five products with the
# highest rating_number.
for category in main_cat_counts.head(5).index:
    print(f"\n{category}:")
    in_category = target_products['main_cat'] == category
    cat_products = target_products[in_category].nlargest(5, 'rating_number')
    listing = cat_products[['title', 'rating_number', 'store']]
    for title, n_ratings, store in listing.itertuples(index=False, name=None):
        print(f"  • {title[:60]}... ({n_ratings:,} reviews, {store})")

Building target product list...

Products with 50K+ reviews: 380
Total reviews: 42,953,800

Category breakdown:
main_cat
Unknown                        100
Lightning Cables                58
Micro SD Cards                  18
Earbud Headphones               14
Cases                           13
USB Cables                      12
USB Flash Drives                 9
Mice                             9
Portable Bluetooth Speakers      8
Streaming Media Players          7
Repeaters                        6
Internal Solid State Drives      6
External Hard Drives             6
TV Wall & Ceiling Mounts         5
SD Cards                         5
Over-Ear Headphones              5
HDMI Cables                      4
Sleeves                          4
Routers                          4
USB-to-USB Adapters              3
Name: count, dtype: int64

TOP PRODUCTS BY CATEGORY:

Unknown:
  • Echo Dot (3rd Gen, 2018 release) - Smart speaker with Alexa ... (1,034,896 reviews, Amazon)
  • Fire TV Stick 4K

In [7]:
# Tally how many review records the review file actually holds per parent_asin.
print("Counting actual reviews for top products...")
print("(This will take a few minutes scanning 43.9M reviews)\n")

from collections import defaultdict

# parent_asin -> number of review lines seen for it.
review_counts = defaultdict(int)

with gzip.open(reviews_file, 'rt', encoding='utf-8') as review_stream:
    for line_no, raw_line in enumerate(review_stream):
        # Progress heartbeat every 5M lines, skipping line 0.
        if line_no > 0 and line_no % 5000000 == 0:
            print(f"  Scanned {line_no / 1000000:.1f}M reviews...")

        parent_asin = json.loads(raw_line).get('parent_asin')
        # Records with a missing or empty parent_asin are not counted.
        if not parent_asin:
            continue
        review_counts[parent_asin] += 1

print(f"\n✓ Finished scanning reviews")
print(f"Unique products with reviews: {len(review_counts):,}")

# Rank (asin, count) pairs by count, highest first, and keep the first 20.
ranked_counts = sorted(review_counts.items(), key=lambda pair: pair[1], reverse=True)
top_products = ranked_counts[:20]

print("\nTop 20 products by ACTUAL review count:")
for asin, count in top_products:
    # Find the metadata rows for this ASIN; fall back to the bare ASIN when none exist.
    matches = df_meta.loc[df_meta['parent_asin'] == asin]
    if len(matches) == 0:
        print(f"  {count:,} reviews - ASIN: {asin}")
        continue
    first_match = matches.iloc[0]
    title = first_match['title'][:60]
    store = first_match['store']
    print(f"  {count:,} reviews - {title}... ({store})")

Counting actual reviews for top products...
(This will take a few minutes scanning 43.9M reviews)

  Scanned 5.0M reviews...
  Scanned 10.0M reviews...
  Scanned 15.0M reviews...
  Scanned 20.0M reviews...
  Scanned 25.0M reviews...
  Scanned 30.0M reviews...
  Scanned 35.0M reviews...
  Scanned 40.0M reviews...

✓ Finished scanning reviews
Unique products with reviews: 1,609,860

Top 20 products by ACTUAL review count:
  178,239 reviews - Fire TV Stick with Alexa Voice Remote, streaming media playe... (Amazon)
  140,751 reviews - Fire TV Stick 4K streaming device with Alexa Voice Remote (i... (Amazon)
  119,051 reviews - Echo Dot (2nd Generation) - Smart speaker with Alexa - White... (Amazon)
  103,964 reviews - Fire Tablet with Alexa, 7" Display, 16 GB, Blue - with Speci... (Amazon)
  95,397 reviews - Echo Dot (3rd Gen, 2018 release) - Smart speaker with Alexa ... (Amazon)
  88,798 reviews - Fire TV Stick streaming device with Alexa built in, includes... (Amazon)
  72,566 reviews - T

In [8]:
# Build the target product list from the actual per-product review counts.

# One record per (parent_asin, count) pair from the review_counts mapping.
count_records = [
    dict(parent_asin=asin, actual_reviews=count)
    for asin, count in review_counts.items()
]
review_counts_df = pd.DataFrame(count_records)

# Inner join: keep only products present in both the metadata and the counts.
products_with_counts = pd.merge(
    df_meta,
    review_counts_df,
    how='inner',
    on='parent_asin',
)

# Keep products with at least 20,000 actual reviews (high density).
has_20k_reviews = products_with_counts['actual_reviews'] >= 20000
target_products = products_with_counts.loc[has_20k_reviews].copy()

print(f"Products with 20K+ actual reviews: {len(target_products)}")
print(f"Total actual reviews: {target_products['actual_reviews'].sum():,}\n")

# List up to 50 products, largest actual review count first.
rule = "="*80
print("Top 50 products for our demo sample:")
print(rule)

top_50 = target_products.nlargest(50, 'actual_reviews')

listing = top_50[['actual_reviews', 'title', 'store']]
for n_reviews, title, store in listing.itertuples(index=False, name=None):
    print(f"{n_reviews:>7,} reviews - {title[:55]:55} ({store})")

print(rule)
print(f"\nTotal reviews in top 50: {top_50['actual_reviews'].sum():,}")

# Keep the selected ASINs in a plain list (held in memory only; nothing is
# written to disk here).
top_50_asins = top_50['parent_asin'].tolist()
print(f"\nSaving list of {len(top_50_asins)} product ASINs for sampling...")

Products with 20K+ actual reviews: 47
Total actual reviews: 2,051,569

Top 50 products for our demo sample:
178,239 reviews - Fire TV Stick with Alexa Voice Remote, streaming media  (Amazon)
140,751 reviews - Fire TV Stick 4K streaming device with Alexa Voice Remo (Amazon)
119,051 reviews - Echo Dot (2nd Generation) - Smart speaker with Alexa -  (Amazon)
103,964 reviews - Fire Tablet with Alexa, 7" Display, 16 GB, Blue - with  (Amazon)
 95,397 reviews - Echo Dot (3rd Gen, 2018 release) - Smart speaker with A (Amazon)
 88,798 reviews - Fire TV Stick streaming device with Alexa built in, inc (Amazon)
 72,566 reviews - TOZO T10 Bluetooth 5.3 Wireless Earbuds with Wireless C (TOZO)
 55,743 reviews - Panasonic ErgoFit Wired Earbuds, In-Ear Headphones with (Panasonic)
 51,867 reviews - Amazon Smart Plug, for home automation, Works with Alex (Amazon)
 51,568 reviews - OontZ Angle 3 Bluetooth Speaker, Portable Wireless Blue (Cambridge Soundworks)
 46,254 reviews - Echo Show 5 (1st Gen, 2019 re

In [9]:
# Extract all reviews for our top 47 products
import gzip
import json

print(f"Extracting reviews for {len(top_50_asins)} products...")
print(f"Target ASINs: {top_50_asins[:5]}... (and {len(top_50_asins)-5} more)\n")

# Create output file
output_file = r"C:\Users\shafe\OneDrive\Desktop\ecommerce-intelligence\data\raw\electronics_sample_2M.jsonl.gz"

# The membership test below runs once per input line, so use a hash-based set
# instead of scanning the ASIN list for every review that does not match.
target_asins = frozenset(top_50_asins)

extracted_count = 0
total_scanned = 0

# Stream the full review file and copy matching lines verbatim to the output.
# Both handles share one `with`, so they are closed together on any exit.
with gzip.open(reviews_file, 'rt', encoding='utf-8') as infile, \
        gzip.open(output_file, 'wt', encoding='utf-8') as outfile:
    for line in infile:
        total_scanned += 1

        # Progress heartbeat every 5M scanned lines.
        if total_scanned % 5000000 == 0:
            print(f"  Scanned {total_scanned/1000000:.1f}M reviews, extracted {extracted_count:,}...")

        review = json.loads(line)
        if review.get('parent_asin') in target_asins:
            # Write the original line, not a re-serialised record.
            outfile.write(line)
            extracted_count += 1

print(f"\n✓ Extraction complete!")
print(f"  Scanned: {total_scanned:,} total reviews")
print(f"  Extracted: {extracted_count:,} reviews for our sample")
print(f"  Output: {output_file}")
print(f"  File size: {Path(output_file).stat().st_size / (1024**2):.1f} MB")

Extracting reviews for 47 products...
Target ASINs: ['B075X8471B', 'B07GZFM1ZM', 'B01K8B8YA8', 'B010BWYDYA', 'B07H65KP63']... (and 42 more)

  Scanned 5.0M reviews, extracted 246,881...
  Scanned 10.0M reviews, extracted 506,470...
  Scanned 15.0M reviews, extracted 747,912...
  Scanned 20.0M reviews, extracted 1,016,386...
  Scanned 25.0M reviews, extracted 1,244,656...
  Scanned 30.0M reviews, extracted 1,478,314...
  Scanned 35.0M reviews, extracted 1,694,538...
  Scanned 40.0M reviews, extracted 1,893,535...

✓ Extraction complete!
  Scanned: 43,886,944 total reviews
  Extracted: 2,051,569 reviews for our sample
  Output: C:\Users\shafe\OneDrive\Desktop\ecommerce-intelligence\data\raw\electronics_sample_2M.jsonl.gz
  File size: 237.2 MB
