In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from tqdm import tqdm

# Apply seaborn's "whitegrid" style to the figures drawn below.
# NOTE(review): seaborn documents `set` as an alias of `set_theme`; consider
# switching if the installed seaborn version provides it — confirm first.
sns.set(style="whitegrid")

# -------------------------------
# Paths (change to your paths)
# -------------------------------
reviews_jsonl = "data/Electronics.jsonl"
meta_jsonl = "data/meta_Electronics.jsonl"
reviews_csv = "preprocessed_data/electronics_reviews_cleaned.csv"
meta_csv = "preprocessed_data/electronics_meta_cleaned.csv"
merged_csv = "preprocessed_data/merged_reviews_metadata.csv"

# -------------------------------
# Quick look at raw JSONL files
# -------------------------------


def read_jsonl_head(path, limit=3):
    """Return the first `limit` JSON records of a JSON Lines file.

    The file is read lazily, so only as many lines as needed are parsed.
    Whitespace-only lines are skipped and do not count toward `limit`.

    Args:
        path: Location of the JSONL file.
        limit: Maximum number of records to return.

    Returns:
        A list with at most `limit` decoded JSON values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Explicit UTF-8: the default encoding of open() depends on the locale,
    # which makes non-ASCII text fail or decode wrongly on some platforms.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if len(records) >= limit:
                break
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records


for label, path in (("reviews", reviews_jsonl), ("metadata", meta_jsonl)):
    print(f"\nSample lines from {label} JSONL")
    # Same tolerance as the CSV sections: a missing input is reported and
    # skipped instead of aborting the rest of the exploration.
    if not os.path.exists(path):
        print(f"File not found, skipping: {path}")
        continue
    for obj in read_jsonl_head(path):
        print(json.dumps(obj, indent=2))

# -------------------------------
# Load Cleaned Reviews CSV & Look
# -------------------------------
if os.path.exists(reviews_csv):
    print("\nReading cleaned reviews...")
    df_reviews = pd.read_csv(reviews_csv)
    print(df_reviews.head())
    print(f"Shape: {df_reviews.shape}")

    # Review length distribution (whitespace-separated word count).
    # read_csv loads empty cells as NaN; blank them before converting to str
    # so a missing review counts as 0 words rather than the 1 word of "nan".
    review_text = df_reviews["cleaned_review"].fillna("").astype(str)
    df_reviews["review_length"] = review_text.str.split().str.len()
    plt.figure(figsize=(10, 5))
    sns.histplot(df_reviews["review_length"], bins=50)
    plt.title("Review Length Distribution")
    plt.xlabel("Number of Words")
    plt.ylabel("Count")
    plt.show()

    # Rating distribution
    plt.figure(figsize=(6, 4))
    sns.countplot(x="rating", data=df_reviews)
    plt.title("Rating Distribution")
    plt.show()

    # Sample reviews. The sample size is capped at the row count because
    # sample() raises ValueError when asked for more rows than exist.
    print("\nSample cleaned reviews:")
    n_samples = min(5, len(df_reviews))
    print(df_reviews["cleaned_review"].sample(n_samples, random_state=42).tolist())

# -------------------------------
# Load Cleaned Metadata CSV & Look
# -------------------------------
if os.path.exists(meta_csv):
    print("\nReading cleaned metadata...")
    df_meta = pd.read_csv(meta_csv)
    print(df_meta.head())
    print(f"Shape: {df_meta.shape}")

    # Price distribution. Values that cannot be parsed as numbers become NaN
    # and are dropped before plotting.
    df_meta["price"] = pd.to_numeric(df_meta["price"], errors="coerce")
    plt.figure(figsize=(10, 5))
    sns.histplot(df_meta["price"].dropna(), bins=50)
    plt.title("Price Distribution")
    plt.xlabel("Price")
    plt.ylabel("Count")
    plt.show()

    # Sample metadata. The sample size is capped at the row count because
    # sample() raises ValueError when asked for more rows than exist.
    print("\nSample cleaned metadata:")
    n_samples = min(5, len(df_meta))
    print(df_meta["cleaned_metadata"].sample(n_samples, random_state=42).tolist())

# -------------------------------
# Load Merged Data CSV & Look
# -------------------------------
if os.path.exists(merged_csv):
    print("\nReading merged dataset...")
    df_merged = pd.read_csv(merged_csv)
    print(df_merged.head())
    print(f"Shape: {df_merged.shape}")

    # Numeric view of the price column, coerced the same way as in the
    # metadata section: unparseable values become NaN instead of breaking the
    # plots or the sort. df_merged itself is not modified, so the loaded data
    # keeps its original values.
    price_numeric = pd.to_numeric(df_merged["price"], errors="coerce")

    # Correlation: Rating vs Price
    plt.figure(figsize=(6, 5))
    sns.scatterplot(x=price_numeric, y=df_merged["rating"], alpha=0.3)
    plt.title("Price vs Rating")
    plt.show()

    # Price distribution (after merge)
    plt.figure(figsize=(10, 5))
    sns.histplot(price_numeric.dropna(), bins=50)
    plt.title("Price Distribution (Merged Data)")
    plt.show()

    # Look at extreme cases
    print("\nTop expensive products:")
    # Rank on the numeric prices, then show the matching original rows.
    top_price_rows = price_numeric.sort_values(ascending=False).head(10).index
    print(df_merged.loc[top_price_rows, ["asin", "price", "rating"]])

    print("\nLow-rated products:")
    low_rated = df_merged.loc[
        df_merged["rating"] <= 2, ["asin", "rating", "cleaned_review"]
    ]
    # Cap the sample size: sample() raises ValueError when fewer than the
    # requested number of low-rated rows exist.
    print(low_rated.sample(min(5, len(low_rated)), random_state=42))

# -------------------------------
# Optional sanity checks
# -------------------------------
print("\nChecking for missing values in merged data...")
merged_is_present = os.path.exists(merged_csv)
if merged_is_present:
    # df_merged is only bound when the merged CSV was found and loaded above.
    # Count the missing entries in each column and print the summary.
    missing_per_column = df_merged.isna().sum()
    print(missing_per_column)