# Testing PySpark Analyzer Locally

This notebook demonstrates how to test the `pyspark-analyzer` package locally with Spark running in local mode.

## Prerequisites
- Java 17+ installed (required for PySpark)
- Python 3.8+
- The pyspark-analyzer package installed in development mode

## 1. Setup Java Environment and Imports

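First, we need to ensure Java is properly configured for PySpark. The following cell detects and configures Java 17 if `JAVA_HOME` is not already set. Auto-detection only covers Homebrew installs and the macOS `/usr/libexec/java_home` helper; on other platforms, set `JAVA_HOME` yourself before running the cell.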

In [None]:
# Setup Java environment for PySpark
import os
import sys
import subprocess

# Function to detect and set Java home
def _detect_java17_home():
    """Return the path of a Java 17 installation, or ``None`` if none is found.

    Checks the Homebrew install locations first, then asks the macOS
    ``/usr/libexec/java_home`` helper. Prints a diagnostic when detection fails.
    """
    candidates = (
        "/opt/homebrew/opt/openjdk@17",  # Apple Silicon Macs
        "/usr/local/opt/openjdk@17",  # Intel Macs
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate

    # Fall back to the macOS java_home helper.
    try:
        result = subprocess.run(
            ["/usr/libexec/java_home", "-v", "17"],
            capture_output=True,
            text=True,
            check=False,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # OSError covers a missing or non-executable helper; ValueError covers
        # undecodable helper output (UnicodeDecodeError).
        print(f"❌ Error finding Java: {e}")
        return None

    java_home = result.stdout.strip()
    # An empty path is not a usable JAVA_HOME even if the helper exited with 0.
    if result.returncode == 0 and java_home:
        return java_home

    print("❌ Could not find Java 17. Please install it using: brew install openjdk@17")
    return None


def setup_java_for_notebook():
    """Setup Java environment for PySpark in Jupyter notebook.

    Leaves an existing ``JAVA_HOME`` untouched; otherwise tries to locate
    Java 17 and, when found, exports ``JAVA_HOME`` and prepends its ``bin``
    directory to ``PATH``.

    The remaining settings (``SPARK_LOCAL_IP``, ``PYSPARK_PYTHON``,
    ``PYSPARK_DRIVER_PYTHON``, ``SPARK_SUBMIT_OPTS``) are applied on every
    call, including when ``JAVA_HOME`` was already configured. Values that
    are already present in the environment are kept.
    """
    java_home = os.environ.get("JAVA_HOME")
    if java_home:
        # Do not return here: the Spark/Python settings below are still needed.
        print(f"✅ Java already configured at: {java_home}")
    else:
        java_home = _detect_java17_home()
        if java_home:
            java_bin = os.path.join(java_home, "bin")
            os.environ["JAVA_HOME"] = java_home
            os.environ["PATH"] = f"{java_bin}{os.pathsep}{os.environ.get('PATH', '')}"
            print(f"✅ Java configured at: {java_home}")

    # Set other required environment variables
    os.environ.setdefault("SPARK_LOCAL_IP", "127.0.0.1")

    # Set PySpark to use the current Python interpreter
    if not os.environ.get("PYSPARK_PYTHON"):
        os.environ["PYSPARK_PYTHON"] = sys.executable
        os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

    # Reduce Spark log verbosity
    os.environ.setdefault("SPARK_SUBMIT_OPTS", "-Dlog4j.logLevel=ERROR")

# Run the setup now so the environment variables are in place for the cells below
setup_java_for_notebook()

In [None]:
# Install required packages if not already installed
# Run this cell if you haven't installed the packages yet
# `sys.executable` is the interpreter running this kernel, so pip installs into that same environment
!{sys.executable} -m pip install pyspark pandas numpy

In [ ]:
# Import required libraries
from datetime import datetime, timedelta
import random
import pandas as pd

# Add parent directory to path to import pyspark_analyzer
sys.path.insert(0, os.path.abspath('..'))

# Import PySpark
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField, IntegerType, StringType,
    DoubleType, TimestampType, BooleanType
)

# Import pyspark_analyzer
from pyspark_analyzer import analyze

## 2. Initialize Spark Session

We'll create a local Spark session with some optimized settings for local development.

In [None]:
# Build (or reuse) a Spark session running in local mode
spark = (
    SparkSession.builder
    .appName("PySpark Analyzer Local Test")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.shuffle.partitions", "10")
    .getOrCreate()
)

# Keep Spark's own logging at WARN so the notebook output stays readable
spark.sparkContext.setLogLevel("WARN")

# Report which Spark version is running and where its web UI is served
print(f"Spark version: {spark.version}")
print(f"Spark UI available at: {spark.sparkContext.uiWebUrl}")

## 3. Create Sample Datasets

Let's create various sample datasets to test different features of the analyzer.

In [None]:
# Create a sample e-commerce dataset
def create_ecommerce_data(num_rows=10000, seed=None):
    """Create a sample e-commerce dataset for testing.

    Args:
        num_rows: Number of order rows to generate. ``0`` yields an empty
            DataFrame with the same schema.
        seed: Optional seed for reproducible data. When ``None`` (the
            default) the shared ``random`` module state is used.

    Returns:
        A Spark DataFrame, created with the notebook-level ``spark`` session,
        with one row per order and the explicit schema defined below.
        ``category``, ``price``, ``payment_method``, ``rating`` and
        ``discount_percentage`` deliberately contain some ``None`` values.
    """
    # A private generator keeps seeded runs from disturbing the global state.
    rng = random if seed is None else random.Random(seed)

    categories = ['Electronics', 'Clothing', 'Books', 'Home & Garden', 'Sports', None]
    payment_methods = ['Credit Card', 'PayPal', 'Debit Card', 'Apple Pay', None]

    # randint(1, 0) raises ValueError, so keep at least one customer id
    # available when fewer than ten rows are requested.
    max_customer_id = max(1, num_rows // 10)

    # Read the clock once so every order date is relative to the same instant.
    now = datetime.now()

    # Generate sample data
    data = []
    for i in range(num_rows):
        order_date = now - timedelta(days=rng.randint(0, 365))

        data.append({
            'order_id': i + 1,
            'customer_id': rng.randint(1, max_customer_id),
            'product_name': f'Product_{rng.randint(1, 1000)}',
            'category': rng.choice(categories),
            # Roughly 5% of prices are left missing.
            'price': round(rng.uniform(10, 1000), 2) if rng.random() > 0.05 else None,
            'quantity': rng.randint(1, 10),
            'order_date': order_date,
            'payment_method': rng.choice(payment_methods),
            # Roughly 10% of orders are flagged as returned.
            'is_returned': rng.random() < 0.1,
            'rating': rng.choice([1, 2, 3, 4, 5, None]),
            'discount_percentage': rng.choice([0, 10, 20, 30, None]),
        })

    # Create DataFrame
    schema = StructType([
        StructField("order_id", IntegerType(), False),
        StructField("customer_id", IntegerType(), True),
        StructField("product_name", StringType(), True),
        StructField("category", StringType(), True),
        StructField("price", DoubleType(), True),
        StructField("quantity", IntegerType(), True),
        StructField("order_date", TimestampType(), True),
        StructField("payment_method", StringType(), True),
        StructField("is_returned", BooleanType(), True),
        StructField("rating", IntegerType(), True),
        StructField("discount_percentage", IntegerType(), True),
    ])

    return spark.createDataFrame(data, schema)

# Create the 10,000-row dataset, report its size, and preview the first 5 rows
df_ecommerce = create_ecommerce_data(10000)
print(f"Created e-commerce dataset with {df_ecommerce.count():,} rows and {len(df_ecommerce.columns)} columns")
df_ecommerce.show(5)

## 4. Basic Analysis

Let's run the analyzer with default settings.

In [None]:
# Run basic analysis (returns pandas DataFrame by default)
profile_df = analyze(df_ecommerce)

# Display the results
# (the bare `profile_df` on the last line is what the notebook renders as the cell output)
print("Column Statistics:")
profile_df

In [None]:
# Access metadata
# The overview is read from the 'overview' entry of the result's `attrs` mapping
print("Dataset Overview:")
for key, value in profile_df.attrs['overview'].items():
    print(f"  {key}: {value}")

## 5. Different Output Formats

In [None]:
# Get dictionary format for programmatic access
profile_dict = analyze(df_ecommerce, output_format="dict")

# Dataset-level entries are read from the 'overview' key
print("Overview:")
for key, value in profile_dict['overview'].items():
    print(f"  {key}: {value}")

# Per-column entries are read from 'columns', keyed by column name
print("\nSample column statistics (price):")
for key, value in profile_dict['columns']['price'].items():
    print(f"  {key}: {value}")

In [None]:
# Get human-readable summary
# The result is passed straight to print(), so it is presumably a string — confirm against the analyzer docs
summary = analyze(df_ecommerce, output_format="summary")
print(summary)

## 6. Testing Sampling Features

In [None]:
# Create a larger dataset to test sampling
# Building 15,000,000 rows as Python dicts on the driver needs far more memory
# than the 4g configured for it, so generate a small base frame in Python and
# let Spark replicate it lazily instead.
from pyspark.sql import functions as F

base_rows = 10_000
copies = 1_500  # base_rows * copies = 15,000,000 rows
df_base = create_ecommerce_data(base_rows)
df_large = (
    df_base.crossJoin(spark.range(copies).withColumnRenamed("id", "copy_id"))
    # Re-number order_id so it stays unique across the replicated copies.
    .withColumn("order_id", (F.col("copy_id") * base_rows + F.col("order_id")).cast("int"))
    .drop("copy_id")
)
print(f"Created large dataset with {df_large.count():,} rows")

# Test different sampling options
print("\n1. Auto-sampling (default behavior):")
profile_auto = analyze(df_large, output_format="dict")
print(f"   Sample size: {profile_auto['sampling']['sample_size']:,} rows")
print(f"   Is sampled: {profile_auto['sampling']['is_sampled']}")
# The speedup entry is only read when sampling actually happened
if profile_auto['sampling']['is_sampled']:
    print(f"   Estimated speedup: {profile_auto['sampling']['estimated_speedup']:.1f}x")

In [None]:
# Test with specific target rows
# seed=42 is passed alongside target_rows — presumably to make the sample reproducible; confirm against the analyzer docs
print("\n2. Sample to specific number of rows:")
profile_target = analyze(df_large, target_rows=5000, seed=42, output_format="dict")
print(f"   Sample size: {profile_target['sampling']['sample_size']:,} rows")


In [None]:
# Test with fraction sampling
print("\n3. Sample by fraction:")
profile_fraction = analyze(df_large, fraction=0.1, seed=42, output_format="dict")
print(f"   Sample size: {profile_fraction['sampling']['sample_size']:,} rows")
# NOTE: this message is a hard-coded string; keep it in sync with the `fraction=` argument above
print("   Fraction used: 0.1 (10%)")

In [None]:
# Test without sampling
# sampling=False is passed, so 'is_sampled' below is expected to print False — confirm against the analyzer docs
print("\n4. Disable sampling:")
profile_no_sample = analyze(df_large, sampling=False, output_format="dict")
print(f"   Analyzed rows: {profile_no_sample['overview']['total_rows']:,}")
print(f"   Is sampled: {profile_no_sample['sampling']['is_sampled']}")

## 7. Advanced Statistics

In [None]:
# Enable advanced statistics
# No output_format is given, so the default output format is used
profile_advanced = analyze(df_ecommerce, include_advanced=True)

profile_advanced


## 8. Data Quality Analysis

In [None]:
# Enable data quality analysis
# Advanced statistics stay off so the output shows only what the quality analysis adds
profile_quality = analyze(df_ecommerce, include_quality=True, include_advanced=False)

profile_quality

## 9. Profile Specific Columns

In [None]:
# Profile only specific columns
columns_to_profile = ['price', 'category', 'order_date', 'is_returned']
profile_subset = analyze(df_ecommerce, columns=columns_to_profile)

# NOTE(review): len() counts result rows — this assumes one row per profiled column; confirm
print(f"Profiled {len(profile_subset)} columns:")
profile_subset

## 10. Test with Different Data Types

In [None]:
# Create a dataset with various data types
from decimal import Decimal

# 1,000 rows mixing strings, decimals, timestamps, lists, dicts, booleans and floats.
# `description` and `is_active` are sometimes None; `score` is sometimes NaN rather than None.
complex_data = [
    {
        'id': i,
        'name': f'Item_{i}',
        'description': 'Lorem ipsum ' * random.randint(1, 10) if random.random() > 0.1 else None,
        'price_decimal': Decimal(str(round(random.uniform(10, 1000), 2))),
        'created_date': datetime.now() - timedelta(days=random.randint(0, 365)),
        # Slice keeps the first 1 to 3 tags
        'tags': ['tag1', 'tag2', 'tag3'][:random.randint(1, 3)],
        # NOTE(review): the values mix str and int; how Spark types this column depends on schema inference — verify with printSchema() below
        'metadata': {'key': 'value', 'count': random.randint(1, 100)},
        'is_active': random.choice([True, False, None]),
        'score': random.uniform(0, 100) if random.random() > 0.1 else float('nan'),
    }
    for i in range(1000)
]

# No schema is passed here, so Spark infers the column types from the data
df_complex = spark.createDataFrame(complex_data)
print("Complex dataset schema:")
df_complex.printSchema()

In [None]:
# Analyze complex dataset
profile_complex = analyze(df_complex)
print("Profile of complex data types:")
# Show only the columns of the profile that identify each field and its null/distinct counts
profile_complex[['column_name', 'data_type', 'null_count', 'distinct_count']]

## 11. Performance Comparison

In [None]:
import time

# Create datasets of different sizes
sizes = [1000, 10000, 50000]
results = []

for size in sizes:
    df_test = create_ecommerce_data(size)

    # With sampling
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    _ = analyze(df_test, output_format="dict")
    time_with_sampling = time.perf_counter() - start

    # Without sampling
    start = time.perf_counter()
    _ = analyze(df_test, sampling=False, output_format="dict")
    time_without_sampling = time.perf_counter() - start

    results.append({
        'rows': size,
        'with_sampling': round(time_with_sampling, 2),
        'without_sampling': round(time_without_sampling, 2),
        # Guard against a zero denominator on very fast runs
        'speedup': round(time_without_sampling / time_with_sampling, 1) if time_with_sampling > 0 else 1
    })

# One row per dataset size; the bare expression below renders the table
performance_df = pd.DataFrame(results)
print("Performance Comparison:")
performance_df

## 12. Export Results

In [ ]:
import json

# Get profile as pandas DataFrame
final_profile = analyze(df_ecommerce, include_advanced=True, include_quality=True)

# Save to different formats
output_dir = "profile_outputs"
os.makedirs(output_dir, exist_ok=True)

# CSV
final_profile.to_csv(f"{output_dir}/profile.csv", index=False)
print(f"Saved profile to {output_dir}/profile.csv")

# Parquet
# pandas needs an optional engine (pyarrow or fastparquet) for Parquet and
# raises ImportError without one; skip this format instead of aborting the
# remaining exports.
try:
    final_profile.to_parquet(f"{output_dir}/profile.parquet", index=False)
except ImportError as e:
    print(f"Skipped Parquet export (install pyarrow or fastparquet): {e}")
else:
    print(f"Saved profile to {output_dir}/profile.parquet")

# HTML
final_profile.to_html(f"{output_dir}/profile.html", index=False)
print(f"Saved profile to {output_dir}/profile.html")

# JSON (using dict format)
profile_json = analyze(df_ecommerce, include_advanced=True, include_quality=True, output_format="dict")
# Explicit encoding keeps the file identical across platforms; default=str
# stringifies any value json cannot serialize natively.
with open(f"{output_dir}/profile.json", 'w', encoding="utf-8") as f:
    json.dump(profile_json, f, indent=2, default=str)
print(f"Saved profile to {output_dir}/profile.json")

## 13. Cleanup

In [None]:
# Stop Spark session
# After this, cells that use `spark` need the session-creation cell to be re-run first
spark.stop()
print("Spark session stopped.")

## Summary

This notebook demonstrated:
1. Setting up a local Spark session
2. Creating sample datasets for testing
3. Running basic analysis with default settings
4. Using different output formats (pandas, dict, summary)
5. Testing sampling features with various configurations
6. Enabling advanced statistics and data quality analysis
7. Profiling specific columns
8. Working with different data types
9. Performance comparison with and without sampling
10. Exporting results to various formats

### Next Steps
- Try with your own datasets
- Experiment with larger datasets to see sampling benefits
- Customize sampling thresholds and configurations
- Integrate the profiler into your data pipelines