# Apache Spark Exploration Notebook

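This notebook demonstrates Apache Spark for data processing and analytics on a taxi-trip dataset: loading a CSV file, computing summary statistics, visualizing a sample with matplotlib, querying with Spark SQL, and writing partitioned Parquet output.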

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import seaborn as sns

# Configure the session builder: application name plus the master URL
# the notebook connects to.
session_builder = (
    SparkSession.builder
    .appName("SparkExploration")
    .config("spark.master", "spark://spark-master:7077")
)

# getOrCreate() hands back an existing session if one is already active,
# otherwise it starts a new one from the builder settings.
spark = session_builder.getOrCreate()

# Report the Spark version and where the web UI can be reached
print(f"Spark version: {spark.version}")
print(f"Spark context web UI: {spark.sparkContext.uiWebUrl}")

## Load and Explore Taxi Data

In [None]:
# Load the taxi CSV. header=True takes column names from the first row and
# inferSchema=True asks Spark to derive column types from the data.
#
# The DataFrame is marked for caching because it is reused by several later
# actions in this notebook (describe, sample, the SQL query, the Parquet
# write). Without a cache each of those actions re-reads and re-parses the
# CSV file from scratch.
taxi_df = spark.read.csv(
    "/home/jovyan/data/taxi_data.csv",
    header=True,
    inferSchema=True,
).cache()

# count() is the first action on taxi_df, so it also materializes the cache.
print(f"Dataset shape: {taxi_df.count()} rows, {len(taxi_df.columns)} columns")
taxi_df.printSchema()

In [None]:
# describe() builds a summary-statistics DataFrame for taxi_df;
# show() prints it as a text table.
summary_stats = taxi_df.describe()
summary_stats.show()

## Data Visualization

In [None]:
# Pull a ~10% random sample to the driver as a pandas DataFrame for plotting.
# A fixed seed is passed so that re-running the notebook on the same input
# draws the same sample and therefore produces the same figure.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42
sample_df = taxi_df.sample(fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED).toPandas()

# One histogram panel per entry: (column, panel title, x-axis label)
histogram_panels = [
    ("trip_distance", "Trip Distance Distribution", "Distance (miles)"),
    ("fare_amount", "Fare Amount Distribution", "Fare ($)"),
]

# Explicit figure/axes objects instead of the implicit pyplot "current axes"
# state, so each panel is configured through its own Axes.
fig, axes = plt.subplots(1, len(histogram_panels), figsize=(12, 4))

for ax, (column, title, xlabel) in zip(axes, histogram_panels):
    # Drop missing values explicitly so the histogram input is purely numeric
    # data rather than relying on the plotting library's handling of NaN.
    ax.hist(sample_df[column].dropna(), bins=50, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## Spark SQL Demo

In [None]:
# Register taxi_df under the name "taxi_trips" so it can be queried with SQL
taxi_df.createOrReplaceTempView("taxi_trips")

# Per-payment-type summary: number of trips, average fare and average tip,
# ordered from the most-used payment type to the least-used.
payment_summary_sql = """
    SELECT 
        payment_type,
        COUNT(*) as trip_count,
        AVG(fare_amount) as avg_fare,
        AVG(tip_amount) as avg_tip
    FROM taxi_trips
    GROUP BY payment_type
    ORDER BY trip_count DESC
"""
result = spark.sql(payment_summary_sql)

result.show()

## Performance Optimization

In [None]:
# Location of the partitioned Parquet output (written, then read back below)
partitioned_path = "/home/jovyan/data/partitioned_taxi"

# Write taxi_df as Parquet partitioned by payment_type; "overwrite" mode
# replaces whatever already exists at the target path.
(
    taxi_df.write
    .mode("overwrite")
    .partitionBy("payment_type")
    .parquet(partitioned_path)
)

# Load the partitioned output back and report how many rows it holds
partitioned_df = spark.read.parquet(partitioned_path)
print(f"Partitioned dataset loaded: {partitioned_df.count()} rows")

## Clean up

In [None]:
# Shut down the Spark session created at the top of this notebook.
# Cells that use `spark` cannot be re-run after this until the session
# is created again.
spark.stop()