# PySpark Quick Start with Spark Cluster

This notebook demonstrates how to use PySpark with a Spark cluster running in Docker.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, explode, split, lower
from pyspark.sql.functions import round as spark_round

# Build (or reuse) a session attached to the cluster master.
# Executors are configured with "1g" of memory and 1 core each, and the
# application's total core usage is capped at 2 via spark.cores.max.
spark = (
    SparkSession.builder
    .appName("WordCount Example")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "2")
    .getOrCreate()
)

# Report what we are connected to
print(f"Spark Version: {spark.version}")
print(f"Master: {spark.sparkContext.master}")
print(f"Application ID: {spark.sparkContext.applicationId}")

## Example 1: Word Count

In [None]:
# A few short sentences to run the word count on
text_data = [
    "Apache Spark is amazing",
    "Spark is fast and powerful",
    "PySpark makes Spark easy to use",
    "Spark Spark Spark",
]

# Wrap each sentence in a 1-tuple so it becomes one row with a single "line" column
line_rows = [(sentence,) for sentence in text_data]
df = spark.createDataFrame(line_rows, schema=["line"])
df.show(truncate=False)

In [None]:
# Word count transformation
#
# 1. Lower-case each line and split it on runs of whitespace (regex \s+), so
#    repeated spaces or tabs do not produce empty "words".
# 2. Explode the resulting array into one row per word and drop any empty
#    token left over from leading/trailing whitespace.
words = df.select(
    explode(split(lower(col("line")), r"\s+")).alias("word")
).filter(col("word") != "")

# 3. Count occurrences per word. Sort by count (highest first) and break ties
#    alphabetically so the displayed order is the same on every run.
word_counts = (
    words
    .groupBy("word")
    .count()
    .orderBy(col("count").desc(), col("word").asc())
)

word_counts.show()

## Example 2: Working with Structured Data

In [None]:
# (name, age, role) records for a handful of employees
data = [
    ("Alice", 34, "Engineer"),
    ("Bob", 45, "Manager"),
    ("Charlie", 28, "Engineer"),
    ("Diana", 35, "Analyst"),
    ("Eve", 29, "Engineer"),
]
columns = ["name", "age", "role"]

# Column names are supplied as the schema, in the same order as the tuple fields
employees_df = spark.createDataFrame(data, schema=columns)
employees_df.show()

In [None]:
# Keep only the rows whose role is exactly "Engineer"
print("Engineers:")
employees_df.where(employees_df["role"] == "Engineer").show()

# Both aggregates below use the same grouping by role
employees_by_role = employees_df.groupBy("role")

print("\nCount by Role:")
employees_by_role.count().show()

print("\nAverage Age by Role:")
employees_by_role.avg("age").show()

## Example 3: Reading from Files

In [None]:
# Create a sample CSV dataset: (product_id, quantity, price) rows
csv_data = [
    ("product_1", 100, 25.50),
    ("product_2", 200, 15.75),
    ("product_3", 150, 30.00),
    ("product_4", 75, 45.25)
]

products_df = spark.createDataFrame(csv_data, ["product_id", "quantity", "price"])

# Save as CSV.
# Spark writes a *directory* at this path containing part files, not a single
# file. mode("overwrite") replaces any previous output, so the cell is safe to
# re-run.
# NOTE(review): assumes /data is a volume visible to both the driver and the
# workers — confirm against the Docker setup.
products_csv_path = "/data/products.csv"
products_df.write.mode("overwrite").csv(products_csv_path, header=True)
print(f"CSV dataset written to directory {products_csv_path}")

In [None]:
# Load the CSV back: take column names from the header row and let Spark
# infer the column types from the data.
loaded_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/data/products.csv")
)

# Show the rows, then the inferred schema
loaded_df.show()
loaded_df.printSchema()

In [None]:
# Calculate total value: quantity * price for each product, rounded to 2 decimals.
# `spark_round` is pyspark.sql.functions.round, imported in the first code cell
# under an alias so it does not shadow Python's built-in round().
products_with_total_df = loaded_df.withColumn(
    "total_value",
    spark_round(col("quantity") * col("price"), 2),
)
products_with_total_df.show()

## Monitor Your Spark Job

While running these cells, you can monitor your Spark jobs at:
- **Spark Master UI**: http://localhost:8080
- **Application UI**: http://localhost:4040 (available while the Spark session is active)
- **Worker 1 UI**: http://localhost:8081
- **Worker 2 UI**: http://localhost:8082

In [None]:
# Stop the Spark session when done (uncomment the line below to release the cluster resources held by this application)
# spark.stop()