# PySpark Test Notebook
Tests the connection to a Spark cluster by running small PySpark example jobs.

In [1]:
"""
Simple Word Count Example
Counts word frequency in a text
"""
from pyspark.sql import SparkSession

# Local-mode Spark session for the word count
spark = (
    SparkSession.builder
    .appName("WordCount")
    .master("local[*]")
    .getOrCreate()
)

# Input sentences whose words are counted
text_data = [
    "Apache Spark is amazing",
    "Spark is fast and powerful",
    "PySpark makes Spark easy to use",
    "Spark Spark Spark",
]

# Distribute the sentences, then count case-insensitive word occurrences
rdd = spark.sparkContext.parallelize(text_data)
word_counts = (
    rdd.flatMap(lambda sentence: sentence.split())      # one record per word
    .map(lambda token: (token.lower(), 1))              # (lower-cased word, 1)
    .reduceByKey(lambda left, right: left + right)      # add up the ones per word
    .sortBy(lambda pair: pair[1], ascending=False)      # highest count first
)

print("\n=== Word Count Results ===")
for term, freq in word_counts.collect():
    print(f"{term}: {freq}")

spark.stop()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/09 01:10:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable



=== Word Count Results ===
spark: 6
is: 2
and: 1
powerful: 1
pyspark: 1
makes: 1
to: 1
use: 1
apache: 1
amazing: 1
fast: 1
easy: 1


In [2]:
"""
DataFrame Operations Example
Filter, select, and transform data
"""
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, upper

# Local-mode Spark session for the DataFrame examples
spark = (
    SparkSession.builder
    .appName("DataFrameOperations")
    .master("local[*]")
    .getOrCreate()
)

# Employee records, one tuple per row in the order given by `columns`
columns = ["name", "department", "salary", "age"]
data = [
    ("John", "Sales", 5000, 28),
    ("Alice", "Engineering", 8000, 32),
    ("Bob", "Sales", 6000, 35),
    ("Charlie", "Engineering", 9000, 29),
    ("Diana", "HR", 5500, 30),
    ("Eve", "Engineering", 7500, 27),
]
df = spark.createDataFrame(data, columns)

print("\n=== Original DataFrame ===")
df.show()

# Rows whose salary is strictly greater than 6000
print("\n=== High Earners (Salary > 6000) ===")
high_earners = df.filter(col("salary") > 6000)
high_earners.show()

# Upper-case the name (keeping the column name) next to the department
print("\n=== Names and Departments (Uppercase) ===")
upper_name = upper(col("name")).alias("name")
df.select(upper_name, col("department")).show()

# Average salary for each department
print("\n=== Average Salary by Department ===")
by_department = df.groupBy("department")
by_department.avg("salary").show()

spark.stop()



=== Original DataFrame ===
+-------+-----------+------+---+
|   name| department|salary|age|
+-------+-----------+------+---+
|   John|      Sales|  5000| 28|
|  Alice|Engineering|  8000| 32|
|    Bob|      Sales|  6000| 35|
|Charlie|Engineering|  9000| 29|
|  Diana|         HR|  5500| 30|
|    Eve|Engineering|  7500| 27|
+-------+-----------+------+---+


=== High Earners (Salary > 6000) ===
+-------+-----------+------+---+
|   name| department|salary|age|
+-------+-----------+------+---+
|  Alice|Engineering|  8000| 32|
|Charlie|Engineering|  9000| 29|
|    Eve|Engineering|  7500| 27|
+-------+-----------+------+---+


=== Names and Departments (Uppercase) ===
+-------+-----------+
|   name| department|
+-------+-----------+
|   JOHN|      Sales|
|  ALICE|Engineering|
|    BOB|      Sales|
|CHARLIE|Engineering|
|  DIANA|         HR|
|    EVE|Engineering|
+-------+-----------+


=== Average Salary by Department ===
+-----------+-----------------+
| department|      avg(salary)|
+----

In [3]:
"""
Data Aggregation Example
Statistical operations and aggregations
"""
from pyspark.sql import SparkSession
# Use the functions module through an alias rather than importing avg/max/min/sum/count
# by name: bare imports would shadow the Python builtins of the same name in the
# notebook namespace, and F.col keeps this cell independent of imports made elsewhere.
from pyspark.sql import functions as F

# Create Spark Session
spark = (
    SparkSession.builder
    .appName("DataAggregation")
    .master("local[*]")
    .getOrCreate()
)

# Sample data - Product sales, one tuple per row in the order given by `columns`
sales_data = [
    ("Laptop", "Electronics", 1200, 5),
    ("Mouse", "Electronics", 25, 50),
    ("Desk", "Furniture", 300, 10),
    ("Chair", "Furniture", 150, 20),
    ("Keyboard", "Electronics", 75, 30),
    ("Monitor", "Electronics", 400, 15),
    ("Table", "Furniture", 250, 8),
]

columns = ["product", "category", "price", "quantity"]
df = spark.createDataFrame(sales_data, columns)

print("\n=== Sales Data ===")
df.show()

# Revenue per row = unit price * quantity sold
df_with_revenue = df.withColumn("revenue", F.col("price") * F.col("quantity"))

print("\n=== Sales with Revenue ===")
df_with_revenue.show()

# Aggregate statistics by category
print("\n=== Category Statistics ===")
category_stats = df_with_revenue.groupBy("category").agg(
    F.count("product").alias("num_products"),
    F.sum("revenue").alias("total_revenue"),
    F.avg("price").alias("avg_price"),
    F.max("quantity").alias("max_quantity"),
    F.min("quantity").alias("min_quantity"),
)
category_stats.show()

# Overall statistics across every row (no grouping)
print("\n=== Overall Statistics ===")
overall_stats = df_with_revenue.agg(
    F.sum("revenue").alias("total_revenue"),
    F.avg("price").alias("avg_price"),
    F.count("product").alias("total_products"),
)
overall_stats.show()

spark.stop()



=== Sales Data ===
+--------+-----------+-----+--------+
| product|   category|price|quantity|
+--------+-----------+-----+--------+
|  Laptop|Electronics| 1200|       5|
|   Mouse|Electronics|   25|      50|
|    Desk|  Furniture|  300|      10|
|   Chair|  Furniture|  150|      20|
|Keyboard|Electronics|   75|      30|
| Monitor|Electronics|  400|      15|
|   Table|  Furniture|  250|       8|
+--------+-----------+-----+--------+


=== Sales with Revenue ===
+--------+-----------+-----+--------+-------+
| product|   category|price|quantity|revenue|
+--------+-----------+-----+--------+-------+
|  Laptop|Electronics| 1200|       5|   6000|
|   Mouse|Electronics|   25|      50|   1250|
|    Desk|  Furniture|  300|      10|   3000|
|   Chair|  Furniture|  150|      20|   3000|
|Keyboard|Electronics|   75|      30|   2250|
| Monitor|Electronics|  400|      15|   6000|
|   Table|  Furniture|  250|       8|   2000|
+--------+-----------+-----+--------+-------+


=== Category Statistics =

In [4]:
"""
Join Operations Example
Combining multiple DataFrames
"""
from pyspark.sql import SparkSession

# Local-mode Spark session for the join examples
spark = (
    SparkSession.builder
    .appName("JoinOperations")
    .master("local[*]")
    .getOrCreate()
)

# Customers: (customer_id, name, city)
customers = [
    (1, "John Doe", "New York"),
    (2, "Jane Smith", "Los Angeles"),
    (3, "Bob Johnson", "Chicago"),
    (4, "Alice Brown", "Houston"),
]
customer_df = spark.createDataFrame(customers, ["customer_id", "name", "city"])

# Orders: (order_id, customer_id, amount)
orders = [
    (101, 1, 250.00),
    (102, 2, 150.00),
    (103, 1, 300.00),
    (104, 3, 200.00),
    (105, 2, 450.00),
    (106, 5, 100.00),  # no customer with id 5 is defined above
]
order_df = spark.createDataFrame(orders, ["order_id", "customer_id", "amount"])

print("\n=== Customers ===")
customer_df.show()

print("\n=== Orders ===")
order_df.show()

# Inner join: only customer/order pairs that share a customer_id
print("\n=== Inner Join: Customers with Orders ===")
inner_join = customer_df.join(order_df, on="customer_id", how="inner")
inner_join.show()

# Left join: every customer is kept, whether or not they have orders
print("\n=== Left Join: All Customers ===")
left_join = customer_df.join(order_df, on="customer_id", how="left")
left_join.show()

# Sum of order amounts per customer, built on a left join and ordered by id
print("\n=== Total Order Amount per Customer ===")
customer_totals = (
    customer_df.join(order_df, on="customer_id", how="left")
    .groupBy("customer_id", "name", "city")
    .agg({"amount": "sum"})
    .withColumnRenamed("sum(amount)", "total_amount")
    .orderBy("customer_id")
)
customer_totals.show()

spark.stop()



=== Customers ===
+-----------+-----------+-----------+
|customer_id|       name|       city|
+-----------+-----------+-----------+
|          1|   John Doe|   New York|
|          2| Jane Smith|Los Angeles|
|          3|Bob Johnson|    Chicago|
|          4|Alice Brown|    Houston|
+-----------+-----------+-----------+


=== Orders ===
+--------+-----------+------+
|order_id|customer_id|amount|
+--------+-----------+------+
|     101|          1| 250.0|
|     102|          2| 150.0|
|     103|          1| 300.0|
|     104|          3| 200.0|
|     105|          2| 450.0|
|     106|          5| 100.0|
+--------+-----------+------+


=== Inner Join: Customers with Orders ===
+-----------+-----------+-----------+--------+------+
|customer_id|       name|       city|order_id|amount|
+-----------+-----------+-----------+--------+------+
|          1|   John Doe|   New York|     101| 250.0|
|          1|   John Doe|   New York|     103| 300.0|
|          2| Jane Smith|Los Angeles|     102|

In [5]:
"""
Reading and Writing CSV Files
File I/O operations with PySpark
"""
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month

# Create Spark Session
spark = (
    SparkSession.builder
    .appName("ReadWriteCSV")
    .master("local[*]")
    .getOrCreate()
)

# Create sample data: (date, product, price, quantity)
sample_data = [
    ("2024-01-15", "Product A", 100, 50),
    ("2024-01-20", "Product B", 150, 30),
    ("2024-02-10", "Product A", 120, 45),
    ("2024-02-15", "Product C", 200, 25),
    ("2024-03-05", "Product B", 180, 35),
    ("2024-03-20", "Product A", 110, 55),
]

columns = ["date", "product", "price", "quantity"]
df = spark.createDataFrame(sample_data, columns)

print("\n=== Original Data ===")
df.show()

# Write to CSV (replacing any previous output at the same path)
output_path = "/data/sales_data.csv"
print(f"\n=== Writing to {output_path} ===")
df.write.mode("overwrite").option("header", "true").csv(output_path)
print("Data written successfully!")

# Read the same path back, taking column names from the header row
print(f"\n=== Reading from {output_path} ===")
df_read = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(output_path)
)
df_read.show()

# Transform and write processed data
print("\n=== Processing and Writing Summary ===")
# Explicit casts so the aggregation below does not depend on the inferred types
df_read_typed = (
    df_read
    .withColumn("price", col("price").cast("int"))
    .withColumn("quantity", col("quantity").cast("int"))
)

# Average price and total quantity per product, with friendlier column names
summary = (
    df_read_typed.groupBy("product")
    .agg({"price": "avg", "quantity": "sum"})
    .withColumnRenamed("avg(price)", "avg_price")
    .withColumnRenamed("sum(quantity)", "total_quantity")
)

summary.show()

# Write summary
summary_path = "/data/sales_summary.csv"
summary.write.mode("overwrite").option("header", "true").csv(summary_path)
print(f"Summary written to {summary_path}")

# Report the outputs using the path variables above, so the list cannot drift
# from the locations actually written. The trailing slash marks each output as
# a directory.
print("\n=== Files Created ===")
for index, path in enumerate((output_path, summary_path), start=1):
    print(f"{index}. {path}/")

spark.stop()



=== Original Data ===
+----------+---------+-----+--------+
|      date|  product|price|quantity|
+----------+---------+-----+--------+
|2024-01-15|Product A|  100|      50|
|2024-01-20|Product B|  150|      30|
|2024-02-10|Product A|  120|      45|
|2024-02-15|Product C|  200|      25|
|2024-03-05|Product B|  180|      35|
|2024-03-20|Product A|  110|      55|
+----------+---------+-----+--------+


=== Writing to /data/sales_data.csv ===
Data written successfully!

=== Reading from /data/sales_data.csv ===
+----------+---------+-----+--------+
|      date|  product|price|quantity|
+----------+---------+-----+--------+
|2024-01-20|Product B|  150|      30|
|2024-02-10|Product A|  120|      45|
|2024-03-05|Product B|  180|      35|
|2024-03-20|Product A|  110|      55|
|2024-01-15|Product A|  100|      50|
|2024-02-15|Product C|  200|      25|
+----------+---------+-----+--------+


=== Processing and Writing Summary ===
+---------+--------------+---------+
|  product|total_quantity|a

In [2]:
"""
Simple Word Count Example
Counts word frequency in a text
"""
from pyspark.sql import SparkSession

# NOTE: the output saved with this cell shows "JAVA_HOME is not set" followed by a
# JAVA_GATEWAY_EXITED error, i.e. that run failed before any results were printed.
builder = SparkSession.builder.appName("WordCount").master("local[*]")
spark = builder.getOrCreate()

# Sentences to analyse
text_data = [
    "Apache Spark is amazing",
    "Spark is fast and powerful",
    "PySpark makes Spark easy to use",
    "Spark Spark Spark",
]

# Word count, one named stage per transformation
rdd = spark.sparkContext.parallelize(text_data)
words = rdd.flatMap(lambda line: line.split())
pairs = words.map(lambda w: (w.lower(), 1))
totals = pairs.reduceByKey(lambda x, y: x + y)
word_counts = totals.sortBy(lambda kv: kv[1], ascending=False)

print("\n=== Word Count Results ===")
for key, total in word_counts.collect():
    print(f"{key}: {total}")

spark.stop()



JAVA_HOME is not set


PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

In [3]:
 # Import in this cell too, so it does not depend on another cell having run first
 from pyspark.sql import SparkSession

 # Session against the Spark master at spark://spark-master:7077,
 # requesting 1g of memory and 1 core per executor
 spark = (
     SparkSession.builder
     .appName("WordCount")
     .master("spark://spark-master:7077")
     .config("spark.executor.memory", "1g")
     .config("spark.executor.cores", "1")
     .getOrCreate()
 )


JAVA_HOME is not set


PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.