In [2]:
import os
import socket
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum

# Report which container this cell is executing in.
print(f"Hostname of this container: {socket.gethostname()}")

# ---------------------------------------------------------------------------
# 1. Initialize Spark Session
# ---------------------------------------------------------------------------
# Both the Spark master URL and the default HDFS filesystem are addressed by
# hostname ("spark-master", "namenode"); these are the aliases used on the
# internal Docker network.
# "dfs.client.use.datanode.hostname" is switched on as a workaround for
# Docker networking issues when the client talks to DataNodes.
spark = (
    SparkSession.builder
    .appName("FullStackTest")
    .master("spark://spark-master:7077")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .config("spark.hadoop.dfs.client.use.datanode.hostname", "true")
    .getOrCreate()
)

# Echo the version and master the session reports, then a separator line.
print(f"Spark Version: {spark.version}")
print(f"Master: {spark.sparkContext.master}")
print("-" * 30)

# ---------------------------------------------------------------------------
# 2. Test Distributed Processing (Spark Worker Check)
# ---------------------------------------------------------------------------
print("Generating sample data in memory...")

# Sample sales figures as (store, amount) pairs.
data = [
    ("Store_A", 100),
    ("Store_A", 200),
    ("Store_A", 150),
    ("Store_B", 300),
    ("Store_B", 100),
    ("Store_C", 50),
    ("Store_C", 50),
]
columns = ["Store", "Sales"]

# Build the DataFrame from the in-memory rows, naming the two columns.
df = spark.createDataFrame(data, schema=columns)

# Total the sales per store. The transformation itself is lazy; the show()
# call below is the action that actually runs the job, which is what
# exercises the cluster.
# NOTE(review): the progress message mentions "Count", but only a sum is
# computed here - either add a count or reword the message.
print("Running aggregation (Count & Sum)...")
result_df = (
    df.groupBy("Store")
    .agg(_sum("Sales").alias("Total_Sales"))
)
result_df.show()

# ---------------------------------------------------------------------------
# 3. Test HDFS Write/Read (NameNode & DataNode Check)
# ---------------------------------------------------------------------------
# Round-trips the aggregated result through HDFS: write it as CSV, read it
# back, and compare the rows that come back with the rows that were written.
hdfs_path = "hdfs://namenode:9000/user/test_output/sales_report"

print(f"Attempting to write to HDFS: {hdfs_path} ...")
try:
    # Write the result to HDFS (overwriting if it already exists).
    result_df.write.mode("overwrite").csv(hdfs_path)
except Exception as e:
    # Broad catch is deliberate: this is a smoke test that reports the
    # failure and keeps going instead of aborting the cell.
    write_ok = False
    print(f"Write FAILED: {e}")
else:
    write_ok = True
    print("Write SUCCESS!")

if write_ok:
    print("Attempting to read back from HDFS...")
    try:
        # The files are written without a header row, so they are read back
        # without one too: columns come back as _c0/_c1 and every value is a
        # string.
        read_df = spark.read.csv(hdfs_path)
        row_count = read_df.count()

        # Verify data integrity: compare the two row sets as sorted tuples
        # of strings, since the CSV round trip drops both the column types
        # and any row ordering.
        written_rows = sorted(
            tuple(str(value) for value in row) for row in result_df.collect()
        )
        read_rows = sorted(
            tuple(str(value) for value in row) for row in read_df.collect()
        )
        if read_rows != written_rows:
            raise ValueError(
                f"data read from HDFS does not match what was written: "
                f"wrote {written_rows}, read {read_rows}"
            )

        print(f"Read SUCCESS! Row count: {row_count}")
        read_df.show()
    except Exception as e:
        print(f"Read FAILED: {e}")
else:
    # Reading after a failed write could pick up files left by an earlier
    # run and report a misleading success, so skip the read entirely.
    print("Read SKIPPED: the write to HDFS did not succeed.")

# ---------------------------------------------------------------------------
# 4. Stop
# ---------------------------------------------------------------------------
# Shut the Spark session down now that the checks are done, then confirm it
# on stdout.
spark.stop()
print("Spark Session Stopped.")

Hostname of this container: dfd292ffed16
Spark Version: 3.5.0
Master: spark://spark-master:7077
------------------------------
Generating sample data in memory...
Running aggregation (Count & Sum)...
+-------+-----------+
|  Store|Total_Sales|
+-------+-----------+
|Store_A|        450|
|Store_B|        400|
|Store_C|        100|
+-------+-----------+

Attempting to write to HDFS: hdfs://namenode:9000/user/test_output/sales_report ...
Write SUCCESS!
Attempting to read back from HDFS...
Read SUCCESS! Row count: 3
+-------+---+
|    _c0|_c1|
+-------+---+
|Store_A|450|
|Store_B|400|
|Store_C|100|
+-------+---+

Spark Session Stopped.
