In [1]:
# ============================================================================
# SPARK SESSION CONFIGURATION - Connect to Cluster, MinIO, and PostgreSQL
# ============================================================================

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time

print("Creating Spark Session...")
print("=" * 70)

# Create Spark Session with all configurations
spark = SparkSession.builder \
    .appName("DataLake - Medallion Architecture") \
    .master("spark://spark-master:7077") \
    .config("spark.executor.memory", "1g") \
    .config("spark.executor.cores", "1") \
    .config("spark.cores.max", "3") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin123") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .config("spark.jars.packages", 
            "org.apache.hadoop:hadoop-aws:3.3.4,"
            "com.amazonaws:aws-java-sdk-bundle:1.12.262,"
            "org.postgresql:postgresql:42.5.4") \
    .getOrCreate()

# Set log level to reduce noise
spark.sparkContext.setLogLevel("WARN")

print("‚úÖ Spark Session created successfully!")
print("=" * 70)

# ============================================================================
# VERIFY CLUSTER CONNECTION
# ============================================================================

print("\n" + "=" * 70)
print("SPARK CLUSTER INFORMATION")
print("=" * 70)

sc = spark.sparkContext

print(f"Spark Version: {spark.version}")
print(f"Master URL: {sc.master}")
print(f"Application Name: {sc.appName}")
print(f"Application ID: {sc.applicationId}")
print(f"Default Parallelism: {sc.defaultParallelism}")
print(f"Spark UI: http://localhost:4040")
print(f"Master UI: http://localhost:8080")

print("\nüìä Cluster Status:")
if sc.master.startswith("spark://"):
    print("‚úÖ Connected to Standalone Cluster")
else:
    print("‚ö†Ô∏è  Running in Local Mode")

print("=" * 70)

# ============================================================================
# TEST 1: VERIFY DISTRIBUTED PROCESSING
# ============================================================================

print("\n" + "=" * 70)
print("TEST 1: DISTRIBUTED PROCESSING")
print("=" * 70)

# Create test data distributed across workers
data = list(range(1, 1001))
rdd = sc.parallelize(data, 6)  # 6 partitions for 3 workers

print(f"Created RDD with {rdd.count()} elements")
print(f"Number of partitions: {rdd.getNumPartitions()}")
print(f"First partition sample: {rdd.glom().collect()[0][:5]}")

# Simple computation
result = rdd.map(lambda x: x * 2).sum()
print(f"Sum of doubled values: {result}")
print("‚úÖ Distributed processing working!")

print("=" * 70)

# ============================================================================
# TEST 2: MINIO (S3) CONNECTION - WRITE
# ============================================================================

print("\n" + "=" * 70)
print("TEST 2: MINIO CONNECTION - WRITE DATA")
print("=" * 70)

# Create sample DataFrame
sample_data = [
    (1, "Alice", 34, "Engineering", 95000.50),
    (2, "Bob", 45, "Sales", 78000.00),
    (3, "Cathy", 29, "Engineering", 88000.75),
    (4, "David", 52, "Management", 120000.00),
    (5, "Eve", 38, "Sales", 82000.25)
]

schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), False),
    StructField("age", IntegerType(), False),
    StructField("department", StringType(), False),
    StructField("salary", DoubleType(), False)
])

df = spark.createDataFrame(sample_data, schema)

print("Sample DataFrame:")
df.show()

# Write to MinIO Bronze layer
bronze_path = "s3a://bronze/test_employees/"
print(f"\nüìù Writing to MinIO: {bronze_path}")

try:
    df.write \
        .mode("overwrite") \
        .parquet(bronze_path)
    print("‚úÖ Successfully wrote data to MinIO Bronze layer!")
except Exception as e:
    print(f"‚ùå Error writing to MinIO: {str(e)}")

print("=" * 70)

# ============================================================================
# TEST 3: MINIO (S3) CONNECTION - READ
# ============================================================================

print("\n" + "=" * 70)
print("TEST 3: MINIO CONNECTION - READ DATA")
print("=" * 70)

print(f"üìñ Reading from MinIO: {bronze_path}")

try:
    df_read = spark.read.parquet(bronze_path)
    print(f"‚úÖ Successfully read {df_read.count()} records from MinIO!")
    print("\nData from MinIO:")
    df_read.show()
except Exception as e:
    print(f"‚ùå Error reading from MinIO: {str(e)}")

print("=" * 70)

# ============================================================================
# TEST 4: POSTGRESQL CONNECTION - WRITE
# ============================================================================

print("\n" + "=" * 70)
print("TEST 4: POSTGRESQL CONNECTION - WRITE DATA")
print("=" * 70)

# Add timestamp column
df_with_timestamp = df.withColumn("created_at", current_timestamp())

# PostgreSQL connection properties
postgres_url = "jdbc:postgresql://postgres:5432/gold_db"
postgres_properties = {
    "user": "postgres",
    "password": "postgres",
    "driver": "org.postgresql.Driver"
}

table_name = "employees"

print(f"üìù Writing to PostgreSQL table: {table_name}")

try:
    df_with_timestamp.write \
        .jdbc(url=postgres_url, 
              table=table_name, 
              mode="overwrite", 
              properties=postgres_properties)
    print("‚úÖ Successfully wrote data to PostgreSQL!")
except Exception as e:
    print(f"‚ùå Error writing to PostgreSQL: {str(e)}")

print("=" * 70)

# ============================================================================
# TEST 5: POSTGRESQL CONNECTION - READ
# ============================================================================

print("\n" + "=" * 70)
print("TEST 5: POSTGRESQL CONNECTION - READ DATA")
print("=" * 70)

print(f"üìñ Reading from PostgreSQL table: {table_name}")

try:
    df_postgres = spark.read \
        .jdbc(url=postgres_url, 
              table=table_name, 
              properties=postgres_properties)
    
    print(f"‚úÖ Successfully read {df_postgres.count()} records from PostgreSQL!")
    print("\nData from PostgreSQL:")
    df_postgres.show(truncate=False)
    
    print("\nSchema:")
    df_postgres.printSchema()
    
except Exception as e:
    print(f"‚ùå Error reading from PostgreSQL: {str(e)}")

print("=" * 70)

# ============================================================================
# TEST 6: COMPLETE PIPELINE - Bronze ‚Üí Silver ‚Üí Gold
# ============================================================================

print("\n" + "=" * 70)
print("TEST 6: COMPLETE MEDALLION PIPELINE")
print("=" * 70)

# BRONZE: Raw data (already written above)
print("\nü•â BRONZE LAYER - Raw Data")
bronze_df = spark.read.parquet("s3a://bronze/test_employees/")
print(f"Records: {bronze_df.count()}")
bronze_df.show(5)

# SILVER: Clean and transform
print("\nü•à SILVER LAYER - Cleaned & Transformed Data")
silver_df = bronze_df \
    .filter(col("age") >= 30) \
    .withColumn("salary_grade", 
                when(col("salary") >= 100000, "High")
                .when(col("salary") >= 80000, "Medium")
                .otherwise("Low")) \
    .withColumn("processed_timestamp", current_timestamp())

silver_path = "s3a://silver/test_employees/"
silver_df.write.mode("overwrite").parquet(silver_path)
print(f"‚úÖ Wrote to Silver layer: {silver_path}")
print(f"Records: {silver_df.count()}")
silver_df.show()

# GOLD: Aggregated business metrics
print("\nü•á GOLD LAYER - Business Aggregations")
gold_df = silver_df.groupBy("department", "salary_grade") \
    .agg(
        count("*").alias("employee_count"),
        avg("salary").alias("avg_salary"),
        min("age").alias("min_age"),
        max("age").alias("max_age")
    ) \
    .orderBy("department", "salary_grade")

gold_path = "s3a://gold/test_employees_summary/"
gold_df.write.mode("overwrite").parquet(gold_path)
print(f"‚úÖ Wrote to Gold layer: {gold_path}")
print(f"Records: {gold_df.count()}")
gold_df.show()

# Write Gold layer to PostgreSQL for Metabase
gold_table = "employees_summary"
gold_df.write \
    .jdbc(url=postgres_url, 
          table=gold_table, 
          mode="overwrite", 
          properties=postgres_properties)
print(f"‚úÖ Wrote Gold layer to PostgreSQL table: {gold_table}")

print("=" * 70)

# ============================================================================
# SUMMARY
# ============================================================================

print("\n" + "=" * 70)
print("üéâ CONNECTION TEST SUMMARY")
print("=" * 70)

summary = """
‚úÖ Spark Standalone Cluster - Connected
‚úÖ MinIO (S3) - Read/Write Working
‚úÖ PostgreSQL - Read/Write Working
‚úÖ Medallion Pipeline - Functional

üìä Data Locations:
   Bronze: s3a://bronze/test_employees/
   Silver: s3a://silver/test_employees/
   Gold:   s3a://gold/test_employees_summary/
   
üíæ PostgreSQL Tables:
   - employees
   - employees_summary (for Metabase)

üåê Access Points:
   - Spark UI: http://localhost:4040
   - Master UI: http://localhost:8080
   - MinIO Console: http://localhost:9001
   - JupyterLab: http://localhost:8888
   - Metabase: http://localhost:3000
"""

print(summary)
print("=" * 70)

Creating Spark Session...
‚úÖ Spark Session created successfully!

SPARK CLUSTER INFORMATION
Spark Version: 3.5.0
Master URL: spark://spark-master:7077
Application Name: DataLake - Medallion Architecture
Application ID: app-20251126184040-0000
Default Parallelism: 2
Spark UI: http://localhost:4040
Master UI: http://localhost:8080

üìä Cluster Status:
‚úÖ Connected to Standalone Cluster

TEST 1: DISTRIBUTED PROCESSING
Created RDD with 1000 elements
Number of partitions: 6
First partition sample: [1, 2, 3, 4, 5]
Sum of doubled values: 1001000
‚úÖ Distributed processing working!

TEST 2: MINIO CONNECTION - WRITE DATA
Sample DataFrame:
+---+-----+---+-----------+--------+
| id| name|age| department|  salary|
+---+-----+---+-----------+--------+
|  1|Alice| 34|Engineering| 95000.5|
|  2|  Bob| 45|      Sales| 78000.0|
|  3|Cathy| 29|Engineering|88000.75|
|  4|David| 52| Management|120000.0|
|  5|  Eve| 38|      Sales|82000.25|
+---+-----+---+-----------+--------+


üìù Writing to MinIO: s

In [1]:
import pyspark
print(f"PySpark version: {pyspark.__version__}")

PySpark version: 3.5.0


In [11]:
%load_ext sql


The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [12]:
%sql spark://

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/sql/connection.py", line 45, in __init__
    engine = sqlalchemy.create_engine(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<string>", line 2, in create_engine
  File "/opt/conda/lib/python3.11/site-packages/sqlalchemy/util/deprecations.py", line 281, in warned
    return fn(*args, **kwargs)  # type: ignore[no-any-return]
           ^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/sqlalchemy/engine/create.py", line 552, in create_engine
    entrypoint = u._get_entrypoint()
                 ^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/sqlalchemy/engine/url.py", line 754, in _get_entrypoint
    cls = registry.load(name)
          ^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/sqlalchemy/util/langhelpers.py", line 367, in load
    raise exc.NoSuchModuleError(
sqlalchemy.exc.NoSuchModuleError: Can't load plugin: sqlalchemy.dialects:spark

Conn

In [8]:
spark.read.csv("s3a://dev/transactions/").createOrReplaceTempView("transactions")


In [9]:
%%sql
select * from transactions

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/sql/magic.py", line 196, in execute
    conn = sql.connection.Connection.set(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/sql/connection.py", line 82, in set
    raise ConnectionError(
sql.connection.ConnectionError: Environment variable $DATABASE_URL not set, and no connect string given.

Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])


In [22]:
from pyspark.sql import SparkSession
from IPython.display import display
import pandas as pd

# Your existing Spark session
# Create Spark Session with all configurations
spark = SparkSession.builder \
    .appName("DataLake - Medallion Architecture") \
    .master("spark://spark-master:7077") \
    .config("spark.executor.memory", "1g") \
    .config("spark.executor.cores", "1") \
    .config("spark.cores.max", "3") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin123") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .config("spark.jars.packages", 
            "org.apache.hadoop:hadoop-aws:3.3.4,"
            "com.amazonaws:aws-java-sdk-bundle:1.12.262,"
            "org.postgresql:postgresql:42.5.4") \
    .getOrCreate()

# Simple SQL function
def sql(query):
    """Execute Spark SQL and return as pandas DataFrame"""
    return spark.sql(query).toPandas()

# Register your parquet files as tables
df = spark.read.csv("s3a://dev/transactions/", header=True)
df.printSchema()
df.createOrReplaceTempView("transactions")

print("‚úì Setup complete. Use sql('SELECT * FROM customers') to query")

root
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- client_id: string (nullable = true)
 |-- card_id: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- merchant_id: string (nullable = true)
 |-- merchant_city: string (nullable = true)
 |-- merchant_state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- mcc: string (nullable = true)
 |-- errors: string (nullable = true)

‚úì Setup complete. Use sql('SELECT * FROM customers') to query


In [25]:
sql("select distinct mcc from transactions")

Unnamed: 0,mcc
0,8931
1,5732
2,6300
3,5045
4,5211
...,...
104,3006
105,7531
106,8111
107,5733
