In [None]:
# Cell 1: Setup & Configuration

import os
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, expr

# Initialize Spark with MinIO/S3 capabilities.
# The S3A endpoint and credentials are taken from the environment so that real
# secrets never have to be committed with the notebook. The fallbacks are the
# values this notebook previously hardcoded (local development stack only).
s3a_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
s3a_access_key = os.environ.get("MINIO_ACCESS_KEY", "admin")
s3a_secret_key = os.environ.get("MINIO_SECRET_KEY", "password")

spark = (
    SparkSession.builder
    .appName("Jupyter_Data_Generator")
    .master("spark://spark-master:7077")
    .config("spark.hadoop.fs.s3a.endpoint", s3a_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", s3a_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3a_secret_key)
    # Path-style access (http://host/bucket/key) instead of virtual-host-style
    # bucket addressing.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

print(f"Spark Version: {spark.version}")

In [2]:
# Cell 2: Data Generation & Write

# Configs
ROW_COUNT = 10_000_000  # 10 Million rows
OUTPUT_PATH = "s3a://test-bucket/sales_data_skewed"
OUTPUT_FILE_COUNT = 500  # deliberately high: the table is written as many small files
RANDOM_SEED = 42  # pinned so a re-run regenerates the same data (for the same partitioning)

print(f"ðŸš€ GENERATING {ROW_COUNT:,} ROWS...")
start = time.time()

# 1. Generate Base Data
df = spark.range(0, ROW_COUNT)

# 2. Add Skew (70% USA, 10% IND, 10% UK, 10% Other)
# This creates a "Hot Key" that will choke one executor during a join.
#
# The country is picked from ONE random draw per row. Calling rand() separately
# in every WHEN branch makes each comparison an independent draw, so the
# thresholds stop being cumulative (that yields roughly 70% / 24% / 5.4% / 0.6%
# instead of 70% / 10% / 10% / 10%).
# The helper column is not part of the final select, so it is never written out.
df_skew = (
    df
    # Separate seed from "amount" so the two columns are not correlated.
    .withColumn("country_draw", rand(RANDOM_SEED + 1))
    .select(
        col("id").alias("transaction_id"),
        (rand(RANDOM_SEED) * 1000).cast("int").alias("amount"),
        expr(
            "CASE "
            "WHEN country_draw < 0.7 THEN 'USA' "
            "WHEN country_draw < 0.8 THEN 'IND' "
            "WHEN country_draw < 0.9 THEN 'UK' "
            "ELSE 'Other' END"
        ).alias("country_code"),
    )
)

# 3. Write to MinIO (Simulate Small Files)
# repartition(OUTPUT_FILE_COUNT) forces Spark to create that many tiny files.
# This simulates a "fragmented" data lake.
print(f"ðŸ’¾ Writing to MinIO: {OUTPUT_PATH} (Partitioned into {OUTPUT_FILE_COUNT} files)")

(
    df_skew
    .repartition(OUTPUT_FILE_COUNT)
    .write
    .format("delta")
    .mode("overwrite")  # replaces any table already at OUTPUT_PATH
    .save(OUTPUT_PATH)
)

print(f"âœ… DONE! Time taken: {time.time() - start:.2f} seconds")

ðŸš€ GENERATING 10,000,000 ROWS...
ðŸ’¾ Writing to MinIO: s3a://test-bucket/sales_data_skewed (Partitioned into 500 files)


25/11/29 21:44:44 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/11/29 21:45:04 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

âœ… DONE! Time taken: 26.99 seconds


                                                                                