In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.types import *

# Initialize Spark session
import os  # local to this cell: only needed for the credential lookup below

# MinIO (S3A) credentials. Read from the environment when available so real
# secrets never have to be saved inside the notebook; the fallbacks are the
# local-dev values this notebook previously hard-coded, so behaviour is
# unchanged when the variables are not set.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin123")

# Extra JARs passed to spark.jars.packages as one comma-separated string:
# hadoop-aws + AWS SDK bundle (s3a:// support) and the PostgreSQL JDBC driver.
spark_packages = ",".join([
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
    "org.postgresql:postgresql:42.5.4",
])

spark = (
    SparkSession.builder
    .appName("DataLake - Medallion Architecture")
    .master("spark://spark-master:7077")
    # Resources: 1 GB and 1 core per executor, at most 3 cores overall.
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "3")
    # Adaptive query execution, including shuffle-partition coalescing.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    # S3A pointed at the MinIO endpoint: path-style URLs, plain HTTP.
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.jars.packages", spark_packages)
    .getOrCreate()
)

# Explicit schema for the transactions CSV, built one column at a time.
# Each .add() call is (column name, Spark type, nullable).
schema = (
    StructType()
    .add("id", LongType(), nullable=False)
    .add("date", StringType(), nullable=False)    # loaded as a string; meant to be converted to a timestamp later
    .add("client_id", IntegerType(), nullable=False)
    .add("card_id", IntegerType(), nullable=False)
    .add("amount", StringType(), nullable=False)  # loaded as a string first
    .add("use_chip", StringType(), nullable=False)
    .add("merchant_id", IntegerType(), nullable=False)
    .add("merchant_city", StringType(), nullable=True)
    .add("merchant_state", StringType(), nullable=True)
    .add("zip", StringType(), nullable=True)
    .add("mcc", IntegerType(), nullable=True)
    .add("errors", StringType(), nullable=True)
)

# Load transactions_data.csv (about 1.17GB) from the MinIO "dev" bucket using
# the explicit schema; the first line of the file is treated as a header.
print("Reading data from MinIO...")
csv_reader = spark.read.option("header", "true").schema(schema)
df = csv_reader.csv("s3a://dev/transactions_data.csv")

# Quick inspection: partition count, column types, and the first 5 rows
# shown without truncating long values.
partition_count = df.rdd.getNumPartitions()
print(f"Number of partitions: {partition_count}")
print("Schema:", df.dtypes)
df.show(n=5, truncate=False)

Reading data from MinIO...
Number of partitions: 10
Schema: [('id', 'bigint'), ('date', 'string'), ('client_id', 'int'), ('card_id', 'int'), ('amount', 'string'), ('use_chip', 'string'), ('merchant_id', 'int'), ('merchant_city', 'string'), ('merchant_state', 'string'), ('zip', 'string'), ('mcc', 'int'), ('errors', 'string')]
+-------+-------------------+---------+-------+-------+-----------------+-----------+-------------+--------------+-------+----+------+
|id     |date               |client_id|card_id|amount |use_chip         |merchant_id|merchant_city|merchant_state|zip    |mcc |errors|
+-------+-------------------+---------+-------+-------+-----------------+-----------+-------------+--------------+-------+----+------+
|7475327|2010-01-01 00:01:00|1556     |2972   |$-77.00|Swipe Transaction|59935      |Beulah       |ND            |58523.0|5499|NULL  |
|7475328|2010-01-01 00:02:00|561      |4575   |$14.57 |Swipe Transaction|67570      |Bettendorf   |IA            |52722.0|5311|NULL  

In [32]:
# Stop the current Spark session. Presumably this is done so that the next
# cell's builder settings are applied to a fresh session rather than to the
# one that is already running -- confirm.
spark.stop()

In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col, count
from pyspark.sql.types import *

# Initialize Spark session
import os  # local to this cell: only needed for the credential lookup below

# MinIO (S3A) credentials. Read from the environment when available so real
# secrets never have to be saved inside the notebook; the fallbacks are the
# local-dev values this notebook previously hard-coded, so behaviour is
# unchanged when the variables are not set.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin123")

# Extra JARs passed to spark.jars.packages as one comma-separated string:
# hadoop-aws + AWS SDK bundle (s3a:// support) and the PostgreSQL JDBC driver.
spark_packages = ",".join([
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
    "org.postgresql:postgresql:42.5.4",
])

spark = (
    SparkSession.builder
    .appName("DataLake - Medallion Architecture")
    .master("spark://spark-master:7077")
    # Resources: 1 GB and 1 core per executor, at most 3 cores overall.
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "3")
    # Adaptive query execution is switched OFF in this cell.
    # NOTE(review): coalescePartitions is an AQE sub-feature, so the second
    # setting should have no effect while AQE is disabled -- confirm intent.
    .config("spark.sql.adaptive.enabled", "false")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    # S3A pointed at the MinIO endpoint: path-style URLs, plain HTTP.
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.jars.packages", spark_packages)
    .getOrCreate()
)

# Explicit schema for the transactions CSV, built one column at a time.
# Each .add() call is (column name, Spark type, nullable).
schema = (
    StructType()
    .add("id", LongType(), nullable=False)
    .add("date", StringType(), nullable=False)    # loaded as a string; meant to be converted to a timestamp later
    .add("client_id", IntegerType(), nullable=False)
    .add("card_id", IntegerType(), nullable=False)
    .add("amount", StringType(), nullable=False)  # loaded as a string first
    .add("use_chip", StringType(), nullable=False)
    .add("merchant_id", IntegerType(), nullable=False)
    .add("merchant_city", StringType(), nullable=True)
    .add("merchant_state", StringType(), nullable=True)
    .add("zip", StringType(), nullable=True)
    .add("mcc", IntegerType(), nullable=True)
    .add("errors", StringType(), nullable=True)
)

# Read CSV from MinIO, file size transactions_data.csv = 1.17GB
print("Reading data from MinIO...")
df_src = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv("s3a://dev/transactions_data.csv")
)

# Keep only rows whose timestamp falls in 2014.
# The user-supplied schema names the timestamp column `date`; there is no
# `transaction_date` column, so filtering on that name cannot be resolved.
# Backticks keep `date` unambiguous as a column identifier.
df_flt = df_src.filter("year(`date`) = 2014")

# Count transactions per `mcc` value.
df_final = df_flt.groupBy('mcc').agg(
    count('id').alias('transaction_count')
)

# Write the aggregate to the bucket as Parquet, replacing any previous output.
(
    df_final.write
    .mode('overwrite')
    .parquet('s3a://dev/mcc_aggregated/')
)

print("Data is successfully loaded in s3")

Reading data from MinIO...
Data is successfully loaded in s3


In [33]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col, count
from pyspark.sql.types import *

# Initialize Spark session
import os  # local to this cell: only needed for the credential lookup below

# MinIO (S3A) credentials. Read from the environment when available so real
# secrets never have to be saved inside the notebook; the fallbacks are the
# local-dev values this notebook previously hard-coded, so behaviour is
# unchanged when the variables are not set.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin123")

# Extra JARs passed to spark.jars.packages as one comma-separated string:
# hadoop-aws + AWS SDK bundle (s3a:// support) and the PostgreSQL JDBC driver.
spark_packages = ",".join([
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
    "org.postgresql:postgresql:42.5.4",
])

spark = (
    SparkSession.builder
    .appName("DataLake - Medallion Architecture")
    .master("spark://spark-master:7077")
    # Resources: 1 GB and 1 core per executor, at most 3 cores overall.
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "3")
    # Adaptive query execution is switched OFF in this cell.
    # NOTE(review): coalescePartitions is an AQE sub-feature, so the second
    # setting should have no effect while AQE is disabled -- confirm intent.
    .config("spark.sql.adaptive.enabled", "false")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    # S3A pointed at the MinIO endpoint: path-style URLs, plain HTTP.
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.jars.packages", spark_packages)
    .getOrCreate()
)

# Explicit schema for the transactions CSV, built one column at a time.
# Each .add() call is (column name, Spark type, nullable).
schema = (
    StructType()
    .add("id", LongType(), nullable=False)
    .add("date", StringType(), nullable=False)    # loaded as a string; meant to be converted to a timestamp later
    .add("client_id", IntegerType(), nullable=False)
    .add("card_id", IntegerType(), nullable=False)
    .add("amount", StringType(), nullable=False)  # loaded as a string first
    .add("use_chip", StringType(), nullable=False)
    .add("merchant_id", IntegerType(), nullable=False)
    .add("merchant_city", StringType(), nullable=True)
    .add("merchant_state", StringType(), nullable=True)
    .add("zip", StringType(), nullable=True)
    .add("mcc", IntegerType(), nullable=True)
    .add("errors", StringType(), nullable=True)
)

# Load transactions_data.csv (about 1.17GB) from the MinIO "dev" bucket using
# the explicit schema; the first line of the file is treated as a header.
print("Reading data from MinIO...")
csv_reader = spark.read.option("header", "true").schema(schema)
df = csv_reader.csv("s3a://dev/transactions_data.csv")

# Expose the DataFrame to Spark SQL under the name "transactions".
df.createOrReplaceTempView("transactions")

# Per-mcc transaction counts over the whole file, expressed in SQL.
mcc_count_query = """
    SELECT 
        mcc,
        COUNT(id) AS transaction_count
    FROM transactions
    GROUP BY mcc
"""
df_final = spark.sql(mcc_count_query)

# Persist the aggregate as Parquet, replacing whatever is already at the target path.
parquet_writer = df_final.write.mode('overwrite')
parquet_writer.parquet('s3a://dev/mcc_aggregated/')

print("Data is successfully loaded in s3")

Reading data from MinIO...
Data is successfully loaded in s3
