In [7]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, to_date, concat, lit

# Initialize the Spark session used by every step below.
#
# Connection settings for the MinIO (S3-compatible) object store are read from
# the environment so credentials do not have to live in the notebook. The
# fallbacks are the values this notebook previously hard-coded, so it runs
# unchanged when the variables are unset.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin123")

spark = (
    SparkSession.builder
    .appName("DataLake - Medallion Architecture")
    .master("spark://spark-master:7077")
    # Executor sizing: 1 GB and 1 core per executor, 3 cores max for the app.
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "3")
    # Adaptive query execution, including coalescing of shuffle partitions.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    # S3A filesystem pointed at MinIO: path-style addressing, SSL disabled,
    # and static access/secret keys supplied through the configuration.
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    # Extra JARs as Maven coordinates: the S3A connector, the AWS SDK bundle,
    # and the PostgreSQL JDBC driver needed by the final write step.
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.3.4,"
        "com.amazonaws:aws-java-sdk-bundle:1.12.262,"
        "org.postgresql:postgresql:42.5.4",
    )
    .getOrCreate()
)

# Load the raw cards CSV from the "dev" bucket on MinIO.
# The first line is treated as a header and schema inference is switched off,
# so every column is read as a string.
cards_csv_path = "s3a://dev/cards_data.csv"
csv_reader = spark.read.option("header", "true").option("inferSchema", "false")
df = csv_reader.csv(cards_csv_path)

# Print the untyped schema and the first five rows (untruncated) as a check.
print("Original Schema:")
df.printSchema()
df.show(5, truncate=False)

# Convert the raw string columns into typed columns.
#
# NOTE(review): "cvv" is cast to integer here, which drops any leading zeros
# (e.g. "057" becomes 57) -- confirm that this is acceptable for consumers of
# the target table before relying on the stored value.
integer_columns = (
    "id",
    "client_id",
    "cvv",
    "num_cards_issued",
    "year_pin_last_changed",
)

transformed_df = df
for column_name in integer_columns:
    transformed_df = transformed_df.withColumn(
        column_name, col(column_name).cast("integer")
    )

# credit_limit: remove the literal "$" and cast to a fixed-point decimal.
credit_limit_amount = regexp_replace(col("credit_limit"), "\\$", "").cast("decimal(10,2)")
# acct_open_date: parsed directly with a month/year pattern.
acct_open_date_parsed = to_date(col("acct_open_date"), "MM/yyyy")
# expires: prefixed with "01/" so it can be parsed as day/month/year.
expires_parsed = to_date(concat(lit("01/"), col("expires")), "dd/MM/yyyy")

transformed_df = (
    transformed_df
    .withColumn("credit_limit", credit_limit_amount)
    .withColumn("acct_open_date", acct_open_date_parsed)
    .withColumn("expires", expires_parsed)
)

# Print the typed schema and the first five rows (untruncated) as a check.
print("\nTransformed Schema:")
transformed_df.printSchema()
transformed_df.show(5, truncate=False)

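# PostgreSQL connection properties
# NOTE: the commented-out block below is a disabled alternative that calls
# DataFrameWriter.jdbc() against localhost; the active write further down uses
# the option-based JDBC writer against the "postgres" host instead.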
# postgres_properties = {
#     "user": "postgres",
#     "password": "postgres",
#     "driver": "org.postgresql.Driver"
# }

# postgres_url = "jdbc:postgresql://localhost:5432/gold_db"

# # Write to PostgreSQL
# transformed_df.write \
#     .jdbc(url=postgres_url, 
#           table="cards", 
#           mode="append",
#           properties=postgres_properties)

# Append the typed cards data to the "cards" table in PostgreSQL over JDBC.
#
# Connection settings are read from the environment so credentials do not
# have to live in the notebook. The fallbacks are the values this notebook
# previously hard-coded, so it runs unchanged when the variables are unset.
postgres_jdbc_url = os.environ.get(
    "POSTGRES_JDBC_URL", "jdbc:postgresql://postgres:5432/gold_db"
)
postgres_user = os.environ.get("POSTGRES_USER", "postgres")
postgres_password = os.environ.get("POSTGRES_PASSWORD", "postgres")

try:
    (
        transformed_df.write
        .format("jdbc")
        .option("url", postgres_jdbc_url)
        .option("dbtable", "cards")
        .option("user", postgres_user)
        .option("password", postgres_password)
        .option("driver", "org.postgresql.Driver")
        # "append" adds rows to the existing table; re-running this cell
        # therefore inserts the same rows again.
        .mode("append")
        .save()
    )
    print("Data successfully loaded to PostgreSQL with proper types")
finally:
    # Stop the session even when the write raises, so the application does
    # not keep its executors allocated on the cluster after a failure.
    spark.stop()

Original Schema:
root
 |-- id: string (nullable = true)
 |-- client_id: string (nullable = true)
 |-- card_brand: string (nullable = true)
 |-- card_type: string (nullable = true)
 |-- card_number: string (nullable = true)
 |-- expires: string (nullable = true)
 |-- cvv: string (nullable = true)
 |-- has_chip: string (nullable = true)
 |-- num_cards_issued: string (nullable = true)
 |-- credit_limit: string (nullable = true)
 |-- acct_open_date: string (nullable = true)
 |-- year_pin_last_changed: string (nullable = true)
 |-- card_on_dark_web: string (nullable = true)

+----+---------+----------+---------------+----------------+-------+---+--------+----------------+------------+--------------+---------------------+----------------+
|id  |client_id|card_brand|card_type      |card_number     |expires|cvv|has_chip|num_cards_issued|credit_limit|acct_open_date|year_pin_last_changed|card_on_dark_web|
+----+---------+----------+---------------+----------------+-------+---+--------+----------