In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.types import *

# Build (or reuse) the Spark session for the loader. The settings below
# configure the S3A filesystem connector to talk to the MinIO endpoint; no
# PostgreSQL settings are applied at session level.
# NOTE(review): the MinIO access/secret keys are hard-coded here; presumably
# acceptable for a local dev stack -- confirm before reusing elsewhere.
s3a_settings = {
    "spark.hadoop.fs.s3a.endpoint": "http://minio:9000",
    "spark.hadoop.fs.s3a.access.key": "minioadmin",
    "spark.hadoop.fs.s3a.secret.key": "minioadmin123",
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
}

session_builder = SparkSession.builder.appName("DevTransactionLoader")
for setting_name, setting_value in s3a_settings.items():
    # Settings are applied in the order they are listed above.
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

# Explicit schema for the transactions CSV, listed in column order as
# (name, Spark type, nullable). "date" and "amount" are read as plain strings
# on purpose: they are converted to timestamp / decimal after loading.
transaction_columns = [
    ("id", LongType(), False),
    ("date", StringType(), False),
    ("client_id", IntegerType(), False),
    ("card_id", IntegerType(), False),
    ("amount", StringType(), False),
    ("use_chip", StringType(), False),
    ("merchant_id", IntegerType(), False),
    ("merchant_city", StringType(), True),
    ("merchant_state", StringType(), True),
    ("zip", StringType(), True),
    ("mcc", IntegerType(), True),
    ("errors", StringType(), True),
]

schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in transaction_columns
])

# Load the transactions CSV from MinIO, normalise it, preview it, and append
# it to the PostgreSQL "transactions" table.
#
# The whole pipeline runs inside try/finally so that the Spark session is
# always stopped, including when the read, the preview or the JDBC write
# raises. Previously spark.stop() was only reached on the success path.
try:
    # Read CSV from MinIO using the explicit schema (first line is a header).
    print("Reading data from MinIO...")
    df = (
        spark.read
        .option("header", "true")
        .schema(schema)
        .csv("s3a://dev/transactions_data.csv")
    )

    # Transform data:
    #  - "amount": remove the "$" character, then cast to DECIMAL(10, 2)
    #  - "date":   cast the string to a timestamp
    # NOTE(review): "zip" is loaded as a string and passed through untouched;
    # the sample output shows values such as "58523.0" -- confirm whether the
    # target table expects that form.
    print("Transforming data...")
    df_transformed = (
        df
        .withColumn(
            "amount",
            regexp_replace(col("amount"), "\\$", "").cast(DecimalType(10, 2)),
        )
        .withColumn("date", col("date").cast(TimestampType()))
    )

    # Show sample data (first 5 rows, columns not truncated).
    print("Sample transformed data:")
    df_transformed.show(5, truncate=False)

    # Write to PostgreSQL over JDBC.
    # NOTE(review): mode("append") adds rows to the existing table, so running
    # this again will presumably insert the same rows a second time -- confirm
    # that is acceptable or that the table enforces uniqueness on "id".
    # NOTE(review): the database user/password are hard-coded; presumably a
    # dev-only setup -- confirm before reusing elsewhere.
    print("Writing to PostgreSQL...")
    (
        df_transformed.write
        .format("jdbc")
        .option("url", "jdbc:postgresql://postgres:5432/gold_db")
        .option("dbtable", "transactions")
        .option("user", "postgres")
        .option("password", "postgres")
        .option("driver", "org.postgresql.Driver")
        .mode("append")
        .save()
    )

    print("Data loaded successfully!")
finally:
    # Stop Spark session, whether the load succeeded or failed.
    spark.stop()

Reading data from MinIO...
Transforming data...
Sample transformed data:
+-------+-------------------+---------+-------+------+-----------------+-----------+-------------+--------------+-------+----+------+
|id     |date               |client_id|card_id|amount|use_chip         |merchant_id|merchant_city|merchant_state|zip    |mcc |errors|
+-------+-------------------+---------+-------+------+-----------------+-----------+-------------+--------------+-------+----+------+
|7475327|2010-01-01 00:01:00|1556     |2972   |-77.00|Swipe Transaction|59935      |Beulah       |ND            |58523.0|5499|NULL  |
|7475328|2010-01-01 00:02:00|561      |4575   |14.57 |Swipe Transaction|67570      |Bettendorf   |IA            |52722.0|5311|NULL  |
|7475329|2010-01-01 00:02:00|1129     |102    |80.00 |Swipe Transaction|27092      |Vista        |CA            |92084.0|4829|NULL  |
|7475331|2010-01-01 00:05:00|430      |2860   |200.00|Swipe Transaction|27092      |Crown Point  |IN            |46307.0|48