In [1]:
import os

from pyspark.sql import SparkSession

# Build (or reuse) the Spark session shared by every cell in this notebook.
#
# Object-store connection settings are read from the environment so that
# credentials do not have to live in the notebook. Each one falls back to the
# value that used to be hard-coded here, so an unconfigured environment behaves
# exactly as before. Set MINIO_ENDPOINT / MINIO_ACCESS_KEY / MINIO_SECRET_KEY to
# override them, and drop the fallbacks before sharing this notebook more widely.
s3a_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
s3a_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
s3a_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin123")

# Extra JARs resolved at session start-up: the S3A connector, the AWS SDK bundle
# it depends on, and the PostgreSQL JDBC driver.
spark_jar_packages = ",".join(
    [
        "org.apache.hadoop:hadoop-aws:3.3.4",
        "com.amazonaws:aws-java-sdk-bundle:1.12.262",
        "org.postgresql:postgresql:42.5.4",
    ]
)

spark_session = (
    SparkSession.builder
    .appName("app_silver_layer_data_viewer")
    .master("spark://spark-master:7077")
    # Resource limits for this viewer application.
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "1")
    # Adaptive query execution.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    # S3A access: path-style addressing over plain HTTP with static keys.
    .config("spark.hadoop.fs.s3a.endpoint", s3a_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", s3a_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3a_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    .config("spark.jars.packages", spark_jar_packages)
    .getOrCreate()
)

# Simple SQL function
def sql(query, num_rows=10, truncate=False):
    """Run a Spark SQL query and print the first rows of its result.

    Args:
        query: SQL text passed to ``spark_session.sql``.
        num_rows: Maximum number of rows to print. Defaults to 10.
        truncate: Forwarded to ``DataFrame.show``. The default ``False``
            prints full column values instead of shortening long strings.

    Returns:
        None. ``DataFrame.show`` writes the table to stdout and returns
        nothing, so this helper is for display only; use
        ``spark_session.sql(query)`` directly when the DataFrame is needed.
    """
    spark_session.sql(query).show(num_rows, truncate)

# Usage hint only: the table name in the message is an example. The cells
# visible in this notebook register the views `transactions`, `cards`,
# `mcc_codes` and `users`; none of them creates a `customers` view.
print("✓ Setup complete. Use sql('SELECT * FROM customers') to query")

✓ Setup complete. Use sql('SELECT * FROM customers') to query


In [2]:
print("Viewing transactions data from silver layer")
# Parquet files embed their own schema, so the CSV-style `header` option that
# used to be passed here had no meaning for this reader and has been removed.
df = spark_session.read.parquet("s3a://silver/transactions/")
df.printSchema()
# Register the DataFrame as a temp view so it can be queried through sql().
df.createOrReplaceTempView("transactions")

sql("SELECT * FROM transactions limit 10")

Viewing transactions data from silver layer
root
 |-- id: long (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- client_id: integer (nullable = true)
 |-- card_id: integer (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- merchant_id: integer (nullable = true)
 |-- merchant_city: string (nullable = true)
 |-- merchant_state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- mcc: integer (nullable = true)
 |-- errors: string (nullable = true)

+-------+-------------------+---------+-------+------+------------------+-----------+-------------+--------------+-------+----+------+
|id     |date               |client_id|card_id|amount|use_chip          |merchant_id|merchant_city|merchant_state|zip    |mcc |errors|
+-------+-------------------+---------+-------+------+------------------+-----------+-------------+--------------+-------+----+------+
|7475327|2010-01-01 00:01:00|1556     |2972   |-77.00|Swipe Tra

In [11]:
print("Viewing cards data from silver layer")
# Parquet files embed their own schema, so the CSV-style `header` option that
# used to be passed here had no meaning for this reader and has been removed.
df = spark_session.read.parquet("s3a://silver/cards/")
df.printSchema()
# Register the DataFrame as a temp view so it can be queried through sql().
df.createOrReplaceTempView("cards")

sql("SELECT * FROM cards limit 10")

Viewing cards data from silver layer
root
 |-- id: integer (nullable = true)
 |-- client_id: integer (nullable = true)
 |-- card_brand: string (nullable = true)
 |-- card_type: string (nullable = true)
 |-- card_number: string (nullable = true)
 |-- expires: date (nullable = true)
 |-- cvv: integer (nullable = true)
 |-- has_chip: string (nullable = true)
 |-- num_cards_issued: integer (nullable = true)
 |-- credit_limit: decimal(10,2) (nullable = true)
 |-- acct_open_date: date (nullable = true)
 |-- year_pin_last_changed: integer (nullable = true)
 |-- card_on_dark_web: string (nullable = true)

+----+---------+----------+---------------+----------------+----------+---+--------+----------------+------------+--------------+---------------------+----------------+
|id  |client_id|card_brand|card_type      |card_number     |expires   |cvv|has_chip|num_cards_issued|credit_limit|acct_open_date|year_pin_last_changed|card_on_dark_web|
+----+---------+----------+---------------+--------------

In [12]:
print("Viewing mcc codes data from silver layer")
# Parquet files embed their own schema, so the CSV-style `header` option that
# used to be passed here had no meaning for this reader and has been removed.
df = spark_session.read.parquet("s3a://silver/mcc_codes/")
df.printSchema()
# Register the DataFrame as a temp view so it can be queried through sql().
df.createOrReplaceTempView("mcc_codes")

sql("SELECT * FROM mcc_codes limit 10")

Viewing mcc codes data from silver layer
root
 |-- mcc_code: integer (nullable = true)
 |-- description: string (nullable = true)

+--------+------------------------------------------+
|mcc_code|description                               |
+--------+------------------------------------------+
|5812    |Eating Places and Restaurants             |
|5541    |Service Stations                          |
|7996    |Amusement Parks, Carnivals, Circuses      |
|5411    |Grocery Stores, Supermarkets              |
|4784    |Tolls and Bridge Fees                     |
|4900    |Utilities - Electric, Gas, Water, Sanitary|
|5942    |Book Stores                               |
|5814    |Fast Food Restaurants                     |
|4829    |Money Transfer                            |
|5311    |Department Stores                         |
+--------+------------------------------------------+



In [13]:
print("Viewing users data from silver layer")
# Parquet files embed their own schema, so the CSV-style `header` option that
# used to be passed here had no meaning for this reader and has been removed.
df = spark_session.read.parquet("s3a://silver/users/")
df.printSchema()
# Register the DataFrame as a temp view so it can be queried through sql().
df.createOrReplaceTempView("users")

sql("SELECT * FROM users limit 10")

Viewing users data from silver layer
root
 |-- id: integer (nullable = true)
 |-- current_age: integer (nullable = true)
 |-- retirement_age: integer (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- birth_month: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- address: string (nullable = true)
 |-- latitude: decimal(10,6) (nullable = true)
 |-- longitude: decimal(10,6) (nullable = true)
 |-- per_capita_income: decimal(12,2) (nullable = true)
 |-- yearly_income: decimal(12,2) (nullable = true)
 |-- total_debt: decimal(12,2) (nullable = true)
 |-- credit_score: integer (nullable = true)
 |-- num_credit_cards: integer (nullable = true)

+----+-----------+--------------+----------+-----------+------+------------------------+---------+-----------+-----------------+-------------+----------+------------+----------------+
|id  |current_age|retirement_age|birth_year|birth_month|gender|address                 |latitude |longitude  |per_capita_income|yearly_in

In [14]:
# Shut the Spark session down once viewing is finished. After this call the
# earlier cells cannot be re-run until the setup cell has created a new session.
spark_session.stop()