In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Initialize Spark session.
# getOrCreate() returns the session that is already active (for example the
# one a managed notebook injects) and only builds a new one when none exists,
# so this cell does not depend on an ambient `spark` global being present.
spark = SparkSession.builder.getOrCreate()

# Define schema: one row per customer. Each transaction is a string -> string
# map, so transaction ids, dates and amounts are all stored as text.
schema = StructType(
    [
        StructField("customer_id", IntegerType(), True),
        StructField(
            "transactions", ArrayType(MapType(StringType(), StringType())), True
        ),
    ]
)

# Sample data: (customer_id, [transaction maps]) tuples matching the schema
# above. Dates use ISO-8601 (YYYY-MM-DD) text.
data = [
    (
        1,
        [
            {
                "transaction_id": "101",
                "transaction_date": "2024-01-10",
                "transaction_amount": "50.0",
            },
            {
                "transaction_id": "102",
                "transaction_date": "2024-02-15",
                "transaction_amount": "100.0",
            },
        ],
    ),
    (
        2,
        [
            {
                "transaction_id": "103",
                "transaction_date": "2024-03-01",
                "transaction_amount": "75.0",
            }
        ],
    ),
    (
        3,
        [
            {
                "transaction_id": "104",
                "transaction_date": "2023-12-20",
                "transaction_amount": "120.0",
            },
            {
                "transaction_id": "105",
                "transaction_date": "2024-01-25",
                "transaction_amount": "200.0",
            },
            {
                "transaction_id": "106",
                "transaction_date": "2024-02-10",
                "transaction_amount": "90.0",
            },
        ],
    ),
]

# Create DataFrame with the explicit schema (no type inference on the maps).
df = spark.createDataFrame(data, schema=schema)

# Render the DataFrame as a table.
# NOTE(review): DataFrame.display() looks like the Databricks notebook helper
# rather than an open-source PySpark API — confirm the target runtime.
df.display()

customer_id,transactions
1,"List(Map(transaction_date -> 2024-01-10, transaction_id -> 101, transaction_amount -> 50.0), Map(transaction_date -> 2024-02-15, transaction_id -> 102, transaction_amount -> 100.0))"
2,"List(Map(transaction_date -> 2024-03-01, transaction_id -> 103, transaction_amount -> 75.0))"
3,"List(Map(transaction_date -> 2023-12-20, transaction_id -> 104, transaction_amount -> 120.0), Map(transaction_date -> 2024-01-25, transaction_id -> 105, transaction_amount -> 200.0), Map(transaction_date -> 2024-02-10, transaction_id -> 106, transaction_amount -> 90.0))"


In [0]:
from pyspark.sql.functions import *

# Unnest the transactions array: explode() emits one row per array element,
# placing each transaction map in the new "transaction" column.
exploded_df = df.withColumn("transaction", F.explode("transactions"))

# Per-customer summary. Functions are referenced through the `F` namespace so
# that sum/max/count are unambiguously the Spark column functions and not the
# Python builtins of the same name.
result_df = (
    exploded_df.select(
        F.col("customer_id"),
        # Amounts are stored as text in the map; cast explicitly to double so
        # the sum does not depend on implicit string-to-number coercion.
        F.col("transaction.transaction_amount")
        .cast("double")
        .alias("transaction_amount"),
        F.col("transaction.transaction_date").alias("transaction_date"),
    )
    .groupBy("customer_id")
    .agg(
        # count() of a column skips nulls, so a transaction whose map has no
        # "transaction_date" key is not counted.
        F.count("transaction_date").alias("total_transactions"),
        F.sum("transaction_amount").alias("total_spend"),
        # The date is still a string here: max() compares lexicographically,
        # which matches chronological order only for ISO-8601 (YYYY-MM-DD) text.
        F.max("transaction_date").alias("most_recent_transaction"),
    )
    .orderBy("customer_id")
)

result_df.display()

customer_id,total_transactions,total_spend,most_recent_transaction
1,2,150.0,2024-02-15
2,1,75.0,2024-03-01
3,3,410.0,2024-02-10


In [0]:
# Register the exploded DataFrame (one row per customer transaction, with the
# transaction map in the "transaction" column) as a temporary view so it can be
# queried with Spark SQL.
exploded_df.createOrReplaceTempView("transactions_view")

# Use Spark SQL to perform the same per-customer aggregation.
# The amount is cast to DOUBLE rather than FLOAT: FLOAT is 32-bit, so each
# amount would be rounded to single precision before being summed, and values
# such as 50.1 would surface with visible rounding artifacts in total_spend.
# MAX over the date string is lexicographic, which is chronological only for
# ISO-8601 (YYYY-MM-DD) text.
result_df = spark.sql("""
    SELECT
        customer_id,
        COUNT(transaction.transaction_date) AS total_transactions,
        SUM(CAST(transaction.transaction_amount AS DOUBLE)) AS total_spend,
        MAX(transaction.transaction_date) AS most_recent_transaction
    FROM transactions_view
    GROUP BY customer_id
    ORDER BY customer_id
""")

# Show the result
result_df.display()

customer_id,total_transactions,total_spend,most_recent_transaction
1,2,150.0,2024-02-15
2,1,75.0,2024-03-01
3,3,410.0,2024-02-10
