Problem Statement:

Write a SQL query to find, for each month and country, the number of transactions and their total amount, as well as the number of approved transactions and their total amount.

In [0]:
from pyspark.sql.types import *
from datetime import datetime
from pyspark.sql.functions import date_format, col, count, sum as _sum

# Schema of the transactions table: (column name, Spark type) pairs,
# every column declared nullable.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("id", IntegerType()),
            ("country", StringType()),
            ("state", StringType()),
            ("amount", IntegerType()),
            ("trans_date", DateType()),
        )
    ]
)


def _to_date(text):
    """Parse a 'YYYY-MM-DD' string into a datetime.date for the DateType column."""
    return datetime.strptime(text, "%Y-%m-%d").date()


# Sample rows; the trailing date string is converted to a date object.
data = [
    (txn_id, country, state, amount, _to_date(day))
    for txn_id, country, state, amount, day in (
        (1, "US", "approved", 1000, "2023-12-18"),
        (2, "US", "declined", 2000, "2023-12-19"),
        (3, "US", "approved", 2000, "2024-01-01"),
        (4, "India", "approved", 2000, "2023-01-07"),
    )
]

# Build the DataFrame from the rows and the explicit schema, then render it.
transactions_df = spark.createDataFrame(data, schema)
transactions_df.display()

id,country,state,amount,trans_date
1,US,approved,1000,2023-12-18
2,US,declined,2000,2023-12-19
3,US,approved,2000,2024-01-01
4,India,approved,2000,2023-01-07


In [0]:
from pyspark.sql import functions as F

# Goal: one row per (month, country) with
#   cnt                - number of transactions
#   trans_total_amount - sum of all transaction amounts
#   approved           - number of approved transactions
#   amount             - sum of the approved transaction amounts

# Step 1: `cte` - totals over ALL transactions per month and country.
cte = transactions_df.groupBy(
    F.date_format("trans_date", "yyyy-MM").alias("month"), "country"
).agg(F.count("*").alias("cnt"), F.sum("amount").alias("trans_total_amount"))

# Step 2: `cte2` - totals over APPROVED transactions only, at the SAME grain
# (month, country). Grouping by `amount` instead of `country` would never sum
# the approved amounts and would emit one row per distinct amount.
cte2 = (
    transactions_df.filter(F.col("state") == "approved")
    .groupBy(
        F.date_format("trans_date", "yyyy-MM").alias("month2"),
        F.col("country").alias("country2"),
    )
    .agg(F.count("*").alias("approved"), F.sum("amount").alias("amount"))
)

# Step 3: LEFT JOIN on both grouping keys.
# - Joining on month alone would pair one country's totals with another
#   country's approved figures whenever they share a month.
# - A left join keeps month/country groups that have no approved
#   transactions; their approved count and amount are reported as 0.
# - eqNullSafe makes NULL keys match each other (`country` and `trans_date`
#   are nullable in the schema), which a plain `==` would not do.
result = (
    cte.alias("c1")
    .join(
        cte2.alias("c2"),
        F.col("c1.month").eqNullSafe(F.col("c2.month2"))
        & F.col("c1.country").eqNullSafe(F.col("c2.country2")),
        "left",
    )
    .select(
        "c1.*",
        F.coalesce(F.col("c2.approved"), F.lit(0)).alias("approved"),
        F.coalesce(F.col("c2.amount"), F.lit(0)).alias("amount"),
    )
)

# display the final result
result.display()

month,country,cnt,trans_total_amount,approved,amount
2023-12,US,2,3000,1,1000
2024-01,US,1,2000,1,2000
2023-01,India,1,2000,1,2000


In [0]:
# Register the DataFrame as a temporary view named "transactions" so that it
# can be referenced by name from Spark SQL (spark.sql) in the cells below.
transactions_df.createOrReplaceTempView("transactions")

In [0]:
# Spark SQL version of the same report: one row per (month, country) with the
# transaction count/total and the approved count/total.
#
# - cte  : totals over all transactions per month and country.
# - cte2 : totals over approved transactions only, at the same grain
#          (month, country), with the approved amounts summed.
# - The LEFT JOIN matches on BOTH month and country so countries sharing a
#   month are not cross-paired, and groups without any approved transaction
#   are kept with 0 instead of being dropped.
# - `<=>` is Spark SQL's null-safe equality, so NULL keys still match.
query = """
WITH cte AS (
    SELECT
        date_format(trans_date, 'yyyy-MM') AS month,
        country,
        COUNT(*) AS cnt,
        SUM(amount) AS trans_total_amount
    FROM transactions
    GROUP BY date_format(trans_date, 'yyyy-MM'), country
),
cte2 AS (
    SELECT
        date_format(trans_date, 'yyyy-MM') AS month2,
        country AS country2,
        COUNT(*) AS approved,
        SUM(amount) AS amount
    FROM transactions
    WHERE state = 'approved'
    GROUP BY date_format(trans_date, 'yyyy-MM'), country
)
SELECT
    c1.*,
    COALESCE(c2.approved, 0) AS approved,
    COALESCE(c2.amount, 0) AS amount
FROM cte c1
LEFT JOIN cte2 c2
    ON c1.month <=> c2.month2
   AND c1.country <=> c2.country2
"""

# Execute the query
result_df = spark.sql(query)

# display the result
result_df.display()

month,country,cnt,trans_total_amount,approved,amount
2023-12,US,2,3000,1,1000
2024-01,US,1,2000,1,2000
2023-01,India,1,2000,1,2000


Explanation:

WITH CTEs:

cte: 

Aggregates transactions by month (date_format(trans_date, 'yyyy-MM')) and country, calculating cnt (count of transactions) and trans_total_amount (sum of amounts).

cte2: 

Filters only approved transactions, grouping by amount and month.
Join Operation:

The final query joins the results of cte and cte2 on the month field.

Functions Used:

date_format: Converts the trans_date into a yyyy-MM string format.
count and sum: Aggregate functions for counting rows and summing amounts (F.count / F.sum in the DataFrame version, COUNT / SUM in the SQL version).

Execution:

The SQL query is run using spark.sql() after registering the DataFrame as a temporary SQL table.