#PROBLEM STATEMENT
Write a query to calculate the year-on-year growth rate for the total spend of each product.


In [0]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType, StructField, IntegerType, DateType
from pyspark.sql.functions import year, sum, lag, col, when, round, month
from pyspark.sql.functions import concat_ws
from datetime import datetime


# Initialize a Spark session
spark = SparkSession.builder.appName("user_transactions").getOrCreate()

# Define the schema
schema = "transaction_id INT, product_id INT, spend DOUBLE, transaction_date DATE"

# Create a DataFrame with the sample data
data = [
    (1341, 123424, 1500.60, datetime.strptime("2019-12-31", "%Y-%m-%d").date()),
    (1423, 123424, 1000.20, datetime.strptime("2020-11-30", "%Y-%m-%d").date()),
    (1623, 123424, 1246.44, datetime.strptime("2021-10-31", "%Y-%m-%d").date()),
    (1322, 123424, 2145.32, datetime.strptime("2022-09-30", "%Y-%m-%d").date()),
]

# Create the DataFrame
df = spark.createDataFrame(data, schema=schema)

# Show the DataFrame
df.display()

transaction_id,product_id,spend,transaction_date
1341,123424,1500.6,2019-12-31
1423,123424,1000.2,2020-11-30
1623,123424,1246.44,2021-10-31
1322,123424,2145.32,2022-09-30


###Year-wise-Year Growth

In [0]:
df = df.withColumn("transaction_year", year("transaction_date"))

# Calculate total spend per year
total_spend_per_year = df.groupBy("transaction_year").agg(
    sum("spend").alias("total_spend")
)

# Define a window specification for calculating the lag
windowSpec = Window.orderBy("transaction_year")

# Calculate the previous year's spend using the lag function
total_spend_per_year = total_spend_per_year.withColumn(
    "previous_year_spend", lag("total_spend").over(windowSpec)
)

# Calculate YoY growth percentage and round to two decimal places
total_spend_per_year = total_spend_per_year.withColumn(
    "yoy_growth",
    round(
        (col("total_spend") - col("previous_year_spend"))
        / col("previous_year_spend")
        * 100,
        2
    )
)

# Display the results
total_spend_per_year.orderBy("transaction_year").display()

transaction_year,total_spend,previous_year_spend,yoy_growth
2019,1500.6,,
2020,1000.2,1500.6,-33.35
2021,1246.44,1000.2,24.62
2022,2145.32,1246.44,72.12


###Month-wise-Month Growth

In [0]:
# Assuming df is already defined and has a column "transaction_date" and "spend"
# Extract year and month from the transaction_date
df = df.withColumn(
    "year_month",
    concat_ws("-", year(col("transaction_date")), month(col("transaction_date")))
)

# Aggregate the data to calculate total spend for each month
monthly_spend = df.groupBy("year_month").agg(sum("spend").alias("total_spend"))

# Define a window specification to calculate the lagged spend (previous month's spend)
window_spec = Window.orderBy("year_month")

# Calculate the previous month's spend
monthly_spend = monthly_spend.withColumn(
    "prev_month_spend", lag("total_spend").over(window_spec)
)

# Calculate the MoM growth rate and round to two decimal places
monthly_spend = monthly_spend.withColumn(
    "mom_growth",
    round(
        (col("total_spend") - col("prev_month_spend")) / col("prev_month_spend") * 100,
        2
    )
)

# Show the results
monthly_spend.orderBy("year_month").display()

year_month,total_spend,prev_month_spend,mom_growth
2019-12,1500.6,,
2020-11,1000.2,1500.6,-33.35
2021-10,1246.44,1000.2,24.62
2022-9,2145.32,1246.44,72.12


###Week-wise-Week growth 

In [0]:
windowSpec = Window.orderBy("transaction_date")

# Add a column for the spend of the previous week using the lag function
df = df.withColumn("spend_previous_week", lag("spend").over(windowSpec))

# Calculate week-over-week growth percentage and round to two decimal places
df = df.withColumn(
    "week_over_week_growth",
    when(col("spend_previous_week").isNull(), 0).otherwise(
        round(
            (col("spend") - col("spend_previous_week")) / col("spend_previous_week") * 100,
            2
        )
    )
)

# Select relevant columns for output
df_result = df.select(
    "transaction_date", "spend", "spend_previous_week", "week_over_week_growth"
)

# Show the resulting DataFrame
df_result.display()

transaction_date,spend,spend_previous_week,week_over_week_growth
2019-12-31,1500.6,,0.0
2020-11-30,1000.2,1500.6,-33.35
2021-10-31,1246.44,1000.2,24.62
2022-09-30,2145.32,1246.44,72.12


#Spark SQL


In [0]:
df.createOrReplaceTempView("user_transactions ")

###Year-wise-Year Growth

In [0]:
%sql
SELECT
  YEAR(transaction_date) AS year,
  SUM(spend) AS total_spend,
  LAG(SUM(spend), 1) OVER (
    ORDER BY
      YEAR(transaction_date)
  ) AS previous_year_spend,
  ROUND(
    (
      SUM(spend) - LAG(SUM(spend), 1) OVER (
        ORDER BY
          YEAR(transaction_date)
      )
    ) / NULLIF(LAG(SUM(spend), 1) OVER (
        ORDER BY
          YEAR(transaction_date)
      ), 0) * 100, 2
  ) AS yoy_growth_percentage
FROM
  user_transactions
GROUP BY
  YEAR
ORDER BY
  YEAR;

year,total_spend,previous_year_spend,yoy_growth_percentage
2019,1500.6,,
2020,1000.2,1500.6,-33.35
2021,1246.44,1000.2,24.62
2022,2145.32,1246.44,72.12


###Month-wise-Month Growth

In [0]:
%sql
SELECT
  DATE_FORMAT(transaction_date, 'yyyy-MM') AS year_month,
  SUM(spend) AS total_spend,
  LAG(SUM(spend), 1) OVER (
    ORDER BY
      DATE_FORMAT(transaction_date, 'yyyy-MM')
  ) AS previous_month_spend,
  ROUND(
    (
      (SUM(spend) - LAG(SUM(spend), 1) OVER (
        ORDER BY
          DATE_FORMAT(transaction_date, 'yyyy-MM')
      )) / NULLIF(LAG(SUM(spend), 1) OVER (
        ORDER BY
          DATE_FORMAT(transaction_date, 'yyyy-MM')
      ), 0)
    ) * 100,
    2
  ) AS mom_growth_percentage
FROM
  user_transactions
GROUP BY
  DATE_FORMAT(transaction_date, 'yyyy-MM')
ORDER BY
  year_month;



year_month,total_spend,previous_month_spend,mom_growth_percentage
2019-12,1500.6,,
2020-11,1000.2,1500.6,-33.35
2021-10,1246.44,1000.2,24.62
2022-09,2145.32,1246.44,72.12


###Week-wise-Week Growth

In [0]:
%sql
SELECT 
  transaction_date,
  spend,
  LAG(spend, 1) OVER (ORDER BY transaction_date) AS previous_week_spend,
  ROUND((spend - LAG(spend, 1) OVER (ORDER BY transaction_date)) / NULLIF(LAG(spend, 1) OVER (ORDER BY transaction_date), 0) * 100, 2) AS wow_growth_percentage
FROM 
  user_transactions
ORDER BY 
  transaction_date;

transaction_date,spend,previous_week_spend,wow_growth_percentage
2019-12-31,1500.6,,
2020-11-30,1000.2,1500.6,-33.35
2021-10-31,1246.44,1000.2,24.62
2022-09-30,2145.32,1246.44,72.12
