In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DateType, IntegerType
from datetime import datetime
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag, expr, when

# Obtain the notebook's Spark session, creating one if none is active yet.
spark = (
    SparkSession.builder
    .appName("Create salesvar_tbl Table")
    .getOrCreate()
)

# Layout of salesvar_tbl: a nullable date column plus a nullable integer column.
schema = (
    StructType()
    .add(StructField("dt", DateType(), True))
    .add(StructField("sales", IntegerType(), True))
)

# Sample rows for salesvar_tbl: one (dt, sales) pair per consecutive day.
# The date strings are parsed here so every tuple carries a datetime value.
data = [
    (datetime.strptime(day, '%Y-%m-%d'), amount)
    for day, amount in (
        ('2023-10-03', 10),
        ('2023-10-04', 20),
        ('2023-10-05', 60),
        ('2023-10-06', 50),
        ('2023-10-07', 10),
    )
]

# Build the DataFrame from the sample rows using the explicit schema.
df = spark.createDataFrame(data, schema)

# Render the DataFrame contents in the notebook.
df.display()


dt,sales
2023-10-03,10
2023-10-04,20
2023-10-05,60
2023-10-06,50
2023-10-07,10


In [0]:
# Expose the DataFrame to Spark SQL under the name salesvar_tbl; any temp view
# already registered under that name is replaced.
df.createOrReplaceTempView("salesvar_tbl")

# Sanity check (optional): read every row back through SQL and render it.
(
    spark
    .sql("SELECT * FROM salesvar_tbl")
    .display()
)

dt,sales
2023-10-03,10
2023-10-04,20
2023-10-05,60
2023-10-06,50
2023-10-07,10


In [0]:
# One unpartitioned window over the whole table, walked in date order.
window_spec = Window.orderBy("dt")

# "CTE" step: attach the preceding row's sales to each row. The default of 0
# is used for the earliest date, which has no preceding row.
cte = df.withColumn(
    "prev_sales",
    lag("sales", offset=1, default=0).over(window_spec),
)

# Keep only the days whose sales exceed the previous day's, then attach the
# percentage variance. when() has no otherwise(), so %var is null whenever
# prev_sales is 0, which sidesteps a division by zero.
# NOTE(review): because of the 0 default, the earliest date passes the filter
# when its sales are positive and is reported with a null %var -- confirm that
# including it is intended.
result = (
    cte
    .where(col("sales") > col("prev_sales"))
    .withColumn(
        "%var",
        when(
            col("prev_sales") != 0,
            ((col("sales") - col("prev_sales")) / col("prev_sales")) * 100,
        ),
    )
)

# Print only the reporting columns as a text table.
result.select("dt", "sales", "%var").show()

+----------+-----+-----+
|        dt|sales| %var|
+----------+-----+-----+
|2023-10-03|   10| null|
|2023-10-04|   20|100.0|
|2023-10-05|   60|200.0|
+----------+-----+-----+



In [0]:
# Spark SQL query against the salesvar_tbl temp view:
#   * the CTE pairs each row with the previous row's sales in dt order,
#     substituting 0 when there is no previous row;
#   * the outer SELECT keeps rows whose sales exceed prev_sales and reports the
#     percentage change as `%var`, yielding NULL when prev_sales is 0 so the
#     division is never attempted with a zero denominator.
query = """
WITH cte AS (
    SELECT 
        dt, 
        sales,
        LAG(sales, 1, 0) OVER (ORDER BY dt) AS prev_sales
    FROM salesvar_tbl
)
SELECT 
    dt, 
    sales, 
    (CASE WHEN prev_sales != 0 THEN ((sales - prev_sales) / prev_sales) * 100 ELSE NULL END) AS `%var`
FROM cte
WHERE sales > prev_sales
"""

# Execute the query and render the resulting rows.
# Note: this rebinds `result`, which an earlier cell also assigns.
result = spark.sql(query)
result.display()

dt,sales,%var
2023-10-03,10,
2023-10-04,20,100.0
2023-10-05,60,200.0
