In [1]:
from pyspark.sql import functions
from pyspark.sql.functions import when, count, isnan, col
from datetime import datetime as dt

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse, via getOrCreate) the Spark session for this notebook.
# The parenthesised chain keeps one setting per line without backslashes.
spark = (
    SparkSession.builder
    .master("spark://pop-os.localdomain:7077")  # cluster master URL
    .appName("MoM.com")
    # JDBC driver jar, needed later for the MySQL read.
    .config("spark.jars", "mysql-connector-j-8.0.33.jar")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "14g")
    .getOrCreate()
)

23/10/25 23:17:59 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.100.5 instead (on interface wlo1)
23/10/25 23:17:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/10/25 23:18:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [5]:
import configparser

# Path of the INI-style file holding the database connection settings
# (the "db" section is read by read_from_mysql).
config_file_path = "/opt/spark/conf/spark-config.conf"

# Read secrets from config.
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing or unreadable file
# fails here, with a clear message, rather than later as a NoSectionError.
loaded_config_files = config.read(config_file_path)
if not loaded_config_files:
    raise FileNotFoundError(
        "Could not read Spark config file: {}".format(config_file_path)
    )

# Keep the list of parsed files as the cell's displayed output.
loaded_config_files

['/opt/spark/conf/spark-config.conf']

In [6]:
def read_from_mysql(spark, table_name):
    """Load a database table through Spark's JDBC reader.

    Connection settings (driver, url, username, password) are taken from the
    "db" section of the module-level ``config`` parser.

    Args:
        spark: Object exposing a ``read`` attribute with the DataFrameReader
            API (``format`` / ``option`` / ``load``), e.g. a SparkSession.
        table_name: Value passed as the JDBC ``dbtable`` option.

    Returns:
        Whatever ``load()`` returns on the configured reader.
    """
    # Collect the options first; dict order is the order they are applied in.
    jdbc_options = {
        "driver": config.get("db", "driver"),
        "url": config.get("db", "url"),
        "dbtable": table_name,
        "user": config.get("db", "username"),
        "password": config.get("db", "password"),
    }

    reader = spark.read.format("jdbc")
    for option_name, option_value in jdbc_options.items():
        reader = reader.option(option_name, option_value)
    return reader.load()

In [7]:
# Create a DataFrame backed by the "Sales" table, read over JDBC.
df = read_from_mysql(spark, "Sales")

In [8]:
# Print the rows as a text table to eyeball the columns and values.
df.show()

                                                                                

+-------+----------+-------+
|sale_id| sale_date|revenue|
+-------+----------+-------+
|      1|2022-01-31| 651.15|
|      2|2022-02-28| 761.09|
|      3|2022-03-31| 654.18|
|      4|2022-04-30| 987.64|
|      5|2022-05-31| 975.64|
|      6|2022-06-30| 915.30|
|      7|2022-07-31| 649.55|
|      8|2022-08-31| 501.87|
|      9|2022-09-30| 560.70|
|     10|2022-10-31| 720.50|
|     11|2022-11-30| 807.34|
|     12|2022-12-31| 980.20|
+-------+----------+-------+



In [9]:
# Register the DataFrame as a temp view so SQL_QUERY can select FROM mom_table.
df.createOrReplaceTempView("mom_table")

In [10]:
# Month-over-month (MoM) revenue report for 2022, run against the `mom_table`
# temp view.
#
# Output columns (one row per month, ordered chronologically):
#   month                  'YYYY-MM' label
#   revenue                total revenue for the month
#   delta                  revenue minus the previous month's revenue
#   mom_growth_rate        delta / previous month's revenue (fraction)
#   mom_growth_percentage  the same ratio times 100
#
# Notes:
# * The previous month's revenue is computed once (prev_revenue in the
#   WithPrevious CTE) instead of repeating the LAG(...) OVER (...) expression
#   in every output column.
# * The divisor is wrapped in NULLIF(..., 0) so a zero-revenue previous month
#   yields NULL rather than a division-by-zero error when ANSI SQL mode is on.
# * The year filter is applied before the window, so the first month has no
#   previous row: delta and mom_growth_rate are NULL there, while
#   mom_growth_percentage is reported as 0.0 by the CASE below.
# * The window has no PARTITION BY, so Spark logs a "No Partition Defined"
#   warning; with at most 12 aggregated rows this is harmless.
SQL_QUERY = """WITH MonthlyRevenue AS (
  SELECT
    YEAR(sale_date) AS year,
    MONTH(sale_date) AS month,
    SUM(revenue) AS total_revenue
  FROM mom_table
  WHERE YEAR(sale_date) = 2022
  GROUP BY year, month
),
WithPrevious AS (
  SELECT
    year,
    month,
    total_revenue,
    LAG(total_revenue) OVER (ORDER BY year, month) AS prev_revenue
  FROM MonthlyRevenue
)
SELECT
  CONCAT(wp.year, '-', LPAD(wp.month, 2, '0')) AS month,
  wp.total_revenue AS revenue,
  wp.total_revenue - wp.prev_revenue AS delta,
  (wp.total_revenue - wp.prev_revenue) / NULLIF(wp.prev_revenue, 0) AS mom_growth_rate,
  CASE
    WHEN wp.prev_revenue IS NULL THEN 0.0
    ELSE ((wp.total_revenue - wp.prev_revenue) / NULLIF(wp.prev_revenue, 0)) * 100.0
  END AS mom_growth_percentage
FROM WithPrevious wp
ORDER BY wp.year, wp.month;"""

In [12]:
# Run the month-over-month query against the temp view and print the result.
spark.sql(SQL_QUERY).show()

23/10/25 23:18:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/25 23:18:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/25 23:18:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/25 23:18:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/25 23:18:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/25 23:18:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/25 2

+-------+-------+-------+--------------------+---------------------+
|  month|revenue|  delta|     mom_growth_rate|mom_growth_percentage|
+-------+-------+-------+--------------------+---------------------+
|2022-01| 651.15|   NULL|                NULL|      0.0000000000000|
|2022-02| 761.09| 109.94| 0.16883974506642095|     16.8839745066421|
|2022-03| 654.18|-106.91|-0.14046958966745063|    -14.0469589667451|
|2022-04| 987.64| 333.46| 0.50973738114891926|     50.9737381148919|
|2022-05| 975.64| -12.00|-0.01215017617755457|     -1.2150176177555|
|2022-06| 915.30| -60.34|-0.06184658275593457|     -6.1846582755935|
|2022-07| 649.55|-265.75|-0.29034196438326232|    -29.0341964383262|
|2022-08| 501.87|-147.68|-0.22735740127780771|    -22.7357401277808|
|2022-09| 560.70|  58.83| 0.11722159124872975|     11.7221591248730|
|2022-10| 720.50| 159.80| 0.28500089174246478|     28.5000891742465|
|2022-11| 807.34|  86.84| 0.12052741151977793|     12.0527411519778|
|2022-12| 980.20| 172.86| 0.214110

In [13]:
# Stop the Spark session now that the analysis is finished.
spark.stop()