In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import *  # Import the function
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()

In [4]:
# Column names for the demo DataFrame, in the same order as each row tuple.
columns = ["region", "month", "sales"]

# Sample sales figures grouped by region; each entry is (month, sales).
# NOTE(review): "Est" looks like a typo for "East", and every "West" entry is
# tagged "Jan" -- confirm whether this sample data is intentional.
monthly_sales = {
    "Est": [("Jan", 200), ("Feb", 300), ("Mar", 250)],
    "West": [("Jan", 400), ("Jan", 350), ("Jan", 450)],
}

# Flatten to one (region, month, sales) tuple per row, preserving the
# region-then-entry order of the mapping above.
data = [
    (region, month, amount)
    for region, entries in monthly_sales.items()
    for month, amount in entries
]

sales_df = spark.createDataFrame(data, schema=columns)

sales_df.show()

+------+-----+-----+
|region|month|sales|
+------+-----+-----+
|   Est|  Jan|  200|
|   Est|  Feb|  300|
|   Est|  Mar|  250|
|  West|  Jan|  400|
|  West|  Jan|  350|
|  West|  Jan|  450|
+------+-----+-----+



In [5]:
# Window specification: one partition per region, rows ordered by ascending sales.
# No frame is set explicitly, so aggregates evaluated over this window use
# Spark's default frame for an ordered window (RANGE BETWEEN UNBOUNDED PRECEDING
# AND CURRENT ROW): rows that tie on "sales" receive the same running total,
# just as they receive the same rank().
window_space = Window.partitionBy("region").orderBy("sales")

# Add the running total of sales and the sales rank within each region.
# F.sum / F.rank are referenced through the module alias so this cell does not
# depend on the star import having replaced Python's builtin sum().
result_df = (
    sales_df
    .withColumn("cumulative_sales", F.sum("sales").over(window_space))
    .withColumn("rank", F.rank().over(window_space))
)
result_df.show()

+------+-----+-----+----------------+----+
|region|month|sales|cumulative_sales|rank|
+------+-----+-----+----------------+----+
|   Est|  Jan|  200|             200|   1|
|   Est|  Mar|  250|             450|   2|
|   Est|  Feb|  300|             750|   3|
|  West|  Jan|  350|             350|   1|
|  West|  Jan|  400|             750|   2|
|  West|  Jan|  450|            1200|   3|
+------+-----+-----+----------------+----+

