In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
schema1= StructType([StructField("Order_ID", IntegerType()),
    StructField("Order_Date",DateType()),
    StructField("Order_value",IntegerType())
])

spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

path="dbfs:/FileStore/file-1.csv"
df=spark.read.option("sep","\t").option("header","false").option("mode","permissive").option("dateFormat","dd-MM-YYYY").csv(path,schema1)

In [0]:
df.show()

+--------+----------+-----------+
|Order_ID|Order_Date|Order_value|
+--------+----------+-----------+
|       1|2024-08-22|        125|
|       2|2024-08-22|        355|
|       3|2024-08-23|       4642|
|       4|2024-08-24|        355|
|       5|2024-08-23|        377|
|       6|2024-08-26|        244|
|       7|2024-08-25|        599|
|       8|2024-08-24|       1200|
|       9|2024-08-25|        450|
|      10|2024-08-22|        960|
+--------+----------+-----------+



In [0]:
df.createOrReplaceTempView("temp_tbl")

sql=""" select *, sum(Order_value) over(rows between unbounded preceding and current row) as running_total from temp_tbl
order by Order_date;
"""

sql2="""with cte as(select Order_Date,sum(order_value)as total from temp_tbl group by Order_Date)
    select Order_Date, sum(total) over(rows between unbounded preceding and current row) as running_total from cte
    ;
    """
spark.sql(sql2).show()

+----------+-------------+
|Order_Date|running_total|
+----------+-------------+
|2024-08-25|         1049|
|2024-08-23|         6068|
|2024-08-24|         7623|
|2024-08-26|         7867|
|2024-08-22|         9307|
+----------+-------------+



In [0]:
from pyspark.sql.window import Window

windowSpec=Window.partitionBy().rowsBetween(Window.unboundedPreceding,Window.currentRow)

df1=df.groupBy(col("Order_Date")).agg(sum(col("Order_value")).alias("Total")).orderBy(col("Order_Date"))
df1.withColumn("Running_Total",sum("Total").over(windowSpec)).show()

+----------+-----+-------------+
|Order_Date|Total|Running_Total|
+----------+-----+-------------+
|2024-08-22| 1440|         1440|
|2024-08-23| 5019|         6459|
|2024-08-24| 1555|         8014|
|2024-08-25| 1049|         9063|
|2024-08-26|  244|         9307|
+----------+-----+-------------+



In [0]:
from pyspark.sql.window import Window

windowSpec=Window.partitionBy().orderBy(col("Order_Date")).rowsBetween(Window.unboundedPreceding,0)
windowTotal=Window.partitionBy("Order_Date")

df1=df.withColumn("Total",sum("Order_value").over(windowTotal))
df2=df1.select("Order_date","Total").distinct()



df3=df2.withColumn("Running_Total",sum("Total").over(windowSpec))
df3.show()

+----------+-----+-------------+
|Order_date|Total|Running_Total|
+----------+-----+-------------+
|2024-08-22| 1440|         1440|
|2024-08-23| 5019|         6459|
|2024-08-24| 1555|         8014|
|2024-08-25| 1049|         9063|
|2024-08-26|  244|         9307|
+----------+-----+-------------+

