In [62]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, sum
import pandas as pd
import numpy as np

In [67]:
spark = SparkSession.builder.appName("sparkWindowFunctionsExample").getOrCreate()
data = [(1,100,1,2022),(1,700,2,2022),(1,600,2,2022),(2,1000,3,2022),(2,1100,1,2022),(3,400,1,2021),(3,500,1,2021)]
df = spark.createDataFrame(data,schema=["id","salary","month","year"])
df.createOrReplaceTempView("df")
window_1=Window.partitionBy(["year","month"]).orderBy("salary")
df=df.withColumn("ytd_sum",sum("salary").over(window_1)).drop("id")
df.show()

+------+-----+----+-------+
|salary|month|year|ytd_sum|
+------+-----+----+-------+
|   400|    1|2021|    400|
|   500|    1|2021|    900|
|   100|    1|2022|    100|
|  1100|    1|2022|   1200|
|   600|    2|2022|    600|
|   700|    2|2022|   1300|
|  1000|    3|2022|   1000|
+------+-----+----+-------+



In [68]:
df2=spark.sql("""SELECT
    salary,
    month,
    year,
    SUM(salary) OVER (PARTITION BY year,month ORDER BY salary) AS ytd_sum
FROM df
ORDER BY year, month;""")
df2.show()

+------+-----+----+-------+
|salary|month|year|ytd_sum|
+------+-----+----+-------+
|   400|    1|2021|    400|
|   500|    1|2021|    900|
|   100|    1|2022|    100|
|  1100|    1|2022|   1200|
|   600|    2|2022|    600|
|   700|    2|2022|   1300|
|  1000|    3|2022|   1000|
+------+-----+----+-------+



In [71]:
df_dict={
    "id":[1,1,1,2,2,3,3],
    "salary":[100,700,600,1000,1100,400,500],
    "month":["1","2","2","3","1","1","1"],
    "year":[2022,2022,2022,2022,2022,2021,2021]
    }

df=pd.DataFrame(df_dict)
display(df)
df["YTD Sum"]=df[["year","month","salary"]].groupby(["year","month"])["salary"].cumsum()
df.sort_values(["year","month"]).drop("id",axis=1)

Unnamed: 0,id,salary,month,year
0,1,100,1,2022
1,1,700,2,2022
2,1,600,2,2022
3,2,1000,3,2022
4,2,1100,1,2022
5,3,400,1,2021
6,3,500,1,2021


Unnamed: 0,salary,month,year,YTD Sum
5,400,1,2021,400
6,500,1,2021,900
0,100,1,2022,100
4,1100,1,2022,1200
1,700,2,2022,700
2,600,2,2022,1300
3,1000,3,2022,1000


In [4]:
def find_tuples_above_average(list_1):
   average_salary = sum(salary for _, salary in list_1) / len(list_1)
   return [(name, salary) for name, salary in list_1 if salary > average_salary]

# Example usage
list_1 = [("sanjeev", 10000), ("sam", 20000), ("pat", 30000)]
high_earners = find_tuples_above_average(list_1)
print(high_earners)


[('pat', 30000)]


In [5]:
for i in list_1:
    print(i)

('sanjeev', 10000)
('sam', 20000)
('pat', 30000)
