<a href="https://colab.research.google.com/github/rahulrajpr/prepare-anytime/blob/main/spark/functions/14_spark_sql_window_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Spark Window Functions**
Reference: [Spark SQL built-in window functions](https://spark.apache.org/docs/latest/sql-ref-functions-builtin.html#window-functions)

In [2]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session (existing one is reused, otherwise a new
# one is created) under the application name 'spark-functions'.
spark = (
    SparkSession.builder
    .appName('spark-functions')
    .getOrCreate()
)

In [31]:
from pyspark.sql.functions import to_date, desc, rank
from pyspark.sql.types import StructType,StructField, IntegerType, StringType

In [13]:

# Sample sales records: (month start as 'yyyy-MM-dd' text, salesperson, department, sales).
# Frank's 2023-02-01 row deliberately has no sales value (None) so that null
# handling shows up in the ranking examples.
data = [
    ("2023-01-01", "Alice", "Electronics", 1000),
    ("2023-01-01", "Bob", "Electronics", 1200),
    ("2023-01-01", "Charlie", "Clothing", 800),
    ("2023-01-01", "Diana", "Electronics", 1500),
    ("2023-01-01", "Eve", "Clothing", 600),
    ("2023-02-01", "Alice", "Electronics", 1100),
    ("2023-02-01", "Bob", "Electronics", 900),
    ("2023-02-01", "Charlie", "Clothing", 950),
    ("2023-02-01", "Diana", "Electronics", 1300),
    ("2023-02-01", "Eve", "Clothing", 700),
    ("2023-02-01", "Frank", "Electronics", None),
    ("2023-03-01", "Alice", "Electronics", 1400),
    ("2023-03-01", "Bob", "Electronics", 1000),
    ("2023-03-01", "Charlie", "Clothing", 1200),
    ("2023-03-01", "Diana", "Electronics", 1600)
]

# Explicit schema; every field is nullable. 'mn' is loaded as a string here and
# converted to a date right after the DataFrame is created.
schema = StructType([
    StructField("mn", StringType(), True),
    StructField("salesperson", StringType(), True),
    StructField("department", StringType(), True),
    StructField("sales", IntegerType(), True)
])

dataframe = spark.createDataFrame(data, schema)
# Parse 'mn' into a date column. The column is referenced by name because `col`
# is not among the functions imported in this notebook's import cell; calling
# col('mn') here raised NameError on a fresh kernel.
dataframe = dataframe.withColumn('mn', to_date('mn', 'yyyy-MM-dd'))
dataframe.printSchema()
dataframe.show(truncate = False)

root
 |-- mn: date (nullable = true)
 |-- salesperson: string (nullable = true)
 |-- department: string (nullable = true)
 |-- sales: integer (nullable = true)

+----------+-----------+-----------+-----+
|mn        |salesperson|department |sales|
+----------+-----------+-----------+-----+
|2023-01-01|Alice      |Electronics|1000 |
|2023-01-01|Bob        |Electronics|1200 |
|2023-01-01|Charlie    |Clothing   |800  |
|2023-01-01|Diana      |Electronics|1500 |
|2023-01-01|Eve        |Clothing   |600  |
|2023-02-01|Alice      |Electronics|1100 |
|2023-02-01|Bob        |Electronics|900  |
|2023-02-01|Charlie    |Clothing   |950  |
|2023-02-01|Diana      |Electronics|1300 |
|2023-02-01|Eve        |Clothing   |700  |
|2023-02-01|Frank      |Electronics|NULL |
|2023-03-01|Alice      |Electronics|1400 |
|2023-03-01|Bob        |Electronics|1000 |
|2023-03-01|Charlie    |Clothing   |1200 |
|2023-03-01|Diana      |Electronics|1600 |
+----------+-----------+-----------+-----+



In [14]:
# Expose the DataFrame to Spark SQL under the name 'dataframe_view'
# so it can be queried with spark.sql(...).
dataframe.createOrReplaceTempView(name='dataframe_view')

In [23]:
# rank() -- Spark SQL version
#
# Rank rows within each department by sales (highest first, NULL sales last)
# and keep only the rows ranked 1.
sql = """
with cte as
(
  select *, rank() over(partition by department order by sales desc NULLS LAST) as rn
  from dataframe_view
)
select *
from cte
where rn = 1
"""
top_ranked_sql = spark.sql(sql)
top_ranked_sql.show(truncate=False)

+----------+-----------+-----------+-----+---+
|mn        |salesperson|department |sales|rn |
+----------+-----------+-----------+-----+---+
|2023-03-01|Charlie    |Clothing   |1200 |1  |
|2023-03-01|Diana      |Electronics|1600 |1  |
+----------+-----------+-----------+-----+---+



In [33]:
from pyspark.sql.window import Window

In [37]:
# rank() -- DataFrame API version
#
# Window: one partition per department, rows ordered by sales descending.
win = (
    Window
    .partitionBy('department')
    .orderBy(desc('sales'))
)

# Attach the rank as 'rn', then keep only the rows ranked 1 in each department.
ranked = dataframe.withColumn('rn', rank().over(win))
ranked.filter('rn == 1').show()

+----------+-----------+-----------+-----+---+
|        mn|salesperson| department|sales| rn|
+----------+-----------+-----------+-----+---+
|2023-03-01|    Charlie|   Clothing| 1200|  1|
|2023-03-01|      Diana|Electronics| 1600|  1|
+----------+-----------+-----------+-----+---+

