In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import *
spark = SparkSession.builder.appName('Windows Function').getOrCreate()

In [2]:
spark

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# sample data for dataframe
sampleData = (("Ram", 28, "Sales", 3000),
              ("Meena", 33, "Sales", 4600),
              ("Robin", 40, "Sales", 4100),
              ("Kunal", 25, "Finance", 3000),
              ("Sneha", 36, "Finance", 10000),
              ("Ram", 28, "Sales", 3000),
              ("Srishti", 46, "Management", 3300),
              ("Jeny", 26, "Finance", 3900),
              ("Hitesh", 30, "Marketing", 3000),
              ("Kailash", 29, "Marketing", 2000),
              ("Rakesh", 38, "Marketing", 7000),
              ("Sharad", 39, "Sales", 4100)
              )
 
schema = StructType([
            StructField("Name", StringType(), False),
            StructField("Age",  IntegerType(), False),
            StructField("Department", StringType(), False),
            StructField("Salary", IntegerType(), False)
            ])

df = spark.createDataFrame(sampleData,schema)
df = df.orderBy(col('Department'), desc(col('Salary')))
df.show()

+-------+---+----------+------+
|   Name|Age|Department|Salary|
+-------+---+----------+------+
|  Sneha| 36|   Finance| 10000|
|   Jeny| 26|   Finance|  3900|
|  Kunal| 25|   Finance|  3000|
|Srishti| 46|Management|  3300|
| Rakesh| 38| Marketing|  7000|
| Hitesh| 30| Marketing|  3000|
|Kailash| 29| Marketing|  2000|
|  Meena| 33|     Sales|  4600|
| Sharad| 39|     Sales|  4100|
|  Robin| 40|     Sales|  4100|
|    Ram| 28|     Sales|  3000|
|    Ram| 28|     Sales|  3000|
+-------+---+----------+------+



In [4]:
# 1) Find the highest salary of each department
max_sal_dept = df.groupBy(col('Department')).agg(max(col('Salary')))
max_sal_dept.show()

+----------+-----------+
|Department|max(Salary)|
+----------+-----------+
|     Sales|       4600|
|   Finance|      10000|
|Management|       3300|
| Marketing|       7000|
+----------+-----------+



In [5]:
#2) How to get entire row
windowspec = Window.partitionBy('Department').orderBy(df['Salary'].desc())
df_with_row_number = df.withColumn('Row_number', row_number().over(windowspec))

max_salary_dept = df_with_row_number.filter(df_with_row_number['Row_number'] == 1)
max_salary_dept.show()

+-------+---+----------+------+----------+
|   Name|Age|Department|Salary|Row_number|
+-------+---+----------+------+----------+
|  Sneha| 36|   Finance| 10000|         1|
|Srishti| 46|Management|  3300|         1|
| Rakesh| 38| Marketing|  7000|         1|
|  Meena| 33|     Sales|  4600|         1|
+-------+---+----------+------+----------+



In [6]:
#3) How to get first two highest salary

max2_salary_dept = df_with_row_number.filter(df_with_row_number['Row_number'] <= 2).drop('Row_number')
max2_salary_dept.show()

+-------+---+----------+------+
|   Name|Age|Department|Salary|
+-------+---+----------+------+
|  Sneha| 36|   Finance| 10000|
|   Jeny| 26|   Finance|  3900|
|Srishti| 46|Management|  3300|
| Rakesh| 38| Marketing|  7000|
| Hitesh| 30| Marketing|  3000|
|  Meena| 33|     Sales|  4600|
|  Robin| 40|     Sales|  4100|
+-------+---+----------+------+



In [7]:
#4) How to get second highest salary

top2nd_salary_dept = df_with_row_number.filter(df_with_row_number['Row_number'] == 2)
top2nd_salary_dept.drop('Row_number').show()

+------+---+----------+------+
|  Name|Age|Department|Salary|
+------+---+----------+------+
|  Jeny| 26|   Finance|  3900|
|Hitesh| 30| Marketing|  3000|
| Robin| 40|     Sales|  4100|
+------+---+----------+------+



In [11]:
# find how far the salary is from avergae salary of each department

df.registerTempTable('temp_data')



In [28]:
df_average_sal = df.groupBy('Department').agg(round(avg(col('Salary')),0).alias('avg_salary'))
df_average_sal = df_average_sal.withColumnRenamed('Department', 'avg_department')
df_join_avg = df.join(df_average_sal, df['Department'] == df_average_sal['avg_department'], 'inner').drop('avg_department')
diff_sal_avg_sal = df_join_avg.withColumn('differnce_sal_avg_sal', (col('Salary') - col('avg_salary')))
diff_sal_avg_sal.show()

+-------+---+----------+------+----------+---------------------+
|   Name|Age|Department|Salary|avg_salary|differnce_sal_avg_sal|
+-------+---+----------+------+----------+---------------------+
|    Ram| 28|     Sales|  3000|    3760.0|               -760.0|
|  Meena| 33|     Sales|  4600|    3760.0|                840.0|
|  Robin| 40|     Sales|  4100|    3760.0|                340.0|
|    Ram| 28|     Sales|  3000|    3760.0|               -760.0|
|  Kunal| 25|   Finance|  3000|    5633.0|              -2633.0|
|  Sneha| 36|   Finance| 10000|    5633.0|               4367.0|
|Srishti| 46|Management|  3300|    3300.0|                  0.0|
|   Jeny| 26|   Finance|  3900|    5633.0|              -1733.0|
| Hitesh| 30| Marketing|  3000|    4000.0|              -1000.0|
| Sharad| 39|     Sales|  4100|    3760.0|                340.0|
|Kailash| 29| Marketing|  2000|    4000.0|              -2000.0|
| Rakesh| 38| Marketing|  7000|    4000.0|               3000.0|
+-------+---+----------+-

In [42]:
# find the rank and dense rank
# The rank function assigns a unique rank to each row based on the specified ordering. 
# If multiple rows have the same "Salary," they will get the same rank, and the next rank will be skipped.

#The dense_rank function is similar to rank but doesn't skip ranks in case of ties. If multiple rows have the same "Salary,"
#they will get the same rank, and the next rank will continue without skipping.

#Lag -This column contains the salary of the previous row within each department based on the ascending order of salaries.
#Lead - This column contains the salary of the next row within each department based on the descending order of salaries.

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# sample data for dataframe
sampleData = (("Ram", 28, "Sales", 3000),
              ("Meena", 33, "Sales", 3000),
              ("Robin", 40, "Sales", 4100),
              ("Kunal", 25, "Finance", 3000),
              ("Sneha", 36, "Finance", 10000),
              ("Rama", 28, "Sales", 1000),
              ("Srishti", 46, "Management", 3300),
              ("Jeny", 26, "Finance", 3900),
              ("Hitesh", 30, "Marketing", 3000),
              ("Kailash", 29, "Marketing", 2000),
              ("Rakesh", 38, "Marketing", 7000),
              ("Sharad", 39, "Sales", 4100)
              )
 
schema = StructType([
            StructField("Name", StringType(), False),
            StructField("Age",  IntegerType(), False),
            StructField("Department", StringType(), False),
            StructField("Salary", IntegerType(), False)
            ])

df = spark.createDataFrame(sampleData,schema)
df = df.orderBy(col('Department'), desc(col('Salary')))
df.show()

windowspec = Window.partitionBy('Department').orderBy(df['Salary'].desc())
df = df \
    .withColumn('Rank', rank().over(windowspec)) \
    .withColumn('DenseRank', dense_rank().over(windowspec)) \
    .withColumn('Lag', lag('Salary').over(windowspec)) \
    .withColumn('Lead', lead('Salary').over(windowspec)) \
    .withColumn('Cummulative_Sum', sum('Salary').over(windowspec)) \
    .withColumn('Cummulative_distibution', cume_dist().over(windowspec)) \
    .withColumn("PercentRank", percent_rank().over(windowSpec))
    
df.show()



+-------+---+----------+------+
|   Name|Age|Department|Salary|
+-------+---+----------+------+
|  Sneha| 36|   Finance| 10000|
|   Jeny| 26|   Finance|  3900|
|  Kunal| 25|   Finance|  3000|
|Srishti| 46|Management|  3300|
| Rakesh| 38| Marketing|  7000|
| Hitesh| 30| Marketing|  3000|
|Kailash| 29| Marketing|  2000|
| Sharad| 39|     Sales|  4100|
|  Robin| 40|     Sales|  4100|
|    Ram| 28|     Sales|  3000|
|  Meena| 33|     Sales|  3000|
|   Rama| 28|     Sales|  1000|
+-------+---+----------+------+

+-------+---+----------+------+----+---------+-----+----+---------------+-----------------------+-----------+
|   Name|Age|Department|Salary|Rank|DenseRank|  Lag|Lead|Cummulative_Sum|Cummulative_distibution|PercentRank|
+-------+---+----------+------+----+---------+-----+----+---------------+-----------------------+-----------+
|  Kunal| 25|   Finance|  3000|   3|        3| 3900|null|          16900|                    1.0|        0.0|
|   Jeny| 26|   Finance|  3900|   2|        2|1