In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType

In [2]:
# Create (or reuse) the SparkSession for this notebook. master("local[*]")
# runs Spark in local mode on this machine rather than on a cluster master node.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("demo")
    .getOrCreate()
)
spark

23/03/23 16:55:30 WARN Utils: Your hostname, ZSCHN01LP0253L resolves to a loopback address: 127.0.1.1; using 192.168.229.237 instead (on interface wlp0s20f3)
23/03/23 16:55:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/23 16:55:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/03/23 16:55:31 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/03/23 16:55:31 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Load the employees CSV: the first row is the header, and column types are
# inferred from the data.
# NOTE(review): the printed schema shows HIRE_DATE, COMMISSION_PCT and MANAGER_ID
# inferred as string -- presumably the file holds non-numeric placeholder values
# in those columns; confirm, and consider an explicit schema or `nullValue`.
empDf = spark.read.csv(
    "../Spark-main/employees.csv",
    header=True,
    inferSchema=True,
)
empDf.printSchema()

root
 |-- EMPLOYEE_ID: integer (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- PHONE_NUMBER: string (nullable = true)
 |-- HIRE_DATE: string (nullable = true)
 |-- JOB_ID: string (nullable = true)
 |-- SALARY: integer (nullable = true)
 |-- COMMISSION_PCT: string (nullable = true)
 |-- MANAGER_ID: string (nullable = true)
 |-- DEPARTMENT_ID: integer (nullable = true)



In [4]:
# Load the departments CSV (header row present, types inferred) and preview
# the first five rows.
deptDf = spark.read.csv(
    "../Spark-main/departments.csv",
    header=True,
    inferSchema=True,
)
deptDf.show(5)

+-------------+---------------+----------+-----------+
|DEPARTMENT_ID|DEPARTMENT_NAME|MANAGER_ID|LOCATION_ID|
+-------------+---------------+----------+-----------+
|           10| Administration|       200|       1700|
|           20|      Marketing|       201|       1800|
|           30|     Purchasing|       114|       1700|
|           40|Human Resources|       203|       2400|
|           50|       Shipping|       121|       1500|
+-------------+---------------+----------+-----------+
only showing top 5 rows



### Window Functions

In [7]:
from pyspark.sql.window import Window
# NOTE(review): this wildcard import brings names such as `sum` into the notebook
# namespace, shadowing the Python builtins of the same name. The cells below rely
# on the bare names `col`, `rank` and `sum`, so it is left as-is here.
from pyspark.sql.functions import *

# Rank each employee's salary within their department, lowest salary first.
# rank() gives tied salaries the same rank and skips the following rank(s).
windowSpec = Window.partitionBy("DEPARTMENT_ID").orderBy("SALARY")
(
    empDf
    .withColumn("salary_rank", rank().over(windowSpec))
    .select("DEPARTMENT_ID", "SALARY", "salary_rank")
    .show(100)
)

+-------------+------+-----------+
|DEPARTMENT_ID|SALARY|salary_rank|
+-------------+------+-----------+
|           10|  4400|          1|
|           20|  6000|          1|
|           20| 13000|          2|
|           30|  2500|          1|
|           30|  2600|          2|
|           30|  2800|          3|
|           30|  2900|          4|
|           30|  3100|          5|
|           30| 11000|          6|
|           40|  6500|          1|
|           50|  2100|          1|
|           50|  2200|          2|
|           50|  2200|          2|
|           50|  2400|          4|
|           50|  2400|          4|
|           50|  2500|          6|
|           50|  2500|          6|
|           50|  2600|          8|
|           50|  2600|          8|
|           50|  2700|         10|
|           50|  2700|         10|
|           50|  2800|         12|
|           50|  2900|         13|
|           50|  3200|         14|
|           50|  3200|         14|
|           50|  330

In [8]:
# Same per-department ranking, but ordered by salary descending so the
# highest-paid employee in each department gets rank 1.
windowSpec = (
    Window
    .partitionBy("DEPARTMENT_ID")
    .orderBy(col("SALARY").desc())
)
(
    empDf
    .withColumn("salary_rank", rank().over(windowSpec))
    .select("DEPARTMENT_ID", "SALARY", "salary_rank")
    .show(100)
)

+-------------+------+-----------+
|DEPARTMENT_ID|SALARY|salary_rank|
+-------------+------+-----------+
|           10|  4400|          1|
|           20| 13000|          1|
|           20|  6000|          2|
|           30| 11000|          1|
|           30|  3100|          2|
|           30|  2900|          3|
|           30|  2800|          4|
|           30|  2600|          5|
|           30|  2500|          6|
|           40|  6500|          1|
|           50|  8200|          1|
|           50|  8000|          2|
|           50|  7900|          3|
|           50|  6500|          4|
|           50|  5800|          5|
|           50|  3600|          6|
|           50|  3300|          7|
|           50|  3300|          7|
|           50|  3200|          9|
|           50|  3200|          9|
|           50|  2900|         11|
|           50|  2800|         12|
|           50|  2700|         13|
|           50|  2700|         13|
|           50|  2600|         15|
|           50|  260

In [9]:
# Running salary total per department, highest salary first.
# Because the window is ordered and no explicit frame is given, the sum grows
# row by row; rows with equal SALARY share the same running total (see the
# repeated values in the output below).
windowSpec = (
    Window
    .partitionBy("DEPARTMENT_ID")
    .orderBy(col("SALARY").desc())
)
(
    empDf
    .withColumn("SUM", sum("SALARY").over(windowSpec))
    .select("DEPARTMENT_ID", "SALARY", "SUM")
    .show(100)
)

+-------------+------+-----+
|DEPARTMENT_ID|SALARY|  SUM|
+-------------+------+-----+
|           10|  4400| 4400|
|           20| 13000|13000|
|           20|  6000|19000|
|           30| 11000|11000|
|           30|  3100|14100|
|           30|  2900|17000|
|           30|  2800|19800|
|           30|  2600|22400|
|           30|  2500|24900|
|           40|  6500| 6500|
|           50|  8200| 8200|
|           50|  8000|16200|
|           50|  7900|24100|
|           50|  6500|30600|
|           50|  5800|36400|
|           50|  3600|40000|
|           50|  3300|46600|
|           50|  3300|46600|
|           50|  3200|53000|
|           50|  3200|53000|
|           50|  2900|55900|
|           50|  2800|58700|
|           50|  2700|64100|
|           50|  2700|64100|
|           50|  2600|69300|
|           50|  2600|69300|
|           50|  2500|74300|
|           50|  2500|74300|
|           50|  2400|79100|
|           50|  2400|79100|
|           50|  2200|83500|
|           50

In [10]:
# Department-wide salary total attached to every row: with no ordering on the
# window, sum() is evaluated over the whole partition, so each employee row
# carries the same total for its department.
windowSpec = Window.partitionBy("DEPARTMENT_ID")
(
    empDf
    .withColumn("SUM", sum("SALARY").over(windowSpec))
    .select("DEPARTMENT_ID", "SALARY", "SUM")
    .show(100)
)

+-------------+------+-----+
|DEPARTMENT_ID|SALARY|  SUM|
+-------------+------+-----+
|           10|  4400| 4400|
|           20| 13000|19000|
|           20|  6000|19000|
|           30| 11000|24900|
|           30|  3100|24900|
|           30|  2900|24900|
|           30|  2800|24900|
|           30|  2600|24900|
|           30|  2500|24900|
|           40|  6500| 6500|
|           50|  2600|85600|
|           50|  2600|85600|
|           50|  8000|85600|
|           50|  8200|85600|
|           50|  7900|85600|
|           50|  6500|85600|
|           50|  5800|85600|
|           50|  3200|85600|
|           50|  2700|85600|
|           50|  2400|85600|
|           50|  2200|85600|
|           50|  3300|85600|
|           50|  2800|85600|
|           50|  2500|85600|
|           50|  2100|85600|
|           50|  3300|85600|
|           50|  2900|85600|
|           50|  2400|85600|
|           50|  2200|85600|
|           50|  3600|85600|
|           50|  3200|85600|
|           50