In [1]:

import os


import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import sys

# Set both PySpark interpreter environment variables to the Python
# executable that is running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [2]:
# Location of the MySQL connector jar placed on the driver class path.
# It can be overridden through the MYSQL_CONNECTOR_JAR environment variable;
# when that variable is unset, the original hard-coded Windows path is used,
# so existing setups keep working unchanged.
mysql_connector_jar = os.environ.get(
    "MYSQL_CONNECTOR_JAR",
    "C:\\my_sql_jar\\mysql-connector-java-8.0.26.jar",
)

# Create (or reuse) the SparkSession for this notebook, using the
# `local[*]` master and the application name "testing".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("testing")
    .config("spark.driver.extraClassPath", mysql_connector_jar)
    .getOrCreate()
)
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001919949D700>


In [3]:
# Sample employee records, one tuple per employee, in the column order
# given by emp_schema below.
emp_data = [
    (1, 'manish', 50000, 'IT', 'm'),
    (2, 'vikash', 60000, 'sales', 'm'),
    (3, 'raushan', 70000, 'marketing', 'm'),
    (4, 'mukesh', 80000, 'IT', 'm'),
    (5, 'priti', 90000, 'sales', 'f'),
    (6, 'nikita', 45000, 'marketing', 'f'),
    (7, 'ragini', 55000, 'marketing', 'f'),
    (8, 'rashi', 100000, 'IT', 'f'),
    (9, 'aditya', 65000, 'IT', 'm'),
    (10, 'rahul', 50000, 'marketing', 'm'),
    (11, 'rakhi', 50000, 'IT', 'f'),
    (12, 'akhilesh', 90000, 'sales', 'm'),
]

# Only column names are supplied here; the column types are left for
# Spark to infer from the data.
emp_schema = ['id', 'name', 'sal', 'dept', 'gender']

emp_df = spark.createDataFrame(emp_data, emp_schema)

In [4]:
# Print the employee DataFrame as a text table.
emp_df.show()

+---+--------+------+---------+------+
| id|    name|   sal|     dept|gender|
+---+--------+------+---------+------+
|  1|  manish| 50000|       IT|     m|
|  2|  vikash| 60000|    sales|     m|
|  3| raushan| 70000|marketing|     m|
|  4|  mukesh| 80000|       IT|     m|
|  5|   priti| 90000|    sales|     f|
|  6|  nikita| 45000|marketing|     f|
|  7|  ragini| 55000|marketing|     f|
|  8|   rashi|100000|       IT|     f|
|  9|  aditya| 65000|       IT|     m|
| 10|   rahul| 50000|marketing|     m|
| 11|   rakhi| 50000|       IT|     f|
| 12|akhilesh| 90000|    sales|     m|
+---+--------+------+---------+------+



In [7]:
# Total salary per department.
(
    emp_df
    .groupBy("dept")
    .sum("sal")
    .show()
)

+---------+--------+
|     dept|sum(sal)|
+---------+--------+
|marketing|  220000|
|    sales|  240000|
|       IT|  345000|
+---------+--------+



In [8]:
# window function - row_number()
#
# Number the employees of each department from the lowest to the highest
# salary. Salary alone is not a total order in this data (two IT employees
# earn 50000 and two sales employees earn 90000), so ordering only by "sal"
# leaves the relative position of tied rows unspecified and row_number()
# may assign them differently between runs. Adding "id" as a secondary sort
# key makes the numbering reproducible.
window = Window.partitionBy("dept").orderBy("sal", "id")

emp_df.withColumn("rn", row_number().over(window)).show()

+---+--------+------+---------+------+---+
| id|    name|   sal|     dept|gender| rn|
+---+--------+------+---------+------+---+
|  1|  manish| 50000|       IT|     m|  1|
| 11|   rakhi| 50000|       IT|     f|  2|
|  9|  aditya| 65000|       IT|     m|  3|
|  4|  mukesh| 80000|       IT|     m|  4|
|  8|   rashi|100000|       IT|     f|  5|
|  6|  nikita| 45000|marketing|     f|  1|
| 10|   rahul| 50000|marketing|     m|  2|
|  7|  ragini| 55000|marketing|     f|  3|
|  3| raushan| 70000|marketing|     m|  4|
|  2|  vikash| 60000|    sales|     m|  1|
|  5|   priti| 90000|    sales|     f|  2|
| 12|akhilesh| 90000|    sales|     m|  3|
+---+--------+------+---------+------+---+



In [16]:
# Display the schema of emp_df (field names, types and nullability).
emp_df.schema

StructType([StructField('id', LongType(), True), StructField('name', StringType(), True), StructField('sal', LongType(), True), StructField('dept', StringType(), True), StructField('gender', StringType(), True)])

In [19]:
# Rank, Dense_rank, Sum
#
# A single window (per department, highest salary first) is shared by the
# three derived columns.
window = Window.partitionBy("dept").orderBy(col("sal").desc())

# NOTE: `sum` here is the Spark column function brought in by the wildcard
# import of pyspark.sql.functions, not Python's builtin sum.
# Over this ordered window it yields a cumulative total in which rows tied
# on salary receive the same value (visible in the rendered output).
(
    emp_df
    .select(
        "*",
        rank().over(window).alias("rnk"),
        dense_rank().over(window).alias("dense_rnk"),
        sum("sal").over(window).alias("sum"),
    )
    .show()
)

+---+--------+------+---------+------+---+---------+------+
| id|    name|   sal|     dept|gender|rnk|dense_rnk|   sum|
+---+--------+------+---------+------+---+---------+------+
|  8|   rashi|100000|       IT|     f|  1|        1|100000|
|  4|  mukesh| 80000|       IT|     m|  2|        2|180000|
|  9|  aditya| 65000|       IT|     m|  3|        3|245000|
|  1|  manish| 50000|       IT|     m|  4|        4|345000|
| 11|   rakhi| 50000|       IT|     f|  4|        4|345000|
|  3| raushan| 70000|marketing|     m|  1|        1| 70000|
|  7|  ragini| 55000|marketing|     f|  2|        2|125000|
| 10|   rahul| 50000|marketing|     m|  3|        3|175000|
|  6|  nikita| 45000|marketing|     f|  4|        4|220000|
|  5|   priti| 90000|    sales|     f|  1|        1|180000|
| 12|akhilesh| 90000|    sales|     m|  1|        1|180000|
|  2|  vikash| 60000|    sales|     m|  3|        2|240000|
+---+--------+------+---------+------+---+---------+------+



In [25]:
# Top 2 salary in each dept
#
# dense_rank() gives tied salaries the same rank, so a department can
# contribute more than two rows when salaries are tied (as sales does in
# the rendered output).
window = Window.partitionBy("dept").orderBy(col("sal").desc())

(
    emp_df
    .withColumn("top", dense_rank().over(window))
    .where(col("top") <= 2)
    .show()
)

+---+--------+------+---------+------+---+
| id|    name|   sal|     dept|gender|top|
+---+--------+------+---------+------+---+
|  8|   rashi|100000|       IT|     f|  1|
|  4|  mukesh| 80000|       IT|     m|  2|
|  3| raushan| 70000|marketing|     m|  1|
|  7|  ragini| 55000|marketing|     f|  2|
|  5|   priti| 90000|    sales|     f|  1|
| 12|akhilesh| 90000|    sales|     m|  1|
|  2|  vikash| 60000|    sales|     m|  2|
+---+--------+------+---------+------+---+

