-----------------------------------------------------
Created on: 03-09-2022                              
Author: Rohit Sharma                                
                                                     
-----------------------------------------------------

# PySpark GroupBy and Aggregate Functions


In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the notebook's SparkSession: reuse an already-running session if there
# is one, otherwise start a new one registered under the application name "Agg".
spark = (
    SparkSession.builder
    .appName("Agg")
    .getOrCreate()
)

In [3]:
# Evaluate the session as the cell's last expression so the notebook renders it.
spark

In [10]:
# Load the employee dataset from CSV.
#   header=True      -> take column names from the first row
#   inferSchema=True -> let Spark detect column types instead of reading everything as string
df_spark = spark.read.csv(
    "test4.csv",
    header=True,
    inferSchema=True,
)

# Print the loaded rows as a text table.
df_spark.show()

+-------+------------+------+
|   Name| Departments|Salary|
+-------+------------+------+
|    Ram|Data Science|  1000|
|  Rohan|         IOT| 30000|
|  Mohan|    Big data| 40000|
|    Raj|          AI| 20000|
|   Neha|    Big data| 20000|
|Prateek|         IOT| 50000|
|  Mukul|Data Science| 30000|
+-------+------------+------+



In [11]:
# Print the schema as a tree: each column's name, inferred type and nullability.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [12]:
# GroupBy
# Group rows by employee name and sum the numeric column (Salary) within each group.
# NOTE: this is a per-name total, not a maximum. Every name occurs only once in this
# dataset, so each sum is simply that employee's own salary.
df_spark.groupBy('Name').sum().show()

+-------+-----------+
|   Name|sum(Salary)|
+-------+-----------+
|Prateek|      50000|
|  Mohan|      40000|
|  Mukul|      30000|
|    Ram|       1000|
|  Rohan|      30000|
|    Raj|      20000|
|   Neha|      20000|
+-------+-----------+



In [13]:
# Total salary per department. sum() without arguments aggregates every numeric
# column, which for this schema is only Salary.
(
    df_spark
    .groupBy('Departments')
    .sum()
    .show()
)

+------------+-----------+
| Departments|sum(Salary)|
+------------+-----------+
|         IOT|      80000|
|          AI|      20000|
|Data Science|      31000|
|    Big data|      60000|
+------------+-----------+



In [14]:
# Average salary per department (the result column is labelled avg(Salary)).
(
    df_spark
    .groupBy('Departments')
    .mean()
    .show()
)

+------------+-----------+
| Departments|avg(Salary)|
+------------+-----------+
|         IOT|    40000.0|
|          AI|    20000.0|
|Data Science|    15500.0|
|    Big data|    30000.0|
+------------+-----------+



In [15]:
# Number of employees in each department.
(
    df_spark
    .groupBy('Departments')
    .count()
    .show()
)

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|          AI|    1|
|Data Science|    2|
|    Big data|    2|
+------------+-----+



In [16]:
# Total salary expenditure across all employees: agg() on the ungrouped
# DataFrame takes a {column: aggregate-function} mapping and returns one row.
(
    df_spark
    .agg({'Salary': 'sum'})
    .show()
)

+-----------+
|sum(Salary)|
+-----------+
|     191000|
+-----------+

