In [1]:
# Bootstrap step that runs before any pyspark import in this cell.
# NOTE(review): presumably findspark.init() is what makes the pyspark imports
# below resolvable, so it has to stay ahead of them — confirm.
import findspark
findspark.init()
# The value returned by find() is neither captured nor displayed by this cell.
findspark.find()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Obtain the notebook-wide Spark session (created on first use, reused afterwards),
# named "Aggregate operations" and bound to the "local[3]" master.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("Aggregate operations")
    .getOrCreate()
)

In [3]:
#-------------------------------------------------------------------------

# Column names for the sample payroll table; no types are given here, only names.
schema = ["employee_name", "department", "salary"]

# One tuple per employee record, in the same order as `schema`.
# NOTE(review): the output recorded under this cell lists Saif's salary as null,
# while the literal below is 4100 — re-run the notebook or confirm the intended value.
simpleData = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

# Build the DataFrame every later cell aggregates over, then show its
# schema and its full (untruncated) contents.
df = spark.createDataFrame(simpleData, schema)
df.printSchema()
df.show(truncate=False)

#-------------------------------------------------------------------------

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |null  |
+-------------+----------+------+



In [None]:
#-------------------------------------------------------------------------

# approx_count_distinct(): approximate number of distinct values in a column.
# Applied here to the whole "salary" column (no grouping).
(
    df
    .select(approx_count_distinct("salary"))
    .show()
)

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# avg(): average of the values in a column.
(
    df
    .select(avg("salary"))
    .show()
)

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# collect_list(): gather every value of the input column into a single array,
# keeping duplicates.
df1 = df.select(
    collect_list("salary"),
)
df1.printSchema()           # schema of the one-column result
df1.show(truncate=False)    # print the full array without truncation

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# collect_set(): gather the values of the input column into a single array,
# with duplicates removed.
df1 = df.select(
    collect_set("salary"),
)
df1.printSchema()           # schema of the one-column result
df1.show(truncate=False)    # print the full array without truncation

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# countDistinct(): number of distinct values; with two columns, as here,
# it counts distinct (department, salary) combinations.
df1 = df.select(
    countDistinct("department", "salary"),
)
df1.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# count(): number of elements in a column.
df1 = df.select(
    count("salary"),
)
df1.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# first(col, ignorenulls=False): first value in a group.
#   * By default, the first value encountered is returned.
#   * With ignorenulls set to true, the first non-null value encountered is returned.
#   * If every value is null, null is returned.
# The default is spelled out below so the null-handling choice is visible.
df1 = df.select(
    first("salary", ignorenulls=False),
)
df1.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# last(col, ignorenulls=False): last value in a group.
#   * By default, the last value encountered is returned.
#   * With ignorenulls set to true, the last non-null value encountered is returned.
#   * If every value is null, null is returned.
# The default is spelled out below so the null-handling choice is visible.
df1 = df.select(
    last(df["salary"], ignorenulls=False),
)
df1.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# max(): largest value in a column.
# `max` here is the column aggregate pulled in by the pyspark.sql.functions
# star import, not Python's built-in max.
(
    df
    .select(max(df["salary"]))
    .show()
)

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# min(): smallest value in a column.
# `min` here is the column aggregate pulled in by the pyspark.sql.functions
# star import, not Python's built-in min.
(
    df
    .select(min(df["salary"]))
    .show()
)

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# mean(): arithmetic mean of a column; an alias for avg().
(
    df
    .select(mean("salary"))
    .show()
)

#-------------------------------------------------------------------------

In [4]:
#-------------------------------------------------------------------------

# sum(): total of the values in a column.
# `sum` here is the column aggregate pulled in by the pyspark.sql.functions
# star import, not Python's built-in sum.
(
    df
    .select(sum(df["salary"]))
    .show()
)

#-------------------------------------------------------------------------

+-----------+
|sum(salary)|
+-----------+
|      29900|
+-----------+



In [None]:
#-------------------------------------------------------------------------

# sumDistinct(): total of the distinct values in a column (each value counted once).
# NOTE(review): newer PySpark releases appear to offer this as sum_distinct and
# deprecate the camelCase name — confirm against the installed version before renaming.
(
    df
    .select(sumDistinct(df["salary"]))
    .show()
)

#-------------------------------------------------------------------------