# GroupBy and Aggregate Functions

In [None]:
from pyspark.sql import SparkSession

# Start a Spark session named "Aggs", or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Aggs")
    .getOrCreate()
)

In [None]:
# Read the sales CSV: the first line supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv(
    "/data/sales_info.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Print the first rows of the DataFrame as a text table.
df.show()

In [None]:
# Print each column's name and data type.
df.printSchema()

In [None]:
# groupBy returns a GroupedData object (comparable to pandas' DataFrame.groupby);
# an aggregation must be applied to it to get a DataFrame back.
df.groupBy("Company")

In [None]:
# Calling an aggregate method (here .min(); .mean(), .max() and .sum() work the
# same way) on a GroupedData object returns a DataFrame.
df.groupBy("Company").min().show()

In [None]:
# Number of rows recorded for each company.
(
    df.groupBy("Company")
    .count()
    .show()
)

GroupedData API reference (lists the available aggregate methods): https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.GroupedData.html

In [None]:
# Aggregations can also be applied to the whole DataFrame without grouping
# first (e.g. to get column totals).
# Each show() is a separate statement: show() returns None, so combining the
# calls with a comma would only build a (None, None) tuple that the notebook
# echoes as cell output.
df.agg({"Sales": "max"}).show()
df.agg({"Sales": "sum"}).show()

In [None]:
# Store the GroupedData object in a variable so aggregations can be applied to it later.
group_data = df.groupBy("Company")

In [None]:
# agg() accepts a {column name: aggregate function name} dict;
# here it computes the maximum Sales value for each company.
group_data.agg({"Sales":"max"}).show()

In [None]:
# Import useful functions from pyspark
from pyspark.sql.functions import countDistinct, avg, stddev

In [None]:
# Functions from pyspark.sql.functions build column expressions that can be
# passed to df.select; here: the number of distinct Sales values.
df.select(countDistinct('Sales')).show()

In [None]:
# .alias() gives the output column a readable name in place of the
# auto-generated one.
df.select(stddev('Sales').alias("StdDev Sales")).show()

In [None]:
# Can format numbers with pyspark.sql function format_number()
from pyspark.sql.functions import format_number

In [None]:
# Keep the standard deviation of Sales in its own DataFrame so it can be
# formatted in a later step.
sales_std = df.select(stddev('Sales').alias("StdDev Sales"))

In [None]:
# format_number renders the value as a string rounded to 2 decimal places;
# the alias restores the readable column name on the formatted column.
sales_std.select(
    format_number("StdDev Sales", 2).alias("StdDev Sales")
).show()

In [None]:
# Ordering and Sorting
# Sort rows by Sales, highest first, using the ascending keyword.
df.orderBy("Sales", ascending=False).show()

In [None]:
# Descending sort on Sales expressed through the column's .desc() method.
df.orderBy(df["Sales"].desc()).show()