# GroupedData.agg

- Aggregates covered here: min, max, sum (total), mean (or avg), and count.
- `agg` computes the aggregates and returns the result as a DataFrame.
- It accepts the built-in aggregation functions, such as avg, max, min, sum, and count.

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Obtain the SparkSession used by every cell below (getOrCreate reuses an existing session if there is one).
spark = SparkSession.builder.getOrCreate()

In [3]:
# Load the housing data: the first CSV line supplies the column names,
# and Spark infers each column's type from the data.
df = spark.read.csv("house.csv", header=True, inferSchema=True)

In [4]:
# Preview the loaded rows to check the columns and values.
df.show()

+-------+-----------+---------+----------+---------+
|   City|OverallQual|GrLivArea|GarageArea|SalePrice|
+-------+-----------+---------+----------+---------+
| Mumbai|          7|     1710|       548|   208500|
|  Delhi|          6|     1262|       460|   181500|
|Kolkata|          7|     1786|       608|   223500|
|   Pune|          7|     1717|       642|   140000|
|Lucknow|          8|     2198|       836|   250000|
| Bhopal|          5|     1362|       480|   143000|
|Lucknow|          8|     1694|       636|   307000|
| Bhopal|          7|     2090|       484|   200000|
| Bhopal|          7|     1774|       468|   129900|
|Kolkata|          5|     1077|       205|   118000|
|Lucknow|          5|     1040|       384|   129500|
|  Delhi|          9|     2324|       736|   345000|
| Mumbai|          5|      912|       352|   144000|
| Mumbai|          7|     1494|       840|   279500|
| Bhopal|          6|     1253|       352|   157000|
| Mumbai|          7|      854|       576|   1

In [8]:
# List each city value once.
df.select("city").distinct().show()

+-------+
|   city|
+-------+
|Lucknow|
| Mumbai|
|Kolkata|
|   Pune|
|  Delhi|
| Bhopal|
+-------+



In [5]:
# Group the rows by City; the aggregation cells below all reuse this grouped object.
gdf = df.groupBy(df.City)

In [6]:
# Displaying gdf shows only the GroupedData object itself; an aggregation is needed to get a DataFrame.
gdf

<pyspark.sql.group.GroupedData at 0x2d1703c8190>

In [7]:
# Minimum SalePrice within each city.
gdf.agg(F.min(df.SalePrice)).show()

+-------+--------------+
|   City|min(SalePrice)|
+-------+--------------+
|Lucknow|         34900|
| Mumbai|         62383|
|Kolkata|         60000|
|   Pune|         76000|
|  Delhi|         82000|
| Bhopal|         40000|
+-------+--------------+



In [9]:
# Same per-city minimum, with alias() giving the result column a readable name.
min_sale_price = F.min(df.SalePrice).alias("Min_Sale_Price")
gdf.agg(min_sale_price).show()

+-------+--------------+
|   City|Min_Sale_Price|
+-------+--------------+
|Lucknow|         34900|
| Mumbai|         62383|
|Kolkata|         60000|
|   Pune|         76000|
|  Delhi|         82000|
| Bhopal|         40000|
+-------+--------------+



In [11]:
# Minimum GrLivArea within each city.
gdf.agg(F.min(df.GrLivArea)).show()

+-------+--------------+
|   City|min(GrLivArea)|
+-------+--------------+
|Lucknow|           720|
| Mumbai|           630|
|Kolkata|           803|
|   Pune|           784|
|  Delhi|           694|
| Bhopal|           520|
+-------+--------------+



In [12]:
# Maximum SalePrice within each city.
gdf.agg(F.max(df.SalePrice)).show()

+-------+--------------+
|   City|max(SalePrice)|
+-------+--------------+
|Lucknow|        475000|
| Mumbai|        555000|
|Kolkata|        438780|
|   Pune|        374000|
|  Delhi|        426000|
| Bhopal|        430000|
+-------+--------------+



In [13]:
# Same per-city maximum, with alias() giving the result column a readable name.
max_sale_price = F.max(df.SalePrice).alias("Max_Sale_Price")
gdf.agg(max_sale_price).show()

+-------+--------------+
|   City|Max_Sale_Price|
+-------+--------------+
|Lucknow|        475000|
| Mumbai|        555000|
|Kolkata|        438780|
|   Pune|        374000|
|  Delhi|        426000|
| Bhopal|        430000|
+-------+--------------+



In [14]:
# Maximum GrLivArea within each city.
gdf.agg(F.max(df.GrLivArea)).show()

+-------+--------------+
|   City|max(GrLivArea)|
+-------+--------------+
|Lucknow|          3608|
| Mumbai|          2646|
|Kolkata|          3222|
|   Pune|          3112|
|  Delhi|          2794|
| Bhopal|          3493|
+-------+--------------+



In [15]:
# Total (sum) of SalePrice within each city.
gdf.agg(F.sum(df.SalePrice)).show()

+-------+--------------+
|   City|sum(SalePrice)|
+-------+--------------+
|Lucknow|      15449311|
| Mumbai|      20616495|
|Kolkata|      13597272|
|   Pune|       6436340|
|  Delhi|      15233889|
| Bhopal|      19925195|
+-------+--------------+



In [16]:
gdf.agg(F.max(df.SalePrice).alias("Total_Sale_Price")).show()

+-------+----------------+
|   City|Total_Sale_Price|
+-------+----------------+
|Lucknow|          475000|
| Mumbai|          555000|
|Kolkata|          438780|
|   Pune|          374000|
|  Delhi|          426000|
| Bhopal|          430000|
+-------+----------------+



In [17]:
# Mean SalePrice within each city (the result column is reported as avg(SalePrice)).
gdf.agg(F.mean(df.SalePrice)).show()

+-------+------------------+
|   City|    avg(SalePrice)|
+-------+------------------+
|Lucknow|190732.23456790124|
| Mumbai|176209.35897435897|
|Kolkata| 183746.9189189189|
|   Pune|165034.35897435897|
|  Delhi|195306.26923076922|
| Bhopal|179506.26126126127|
+-------+------------------+



In [19]:
# mean and avg are the same aggregate (both produce the avg(SalePrice) column), so either one can be used.

In [18]:
# Average SalePrice within each city, written with avg instead of mean.
gdf.agg(F.avg(df.SalePrice)).show()

+-------+------------------+
|   City|    avg(SalePrice)|
+-------+------------------+
|Lucknow|190732.23456790124|
| Mumbai|176209.35897435897|
|Kolkata| 183746.9189189189|
|   Pune|165034.35897435897|
|  Delhi|195306.26923076922|
| Bhopal|179506.26126126127|
+-------+------------------+



In [20]:
# Number of rows per city that have a non-null SalePrice.
gdf.agg(F.count(df.SalePrice)).show()

+-------+----------------+
|   City|count(SalePrice)|
+-------+----------------+
|Lucknow|              81|
| Mumbai|             117|
|Kolkata|              74|
|   Pune|              39|
|  Delhi|              78|
| Bhopal|             111|
+-------+----------------+



In [None]:
# NOTE(review): this cell was never executed and repeats the coalesce cell that follows it; it is a candidate for removal.
df.select(F.coalesce(df["city"], df["SalePrice"])).show()

In [10]:
# coalesce returns, for each row, the first non-null value among the listed columns;
# city is listed first, so SalePrice is used only where city is null.
df.select(F.coalesce(df["city"], df["SalePrice"])).show()

+-------------------------+
|coalesce(city, SalePrice)|
+-------------------------+
|                   Mumbai|
|                    Delhi|
|                  Kolkata|
|                     Pune|
|                  Lucknow|
|                   Bhopal|
|                  Lucknow|
|                   Bhopal|
|                   Bhopal|
|                  Kolkata|
|                  Lucknow|
|                    Delhi|
|                   Mumbai|
|                   Mumbai|
|                   Bhopal|
|                   Mumbai|
|                    Delhi|
|                  Kolkata|
|                     Pune|
|                  Lucknow|
+-------------------------+
only showing top 20 rows



In [14]:
# Gather the city values into a single array; show() abbreviates the long cell by default.
df.agg(F.collect_list('city')).show()

+--------------------+
|  collect_list(city)|
+--------------------+
|[Mumbai, Delhi, K...|
+--------------------+



In [13]:
# Same aggregation, with truncate=False so the full array is printed instead of being abbreviated.
df.agg(F.collect_list('city')).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [22]:
# Join city and SalePrice into one string per row (no separator), exposed as column city_price.
df.select(F.concat(df["city"], df["SalePrice"]).alias('city_price')).show()

+-------------+
|   city_price|
+-------------+
| Mumbai208500|
|  Delhi181500|
|Kolkata223500|
|   Pune140000|
|Lucknow250000|
| Bhopal143000|
|Lucknow307000|
| Bhopal200000|
| Bhopal129900|
|Kolkata118000|
|Lucknow129500|
|  Delhi345000|
| Mumbai144000|
| Mumbai279500|
| Bhopal157000|
| Mumbai132000|
|  Delhi149000|
| Kolkata90000|
|   Pune159000|
|Lucknow139000|
+-------------+
only showing top 20 rows



In [23]:
# Number of distinct city values in the whole DataFrame.
df.agg(F.countDistinct('city')).show()

+-----------+
|count(city)|
+-----------+
|          6|
+-----------+



In [24]:
# List the distinct city values themselves, to compare with the distinct count.
df.select("city").distinct().show()

+-------+
|   city|
+-------+
|Lucknow|
| Mumbai|
|Kolkata|
|   Pune|
|  Delhi|
| Bhopal|
+-------+



In [28]:
# Show the raw City column for comparison with the length() output that follows.
df.select("City").show()

+-------+
|   City|
+-------+
| Mumbai|
|  Delhi|
|Kolkata|
|   Pune|
|Lucknow|
| Bhopal|
|Lucknow|
| Bhopal|
| Bhopal|
|Kolkata|
|Lucknow|
|  Delhi|
| Mumbai|
| Mumbai|
| Bhopal|
| Mumbai|
|  Delhi|
|Kolkata|
|   Pune|
|Lucknow|
+-------+
only showing top 20 rows



In [27]:
# Character length of each City value.
df.select(F.length("City")).show()

+------------+
|length(City)|
+------------+
|           6|
|           5|
|           7|
|           4|
|           7|
|           6|
|           7|
|           6|
|           6|
|           7|
|           7|
|           5|
|           6|
|           6|
|           6|
|           6|
|           5|
|           7|
|           4|
|           7|
+------------+
only showing top 20 rows



In [29]:
from pyspark.sql.functions import col,lit

In [30]:
# Add a constant column: lit("1") yields the same literal value on every row.
# Written through the F namespace imported at the top of the notebook.
constant_one = F.lit("1").alias("lit_value1")
df.select(F.col("City"), constant_one).show()

+-------+----------+
|   City|lit_value1|
+-------+----------+
| Mumbai|         1|
|  Delhi|         1|
|Kolkata|         1|
|   Pune|         1|
|Lucknow|         1|
| Bhopal|         1|
|Lucknow|         1|
| Bhopal|         1|
| Bhopal|         1|
|Kolkata|         1|
|Lucknow|         1|
|  Delhi|         1|
| Mumbai|         1|
| Mumbai|         1|
| Bhopal|         1|
| Mumbai|         1|
|  Delhi|         1|
|Kolkata|         1|
|   Pune|         1|
|Lucknow|         1|
+-------+----------+
only showing top 20 rows



In [31]:
# last() is an aggregate, so the whole DataFrame collapses to a single row holding the last City value.
# NOTE(review): no ordering is applied here, so which row counts as "last" depends on how Spark
# happens to read the data — confirm whether a deterministic order (e.g. an explicit sort) is needed.
df.select(F.last("City")).show()

+-----------------+
|last(City, false)|
+-----------------+
|          Lucknow|
+-----------------+

