# Aggregating using the DataFrame API

In [46]:
from pyspark.sql import SparkSession
# Obtain the Spark entry point: getOrCreate() hands back the session that is
# already running in this kernel, or starts a new one if there is none.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [47]:
# Directory holding the input data, relative to the notebook's working directory.
data_path = '../Data'
# Full path of the CSV file with the per-location temperature readings.
file_path = f'{data_path}/location_temp.csv'

In [48]:
# Load the readings, using the first row of the file as column names.
# inferSchema=True lets Spark derive column types from the data (at the cost
# of one extra pass over the file). Without it every column is read as a
# string, and aggregations such as max() on temp_celcius would compare text
# lexicographically instead of numerically.
df = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load(file_path)
)

In [49]:
# Preview the first five rows to check that the file loaded as expected.
df.show(n=5)

+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|       loc0|          29|
|03/04/2019 19:53:06|       loc0|          27|
|03/04/2019 19:58:06|       loc0|          28|
|03/04/2019 20:03:06|       loc0|          30|
|03/04/2019 20:08:06|       loc0|          27|
+-------------------+-----------+------------+
only showing top 5 rows



# GroupBy

We want to know how many measurements we have for each location.

In [50]:
# One output row per location_id, with the number of readings in that group.
# No ordering is requested, so the groups come back in arbitrary order.
(
    df
    .groupBy('location_id')
    .count()
    .show()
)

+-----------+-----+
|location_id|count|
+-----------+-----+
|     loc196| 1000|
|     loc226| 1000|
|     loc463| 1000|
|     loc150| 1000|
|     loc292| 1000|
|     loc311| 1000|
|      loc22| 1000|
|     loc351| 1000|
|     loc370| 1000|
|     loc419| 1000|
|      loc31| 1000|
|     loc305| 1000|
|      loc82| 1000|
|      loc90| 1000|
|     loc118| 1000|
|     loc195| 1000|
|     loc208| 1000|
|      loc39| 1000|
|      loc75| 1000|
|     loc228| 1000|
+-----------+-----+
only showing top 20 rows



# Order By

In [51]:
# Sort the raw readings by location_id (ascending is the default).
# Only the sort key is ordered: rows that share a location_id appear in no
# particular order, as the event_date column of the output shows.
(
    df
    .orderBy('location_id')
    .show()
)

+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 21:23:06|       loc0|          28|
|03/04/2019 20:43:06|       loc0|          28|
|03/04/2019 21:18:06|       loc0|          33|
|03/04/2019 20:18:06|       loc0|          27|
|03/04/2019 20:38:06|       loc0|          32|
|03/04/2019 20:58:06|       loc0|          34|
|03/04/2019 21:13:06|       loc0|          28|
|03/04/2019 19:58:06|       loc0|          28|
|03/04/2019 20:13:06|       loc0|          27|
|03/04/2019 20:28:06|       loc0|          32|
|03/04/2019 20:33:06|       loc0|          35|
|03/04/2019 20:48:06|       loc0|          28|
|03/04/2019 20:53:06|       loc0|          32|
|03/04/2019 21:03:06|       loc0|          33|
|03/04/2019 21:08:06|       loc0|          27|
|03/04/2019 19:48:06|       loc0|          29|
|03/04/2019 19:53:06|       loc0|          27|
|03/04/2019 20:03:06|       loc0|          30|
|03/04/2019 2

# GroupBy + OrderBy

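We want to know how many measurements we have for each location, sorted by location ID. Because `location_id` is a string, the sort is lexicographic, which is why `loc100` appears before `loc11` in the output.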

In [52]:
# Count the readings per location, then sort the grouped result by location_id.
# location_id is text, so the order is lexicographic (loc1, loc10, loc100, ...).
(
    df
    .groupBy('location_id')
    .count()
    .orderBy('location_id', ascending=True)
    .show()
)

+-----------+-----+
|location_id|count|
+-----------+-----+
|       loc0| 1000|
|       loc1| 1000|
|      loc10| 1000|
|     loc100| 1000|
|     loc101| 1000|
|     loc102| 1000|
|     loc103| 1000|
|     loc104| 1000|
|     loc105| 1000|
|     loc106| 1000|
|     loc107| 1000|
|     loc108| 1000|
|     loc109| 1000|
|      loc11| 1000|
|     loc110| 1000|
|     loc111| 1000|
|     loc112| 1000|
|     loc113| 1000|
|     loc114| 1000|
|     loc115| 1000|
+-----------+-----+
only showing top 20 rows



# Aggregation / Aggregate functions

- count() 
- min()
- max()
- sum()
- mean()
- avg()
- agg() : Using this, we can calculate more than one aggregate at a time.
- pivot() 


We want to know the average temperature of each location.

In [62]:
# Average temperature per location, using the dict form of agg():
# {column name: aggregate function name}. The result column is named
# avg(temp_celcius) even though the function is requested as 'mean'.
(
    df
    .groupBy('location_id')
    .agg({'temp_celcius': 'mean'})
    .show()
)

+-----------+-----------------+
|location_id|avg(temp_celcius)|
+-----------+-----------------+
|     loc196|           29.225|
|     loc226|           25.306|
|     loc463|           23.317|
|     loc150|           32.188|
|     loc292|           29.159|
|     loc311|           24.308|
|      loc22|           28.251|
|     loc351|           28.194|
|     loc370|            29.14|
|     loc419|           29.141|
|      loc31|           25.196|
|     loc305|           27.314|
|      loc82|           27.355|
|      loc90|           23.216|
|     loc118|           24.219|
|     loc195|            27.25|
|     loc208|           26.206|
|      loc39|           25.199|
|      loc75|           23.209|
|     loc228|           27.295|
+-----------+-----------------+
only showing top 20 rows



In [54]:
# Same aggregation requested as 'avg' instead of 'mean'; the output is
# identical, showing that the two names refer to the same aggregate.
(
    df
    .groupBy('location_id')
    .agg({'temp_celcius': 'avg'})
    .show()
)

+-----------+-----------------+
|location_id|avg(temp_celcius)|
+-----------+-----------------+
|     loc196|           29.225|
|     loc226|           25.306|
|     loc463|           23.317|
|     loc150|           32.188|
|     loc292|           29.159|
|     loc311|           24.308|
|      loc22|           28.251|
|     loc351|           28.194|
|     loc370|            29.14|
|     loc419|           29.141|
|      loc31|           25.196|
|     loc305|           27.314|
|      loc82|           27.355|
|      loc90|           23.216|
|     loc118|           24.219|
|     loc195|            27.25|
|     loc208|           26.206|
|      loc39|           25.199|
|      loc75|           23.209|
|     loc228|           27.295|
+-----------+-----------------+
only showing top 20 rows



In [55]:
# Average temperature per location, with the grouped result sorted by
# location_id so the rows come back in a stable, readable order.
(
    df
    .groupBy('location_id')
    .agg({'temp_celcius': 'mean'})
    .orderBy('location_id', ascending=True)
    .show()
)

+-----------+-----------------+
|location_id|avg(temp_celcius)|
+-----------+-----------------+
|       loc0|           29.176|
|       loc1|           28.246|
|      loc10|           25.337|
|     loc100|           27.297|
|     loc101|           25.317|
|     loc102|           30.327|
|     loc103|           25.341|
|     loc104|           26.204|
|     loc105|           26.217|
|     loc106|           27.201|
|     loc107|           33.268|
|     loc108|           32.195|
|     loc109|           24.138|
|      loc11|           25.308|
|     loc110|           26.239|
|     loc111|           31.391|
|     loc112|           33.359|
|     loc113|           30.345|
|     loc114|           29.261|
|     loc115|           23.239|
+-----------+-----------------+
only showing top 20 rows



What is the max temperature for each location?

In [56]:
# Highest temperature recorded at each location.
# NOTE(review): max() is a numeric maximum only if temp_celcius is a numeric
# column; on a string column Spark compares the text instead. Confirm the
# column type with df.printSchema().
(
    df
    .groupBy('location_id')
    .agg({'temp_celcius': 'max'})
    .show()
)

+-----------+-----------------+
|location_id|max(temp_celcius)|
+-----------+-----------------+
|     loc196|               36|
|     loc226|               32|
|     loc463|               30|
|     loc150|               39|
|     loc292|               36|
|     loc311|               31|
|      loc22|               35|
|     loc351|               35|
|     loc370|               36|
|     loc419|               36|
|     loc305|               34|
|      loc31|               32|
|     loc118|               31|
|     loc195|               34|
|     loc208|               33|
|      loc82|               34|
|      loc90|               30|
|     loc228|               34|
|      loc39|               32|
|      loc75|               30|
+-----------+-----------------+
only showing top 20 rows

