# Aggregating DataFrames with SQL

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the Spark session for this notebook (getOrCreate reuses an active one).
spark = (
    SparkSession.builder
    .appName('Spark SQL Query DataFrames')
    .getOrCreate()
)

---

In [3]:
# Relative path of the data directory and of the JSON file inside it.
data_path = '../Data'
file_path = f'{data_path}/utilization.json'

In [4]:
# Load the JSON file into a DataFrame.
json_reader = spark.read.format('json')
df = json_reader.load(file_path)

In [6]:
# Preview the first five rows.
df.show(n=5)

+----+-------------------+-----------+---------+-------------+
| cpu|     event_datetime|free_memory|server_id|session_count|
+----+-------------------+-----------+---------+-------------+
|0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|0.35|03/05/2019 08:26:14|       0.46|      100|           43|
+----+-------------------+-----------+---------+-------------+
only showing top 5 rows



-------


In [7]:
# Register the DataFrame under a view name that SQL queries can reference.
view_name = 'vw_utilization'
df.createOrReplaceTempView(view_name)

In [13]:
# Count every row in the view.
total_count_query = 'SELECT count(*) AS total_count FROM vw_utilization'
count = spark.sql(total_count_query)

In [17]:
# Print the row-count result (20 is show()'s default row limit).
count.show(n=20)

+-----------+
|total_count|
+-----------+
|         10|
+-----------+



In [20]:
# Count the rows recorded for each server_id.
# The query is a triple-quoted string rather than a single-quoted literal
# continued with backslashes: a backslash continuation breaks as soon as any
# whitespace follows it, and it embeds the source indentation in the SQL text.
results = spark.sql("""
    SELECT server_id, COUNT(*) AS total
    FROM vw_utilization
    GROUP BY server_id
""")

In [21]:
# Print the per-server counts (20 is show()'s default row limit).
results.show(n=20)

+---------+-----+
|server_id|total|
+---------+-----+
|      100|    5|
|      200|    5|
+---------+-----+



In [22]:
# Count, for each server_id, the rows whose session_count exceeds 50.
# The query is a triple-quoted string rather than a single-quoted literal
# continued with backslashes: a backslash continuation breaks as soon as any
# whitespace follows it, and it embeds the source indentation in the SQL text.
results = spark.sql("""
    SELECT server_id, COUNT(*) AS total
    FROM vw_utilization
    WHERE session_count > 50
    GROUP BY server_id
""")

In [23]:
# Print the filtered per-server counts (20 is show()'s default row limit).
results.show(n=20)

+---------+-----+
|server_id|total|
+---------+-----+
|      100|    1|
|      200|    1|
+---------+-----+



In [26]:
# min, max session count of each server id
results = spark.sql('SELECT server_id, min(session_count) AS min_session_count, max(session_count), avg(session_count) AS max_session_count \
                                FROM vw_utilization \
                                GROUP BY server_id')

In [29]:
results.show()

+---------+-----------------+------------------+-----------------+
|server_id|min_session_count|max(session_count)|max_session_count|
+---------+-----------------+------------------+-----------------+
|      100|               43|                62|             49.0|
|      200|               43|                62|             49.0|
+---------+-----------------+------------------+-----------------+

