In [1]:
import os

from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session used by every later cell; getOrCreate() hands back
# an already-running session when there is one instead of starting another.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Directory that holds the course data files (utilization.json is read from it).
# The default is the original author's local folder; set the SPARK_SQL_DATA_DIR
# environment variable to run the notebook elsewhere without editing this cell.
# `or` (rather than a .get() default) also falls back when the variable is set
# but empty, which would otherwise yield the path "/utilization.json".
data_path = (
    os.environ.get("SPARK_SQL_DATA_DIR")
    or '/Users/danielsullivan/LinkedIn Learning/Spark SQL/data'
)

In [4]:
# Location of the utilization records inside the data directory.
json_df2_path = f"{data_path}/utilization.json"

# Read the JSON file into a DataFrame (schema is left to the JSON reader).
df_util = (
    spark.read
    .format("json")
    .load(json_df2_path)
)

In [5]:
# Register the DataFrame as a temporary view so the SQL cells below can refer
# to it as the table "utilization"; re-running replaces any existing view.
df_util.createOrReplaceTempView(name="utilization")

In [6]:
# One summary row per server: lowest, highest and standard deviation of CPU
# utilization. Spark labels the stddev() column stddev_samp(...) in the output.
spark.sql(
    'SELECT server_id, min(cpu_utilization), max(cpu_utilization), stddev(cpu_utilization) '
    '           FROM utilization '
    '           GROUP BY server_id'
).show()

+---------+--------------------+--------------------+----------------------------+
|server_id|min(cpu_utilization)|max(cpu_utilization)|stddev_samp(cpu_utilization)|
+---------+--------------------+--------------------+----------------------------+
|      112|                0.52|                0.92|         0.11528867845082576|
|      113|                0.58|                0.98|         0.11544345150353694|
|      130|                0.35|                0.75|         0.11568834774246008|
|      126|                0.48|                0.88|         0.11542612970702051|
|      149|                0.54|                0.94|         0.11543517500295467|
|      110|                0.35|                0.75|         0.11533251724450215|
|      136|                0.41|                 0.8|         0.11597405743182258|
|      144|                0.47|                0.87|         0.11478654960489501|
|      119|                0.22|                0.62|         0.11516031929842008|
|   

In [7]:
# Window query: every input row is kept and is annotated with the mean CPU
# utilization of its own server (the window has a partition but no ordering
# or frame, so the average covers all rows of that server).
sql_window = (
    "SELECT event_datetime, server_id, cpu_utilization,  "
    "         avg(cpu_utilization) OVER (PARTITION BY server_id) avg_server_util "
    "FROM  "
    "      utilization"
)

In [8]:
# Execute the per-server average window query and print a preview of the rows.
(
    spark
    .sql(sql_window)
    .show()
)

+-------------------+---------+---------------+------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|
+-------------------+---------+---------------+------------------+
|03/05/2019 08:06:34|      112|           0.71|0.7153870000000067|
|03/05/2019 08:11:34|      112|           0.78|0.7153870000000067|
|03/05/2019 08:16:34|      112|           0.87|0.7153870000000067|
|03/05/2019 08:21:34|      112|           0.82|0.7153870000000067|
|03/05/2019 08:26:34|      112|           0.62|0.7153870000000067|
|03/05/2019 08:31:34|      112|            0.9|0.7153870000000067|
|03/05/2019 08:36:34|      112|           0.89|0.7153870000000067|
|03/05/2019 08:41:34|      112|           0.81|0.7153870000000067|
|03/05/2019 08:46:34|      112|           0.88|0.7153870000000067|
|03/05/2019 08:51:34|      112|           0.89|0.7153870000000067|
|03/05/2019 08:56:34|      112|           0.84|0.7153870000000067|
|03/05/2019 09:01:34|      112|           0.71|0.7153870000000

In [9]:
# Same per-server average as before, plus delta_server_util: how far each
# reading sits above (positive) or below (negative) its server's mean.
sql_window2 = (
    "SELECT event_datetime, server_id, cpu_utilization,  "
    "         avg(cpu_utilization) OVER (PARTITION BY server_id) avg_server_util, "
    "         cpu_utilization - avg(cpu_utilization) OVER (PARTITION BY server_id) delta_server_util "
    "         FROM utilization"
)

In [10]:
# Execute the average-plus-delta window query and print a preview of the rows.
(
    spark
    .sql(sql_window2)
    .show()
)

+-------------------+---------+---------------+------------------+--------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|   delta_server_util|
+-------------------+---------+---------------+------------------+--------------------+
|03/05/2019 08:06:34|      112|           0.71|0.7153870000000067|-0.00538700000000...|
|03/05/2019 08:11:34|      112|           0.78|0.7153870000000067| 0.06461299999999337|
|03/05/2019 08:16:34|      112|           0.87|0.7153870000000067| 0.15461299999999334|
|03/05/2019 08:21:34|      112|           0.82|0.7153870000000067|  0.1046129999999933|
|03/05/2019 08:26:34|      112|           0.62|0.7153870000000067|-0.09538700000000666|
|03/05/2019 08:31:34|      112|            0.9|0.7153870000000067| 0.18461299999999337|
|03/05/2019 08:36:34|      112|           0.89|0.7153870000000067| 0.17461299999999336|
|03/05/2019 08:41:34|      112|           0.81|0.7153870000000067|  0.0946129999999934|
|03/05/2019 08:46:34|      112| 

In [11]:
# Moving average per server: each row is averaged with the row immediately
# before and immediately after it (in event_datetime order). The first and
# last row of a partition have only one neighbour, so their frame holds two
# values instead of three.
# NOTE(review): the displayed event_datetime values look like text
# ("03/05/2019 08:06:34"). If the column is a string, this ORDER BY sorts
# lexicographically, which is not chronological across all dates — confirm the
# column type and date format, and consider ordering by a parsed timestamp.
sql_window3 = (
    "SELECT event_datetime, server_id, cpu_utilization,  "
    "                      avg(cpu_utilization) OVER (PARTITION BY server_id ORDER BY event_datetime "
    "                                    ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) avg_server_util "
    "                FROM  "
    "                      utilization"
)


In [12]:
# Execute the moving-average window query and print a preview of the rows.
(
    spark
    .sql(sql_window3)
    .show()
)

+-------------------+---------+---------------+------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|
+-------------------+---------+---------------+------------------+
|03/05/2019 08:06:34|      112|           0.71|             0.745|
|03/05/2019 08:11:34|      112|           0.78|0.7866666666666666|
|03/05/2019 08:16:34|      112|           0.87|0.8233333333333333|
|03/05/2019 08:21:34|      112|           0.82|              0.77|
|03/05/2019 08:26:34|      112|           0.62|0.7799999999999999|
|03/05/2019 08:31:34|      112|            0.9|0.8033333333333333|
|03/05/2019 08:36:34|      112|           0.89|0.8666666666666667|
|03/05/2019 08:41:34|      112|           0.81|              0.86|
|03/05/2019 08:46:34|      112|           0.88|              0.86|
|03/05/2019 08:51:34|      112|           0.89|              0.87|
|03/05/2019 08:56:34|      112|           0.84|0.8133333333333334|
|03/05/2019 09:01:34|      112|           0.71|0.7999999999999

In [13]:
# Hand check of the first moving-average value for server 112: the first row
# has no preceding row, so its frame is just the first two readings.
(0.71 + 0.78) / 2

0.745

In [14]:
# Hand check of the second moving-average value for server 112: the frame is
# the previous, current and next readings.
(0.71 + 0.78 + 0.87) / 3

0.7866666666666666