# Timeseries Analysis 

In [10]:
from pyspark.sql import SparkSession

In [11]:
# Reuse the active SparkSession if one exists, otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

In [12]:
# Relative location of the input data; file_path points at the utilization CSV inside it.
data_path = '../Data'
file_path = f'{data_path}/utilization.csv'

In [13]:
# header=False: the first line is read as data, so Spark auto-names the columns
# (_c0, _c1, ...). inferSchema=True: Spark derives the column types from the data.
df = spark.read.csv(file_path, header=False, inferSchema=True)

In [14]:
# Give the auto-generated CSV columns (_c0 ... _c4) meaningful names.
column_names = {
    '_c0': 'event_datetime',
    '_c1': 'server_id',
    '_c2': 'cpu_utilization',
    '_c3': 'free_memory',
    '_c4': 'session_count',
}
for generated_name, readable_name in column_names.items():
    df = df.withColumnRenamed(generated_name, readable_name)

In [16]:
# Register the DataFrame as the temporary view 'vw_utilization' so that the
# spark.sql(...) queries below can refer to it by name.
df.createOrReplaceTempView('vw_utilization')

--------

# Timeseries Analysis 

In [19]:
# Summary statistics: minimum, maximum and standard deviation of
# cpu_utilization, computed once per server_id (one output row per server).

results = spark.sql('SELECT server_id, min(cpu_utilization), max(cpu_utilization), stddev(cpu_utilization) \
                                FROM vw_utilization \
                                GROUP BY server_id')

In [20]:
# Print the first 10 rows of the per-server summary.
results.show(10)

+---------+--------------------+--------------------+-----------------------+
|server_id|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+---------+--------------------+--------------------+-----------------------+
|      148|                0.54|                0.94|    0.11451712518744131|
|      137|                0.54|                0.94|    0.11526245077758812|
|      133|                0.55|                0.95|    0.11534006553263144|
|      108|                0.55|                0.95|    0.11563100171171926|
|      101|                 0.6|                 1.0|    0.11651726263197697|
|      115|                0.44|                0.84|    0.11569664615015006|
|      126|                0.48|                0.88|    0.11542612970702051|
|      103|                0.56|                0.96|    0.11617507884178278|
|      128|                0.38|                0.78|     0.1153254132405078|
|      122|                0.43|                0.83|    0.11563

# Window function / Partition

### What is the average CPU utilization per server_id?

The following query calculates the average CPU utilization over a window partitioned by server_id.
Unlike a GROUP BY aggregate, the window function keeps every row and attaches the per-server average to each of them.

In [21]:
# Attach each server's average cpu_utilization (avg_server_util) to every one of
# its rows: the average is computed over the server_id partition, and no rows
# are collapsed.
results = spark.sql('SELECT event_datetime, server_id, cpu_utilization, \
                                avg(cpu_utilization) OVER (PARTITION BY server_id) AS avg_server_util \
                                FROM vw_utilization')

In [22]:
# Preview the readings together with their per-server average.
results.show()

+-------------------+---------+---------------+------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|
+-------------------+---------+---------------+------------------+
|03/05/2019 08:07:41|      148|           0.85|0.7393840000000045|
|03/05/2019 08:12:41|      148|           0.94|0.7393840000000045|
|03/05/2019 08:17:41|      148|           0.89|0.7393840000000045|
|03/05/2019 08:22:41|      148|           0.74|0.7393840000000045|
|03/05/2019 08:27:41|      148|           0.63|0.7393840000000045|
|03/05/2019 08:32:41|      148|           0.89|0.7393840000000045|
|03/05/2019 08:37:41|      148|           0.77|0.7393840000000045|
|03/05/2019 08:42:41|      148|           0.59|0.7393840000000045|
|03/05/2019 08:47:41|      148|           0.77|0.7393840000000045|
|03/05/2019 08:52:41|      148|           0.71|0.7393840000000045|
|03/05/2019 08:57:41|      148|           0.85|0.7393840000000045|
|03/05/2019 09:02:41|      148|           0.73|0.7393840000000

### Is each CPU utilization reading over or under the server's average utilization?
- Compute the difference between each individual reading and the server's average utilization (a positive value means the reading is above average).

In [24]:
# Compare every reading with its server's average utilization.
# The per-server window is declared once (WINDOW w) and reused, so the average
# and the delta are always computed over exactly the same partition.
# delta_server_util > 0 means the reading is above the server's average.
results = spark.sql("""
    SELECT event_datetime, server_id, cpu_utilization,
           avg(cpu_utilization) OVER w AS avg_server_util,
           cpu_utilization - (avg(cpu_utilization) OVER w) AS delta_server_util
    FROM vw_utilization
    WINDOW w AS (PARTITION BY server_id)
""")

In [26]:
# Preview the readings with their per-server average and the delta from it.
results.show()

+-------------------+---------+---------------+------------------+--------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|   delta_server_util|
+-------------------+---------+---------------+------------------+--------------------+
|03/05/2019 08:07:41|      148|           0.85|0.7393840000000045|  0.1106159999999955|
|03/05/2019 08:12:41|      148|           0.94|0.7393840000000045| 0.20061599999999546|
|03/05/2019 08:17:41|      148|           0.89|0.7393840000000045| 0.15061599999999553|
|03/05/2019 08:22:41|      148|           0.74|0.7393840000000045| 6.15999999995509E-4|
|03/05/2019 08:27:41|      148|           0.63|0.7393840000000045|-0.10938400000000448|
|03/05/2019 08:32:41|      148|           0.89|0.7393840000000045| 0.15061599999999553|
|03/05/2019 08:37:41|      148|           0.77|0.7393840000000045|0.030615999999995536|
|03/05/2019 08:42:41|      148|           0.59|0.7393840000000045| -0.1493840000000045|
|03/05/2019 08:47:41|      148| 

# Sliding Windows
- Calculate the average over each row's nearest neighbors (for example, the average of the previous 3 values, or the average of the next 3 values).

**In our example, the window covers the current row plus 1 row before and 1 row after it. The rows therefore have to be ordered by datetime to make sure the events are in sequential order.**

In [39]:
# Sliding (moving) average per server: for every row, average cpu_utilization
# over the previous row, the current row and the next row.
#
# event_datetime is held as a string such as '03/05/2019 08:07:41', so ordering
# by the raw column would sort the values as text (month/day digits first, year
# last) and put rows from different years in the wrong order. The window is
# therefore ordered by the parsed timestamp.
# NOTE(review): the pattern assumes month-first dates (MM/dd/yyyy) — TODO confirm
# against the data source; use 'dd/MM/yyyy HH:mm:ss' if the file is day-first.
results = spark.sql("""
    SELECT event_datetime, server_id, cpu_utilization,
           avg(cpu_utilization) OVER (
               PARTITION BY server_id
               ORDER BY to_timestamp(event_datetime, 'MM/dd/yyyy HH:mm:ss')
               ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
           ) AS avg_server_util
    FROM vw_utilization
""")

In [40]:
# Preview the readings together with their 3-row sliding average.
results.show()

+-------------------+---------+---------------+------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|
+-------------------+---------+---------------+------------------+
|03/05/2019 08:07:41|      148|           0.85|             0.895|
|03/05/2019 08:12:41|      148|           0.94|0.8933333333333334|
|03/05/2019 08:17:41|      148|           0.89|0.8566666666666668|
|03/05/2019 08:22:41|      148|           0.74|0.7533333333333333|
|03/05/2019 08:27:41|      148|           0.63|0.7533333333333334|
|03/05/2019 08:32:41|      148|           0.89|0.7633333333333333|
|03/05/2019 08:37:41|      148|           0.77|              0.75|
|03/05/2019 08:42:41|      148|           0.59|              0.71|
|03/05/2019 08:47:41|      148|           0.77|              0.69|
|03/05/2019 08:52:41|      148|           0.71|0.7766666666666667|
|03/05/2019 08:57:41|      148|           0.85|0.7633333333333333|
|03/05/2019 09:02:41|      148|           0.73|0.8233333333333

In [42]:
# Hand check of the sliding average for the 2nd row of the output above (08:12:41):
# take the cpu_utilization of the row before it, the row itself and the row after it,
# add the three values together and divide by 3.

(0.85 + 0.94 + 0.89) / 3

0.8933333333333334