In [83]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql import Row
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType

In [2]:
# Start (or reuse) the Spark session that every later cell goes through.
spark = SparkSession.builder.appName("Spark SQL Query Dataframes").getOrCreate()

In [3]:
# Directory (relative to the notebook) that holds the input files.
data_path = './data'

In [4]:
# Path of the JSON file with the server utilization readings.
file_path = f'{data_path}/utilization.json'

In [5]:
# Read the JSON utilization file into a Spark DataFrame (schema is inferred).
sdf = spark.read.format("json").load(file_path)

In [6]:
# Preview the first three rows of the loaded frame.
sdf.show(3)

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.64|03/28/2019 08:16:52|       0.38|      122|           72|
|           0.66|03/28/2019 08:21:52|       0.22|      122|           69|
|           0.55|03/28/2019 08:26:52|       0.52|      122|           80|
+---------------+-------------------+-----------+---------+-------------+
only showing top 3 rows



In [7]:
# Print the column names and types Spark derived from the JSON file.
sdf.printSchema()

root
 |-- cpu_utilization: double (nullable = true)
 |-- event_datetime: string (nullable = true)
 |-- free_memory: double (nullable = true)
 |-- server_id: long (nullable = true)
 |-- session_count: long (nullable = true)



In [8]:
# Total number of rows in the frame (bare expression so the notebook renders it).
sdf.count()

500000

In [9]:
# Register the frame as the temp view "utilization" so spark.sql() can query it by name.
sdf.createOrReplaceTempView("utilization")

In [17]:
# Simplest possible query: every column, capped at three rows.
sdf_sql = spark.sql("SELECT * FROM utilization LIMIT 3")

In [18]:
# Display the query result (the LIMIT already keeps it to three rows).
sdf_sql.show()

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.64|03/28/2019 08:16:52|       0.38|      122|           72|
|           0.66|03/28/2019 08:21:52|       0.22|      122|           69|
|           0.55|03/28/2019 08:26:52|       0.52|      122|           80|
+---------------+-------------------+-----------+---------+-------------+



In [12]:
# Number of rows in the current sdf_sql result.
# NOTE(review): the saved output below reads 10, but the sdf_sql query above uses
# LIMIT 3 and this cell's execution count (12) is lower than that cell's (17) --
# the output looks stale from an earlier run; re-run the notebook top to bottom.
sdf_sql.count()

10

In [16]:
# Project just two columns instead of SELECT *; LIMIT keeps the result to three rows.
sdf_sql = spark.sql("SELECT server_id, session_count FROM utilization LIMIT 3")
sdf_sql.show()

+---------+-------------+
|server_id|session_count|
+---------+-------------+
|      122|           72|
|      122|           69|
|      122|           80|
+---------+-------------+



In [15]:
# Same projection, but with the columns renamed via SQL aliases (sid, sc).
sdf_sql = spark.sql("SELECT server_id as sid, session_count as sc FROM utilization")
# No LIMIT in the query, so cap the display at three rows instead.
sdf_sql.show(3)

+---+---+
|sid| sc|
+---+---+
|122| 72|
|122| 69|
|122| 80|
+---+---+
only showing top 3 rows



In [20]:
# Filter rows with WHERE: keep only the readings of server 120.
sdf_sql = spark.sql("SELECT * FROM utilization WHERE server_id = 120")
sdf_sql.show(3)

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.66|03/05/2019 08:06:48|       0.31|      120|           54|
|           0.58|03/05/2019 08:11:48|       0.38|      120|           64|
|           0.55|03/05/2019 08:16:48|       0.61|      120|           54|
+---------------+-------------------+-----------+---------+-------------+
only showing top 3 rows



In [22]:
# Filter on a numeric comparison: readings with more than 70 sessions.
sdf_sql = spark.sql("SELECT server_id, session_count FROM utilization WHERE session_count > 70")
sdf_sql.show(3)

+---------+-------------+
|server_id|session_count|
+---------+-------------+
|      122|           72|
|      122|           80|
|      122|           77|
+---------+-------------+
only showing top 3 rows



In [23]:
# Combine both predicates with AND: busy readings (> 70 sessions) of server 120 only.
sdf_sql = spark.sql("SELECT server_id, session_count FROM utilization WHERE session_count > 70 AND server_id = 120")
sdf_sql.show(3)

+---------+-------------+
|server_id|session_count|
+---------+-------------+
|      120|           80|
|      120|           71|
|      120|           73|
+---------+-------------+
only showing top 3 rows



In [26]:
# Same filter as before, now sorted so the highest session counts come first.
# Triple quotes let the SQL span several lines.
sdf_sql = spark.sql("""
                   SELECT server_id, session_count
                   FROM utilization
                   WHERE session_count > 70 AND server_id = 120
                   ORDER BY session_count DESC
                 """)
sdf_sql.show(3)

+---------+-------------+
|server_id|session_count|
+---------+-------------+
|      120|           80|
|      120|           80|
|      120|           80|
+---------+-------------+
only showing top 3 rows



In [27]:
# Aggregate over the whole view: total row count as a one-row, one-column result.
sdf_sql = spark.sql("SELECT count(*) FROM utilization")
sdf_sql.show()

+--------+
|count(1)|
+--------+
|  500000|
+--------+



In [28]:
# Count the readings that have more than 70 sessions.
# The query is written as a triple-quoted string rather than with backslash
# continuations inside a plain string literal: a stray space after a trailing
# backslash silently breaks that form, and triple quotes match the other
# multi-line queries in this notebook. Only whitespace in the SQL text changes.
sdf_sql = spark.sql("""
                    SELECT count(*)
                    FROM utilization
                    WHERE session_count > 70
                    """)
sdf_sql.show()

+--------+
|count(1)|
+--------+
|  239659|
+--------+



In [30]:
# Per-server count of busy readings: filter first (WHERE), then GROUP BY server_id.
sdf_sql = spark.sql("""SELECT server_id, count(*)
                    FROM utilization
                    WHERE session_count > 70
                    GROUP BY server_id""")
# No ORDER BY, so the three rows shown are in whatever order Spark returns them.
sdf_sql.show(3)

+---------+--------+
|server_id|count(1)|
+---------+--------+
|      112|    7425|
|      113|    9418|
|      130|    2891|
+---------+--------+
only showing top 3 rows



In [32]:
# Same per-server count, now ranked so the servers with the most busy readings come first.
sdf_sql = spark.sql("""
                    SELECT server_id, count(*)
                    FROM utilization
                    WHERE session_count > 70
                    GROUP BY server_id
                    ORDER BY count(*) DESC
                    """)
sdf_sql.show(3)

+---------+--------+
|server_id|count(1)|
+---------+--------+
|      101|    9808|
|      113|    9418|
|      145|    9304|
+---------+--------+
only showing top 3 rows



In [33]:
# Several aggregates per server (min / avg / max of session_count) over the busy readings.
# The result is still ranked by count(*), even though the count itself is not selected.
sdf_sql = spark.sql("""
                    SELECT server_id, min(session_count), avg(session_count), max(session_count)
                    FROM utilization
                    WHERE session_count > 70
                    GROUP BY server_id
                    ORDER BY count(*) DESC
                """)
sdf_sql.show(3)

+---------+------------------+------------------+------------------+
|server_id|min(session_count)|avg(session_count)|max(session_count)|
+---------+------------------+------------------+------------------+
|      101|                71| 87.66557911908646|               105|
|      113|                71| 86.96262476109577|               103|
|      145|                71| 86.97732158211522|               103|
+---------+------------------+------------------+------------------+
only showing top 3 rows



In [34]:
# Same aggregates as the previous query, with the average rounded to two decimals for readability.
sdf_sql = spark.sql("""
                    SELECT server_id, min(session_count), round(avg(session_count),2), max(session_count)
                    FROM utilization
                    WHERE session_count > 70
                    GROUP BY server_id
                    ORDER BY count(*) DESC
                    """)
sdf_sql.show(3)

+---------+------------------+----------------------------+------------------+
|server_id|min(session_count)|round(avg(session_count), 2)|max(session_count)|
+---------+------------------+----------------------------+------------------+
|      101|                71|                       87.67|               105|
|      113|                71|                       86.96|               103|
|      145|                71|                       86.98|               103|
+---------+------------------+----------------------------+------------------+
only showing top 3 rows



In [41]:
# Point file_path at the utilization JSON again for the join section that follows.
file_path = f'{data_path}/utilization.json'

In [42]:
# Load the utilization readings into their own frame for the join / statistics cells.
df_util = spark.read.format("json").load(file_path)

In [47]:
# (Re-)register the "utilization" temp view, now backed by df_util.
df_util.createOrReplaceTempView("utilization")

In [46]:
# Sanity check: first three rows of the freshly loaded frame.
df_util.show(3)

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.64|03/28/2019 08:16:52|       0.38|      122|           72|
|           0.66|03/28/2019 08:21:52|       0.22|      122|           69|
|           0.55|03/28/2019 08:26:52|       0.52|      122|           80|
+---------------+-------------------+-----------+---------+-------------+
only showing top 3 rows



In [43]:
# Reuse file_path for the second input: the server-id -> server-name lookup CSV.
file_path = f'{data_path}/server_name.csv'

In [44]:
# Load the server-name lookup table from CSV; the first line holds the column names.
# Without a schema Spark reads every CSV column as a string, which would leave
# server_id as text here while it is a long in the utilization data. The join on
# server_id would then depend on an implicit string-to-number cast. inferSchema
# makes Spark derive numeric types from the data (at the cost of one extra pass
# over the file), so both sides of the join key are numeric.
df_server = (spark.read
       .format("csv")
       .option("header", "true")
       .option("inferSchema", "true")
       .load(file_path))

In [45]:
# Preview the lookup table: one server_id / server_name pair per row.
df_server.show(3)

+---------+-----------+
|server_id|server_name|
+---------+-----------+
|      100| 100 Server|
|      101| 101 Server|
|      102| 102 Server|
+---------+-----------+
only showing top 3 rows



In [48]:
# Expose the lookup table to SQL as the temp view "server_name".
df_server.createOrReplaceTempView("server_name")

In [49]:
# Distinct server ids present in the utilization data, in ascending order.
# NOTE(review): despite its name, df_count holds the ids themselves, not a count.
df_count = spark.sql("SELECT DISTINCT server_id FROM utilization ORDER BY server_id")
df_count.show(3)

+---------+
|server_id|
+---------+
|      100|
|      101|
|      102|
+---------+
only showing top 3 rows



In [50]:
# Range of server ids in the utilization data (smallest and largest id).
spark.sql("SELECT min(server_id), max(server_id) FROM utilization").show()

+--------------+--------------+
|min(server_id)|max(server_id)|
+--------------+--------------+
|           100|           149|
+--------------+--------------+



In [52]:
# Confirm the server_name view is queryable through SQL.
spark.sql("SELECT * FROM server_name").show(3)

+---------+-----------+
|server_id|server_name|
+---------+-----------+
|      100| 100 Server|
|      101| 101 Server|
|      102| 102 Server|
+---------+-----------+
only showing top 3 rows



In [53]:
# Enrich each utilization reading with its server's name.
# INNER JOIN keeps only readings whose server_id also appears in server_name.
sdf_join = spark.sql("""
                         SELECT u.server_id, sn.server_name, u.session_count
                         FROM utilization AS u
                         INNER JOIN server_name AS sn
                         ON sn.server_id = u.server_id
                    """)
sdf_join.show(3)   

+---------+-----------+-------------+
|server_id|server_name|session_count|
+---------+-----------+-------------+
|      122| 122 Server|           72|
|      122| 122 Server|           69|
|      122| 122 Server|           80|
+---------+-----------+-------------+
only showing top 3 rows



In [63]:
# Tiny hand-built frame for the de-duplication examples: both server names
# appear twice, and the two '102 Server' rows are identical in every column.
df_dup = spark.sparkContext.parallelize([
    Row(server_name=name, cpu_utilization=cpu, session_count=sessions)
    for name, cpu, sessions in (
        ('101 Server', 85, 80),
        ('101 Server', 80, 90),
        ('102 Server', 85, 80),
        ('102 Server', 85, 80),
    )
]).toDF()

In [65]:
# Show all four rows, duplicates included.
df_dup.show()

+-----------+---------------+-------------+
|server_name|cpu_utilization|session_count|
+-----------+---------------+-------------+
| 101 Server|             85|           80|
| 101 Server|             80|           90|
| 102 Server|             85|           80|
| 102 Server|             85|           80|
+-----------+---------------+-------------+



In [66]:
# With no arguments, rows are duplicates only if they match in every column.
df_dup.drop_duplicates().show()

+-----------+---------------+-------------+
|server_name|cpu_utilization|session_count|
+-----------+---------------+-------------+
| 101 Server|             85|           80|
| 101 Server|             80|           90|
| 102 Server|             85|           80|
+-----------+---------------+-------------+



In [67]:
# De-duplicate on server_name only, keeping a single row per server.
# NOTE(review): nothing here controls which of a server's rows is kept --
# presumably arbitrary; confirm before relying on the other columns' values.
df_dup.drop_duplicates(['server_name']).show()

+-----------+---------------+-------------+
|server_name|cpu_utilization|session_count|
+-----------+---------------+-------------+
| 102 Server|             85|           80|
| 101 Server|             85|           80|
+-----------+---------------+-------------+



In [68]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# event_datetime is a string, so its mean/stddev are null and min/max are lexical.
df_util.describe().show()

+-------+-------------------+-------------------+-------------------+------------------+------------------+
|summary|    cpu_utilization|     event_datetime|        free_memory|         server_id|     session_count|
+-------+-------------------+-------------------+-------------------+------------------+------------------+
|  count|             500000|             500000|             500000|            500000|            500000|
|   mean| 0.6205177400000055|               null| 0.3791280999999993|             124.5|          69.59616|
| stddev|0.15875173872912818|               null|0.15830931278376192|14.430884120553516|14.850676696352838|
|    min|               0.22|03/05/2019 08:06:14|                0.0|               100|                32|
|    max|                1.0|04/09/2019 01:22:46|               0.78|               149|               105|
+-------+-------------------+-------------------+-------------------+------------------+------------------+



In [69]:
# Same summary, restricted to the two columns of interest.
df_util.describe('cpu_utilization','free_memory').show()

+-------+-------------------+-------------------+
|summary|    cpu_utilization|        free_memory|
+-------+-------------------+-------------------+
|  count|             500000|             500000|
|   mean| 0.6205177400000055| 0.3791280999999993|
| stddev|0.15875173872912818|0.15830931278376192|
|    min|               0.22|                0.0|
|    max|                1.0|               0.78|
+-------+-------------------+-------------------+



In [70]:
# Correlation between CPU utilization and free memory, returned as a plain float.
# NOTE(review): presumably the Pearson coefficient (the default method) -- confirm.
df_util.stat.corr('cpu_utilization','free_memory')

-0.4704771573080726

In [71]:
# Frequently occurring values of server_id and session_count, one array column each.
# NOTE(review): freqItems is understood to be an approximate algorithm --
# confirm against the API docs before treating the lists as exact.
df_util.stat.freqItems(('server_id','session_count')).show()

+--------------------+-----------------------+
| server_id_freqItems|session_count_freqItems|
+--------------------+-----------------------+
|[137, 146, 101, 1...|   [92, 101, 83, 104...|
+--------------------+-----------------------+



In [72]:
# The same kind of statistics via SQL: min, max and standard deviation of cpu_utilization overall.
spark.sql('SELECT min(cpu_utilization), max(cpu_utilization), stddev(cpu_utilization) FROM utilization').show()

+--------------------+--------------------+-----------------------+
|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+--------------------+--------------------+-----------------------+
|                0.22|                 1.0|    0.15875173872912818|
+--------------------+--------------------+-----------------------+



In [74]:
# Break the CPU statistics down per server with GROUP BY.
spark.sql("""
            SELECT server_id, min(cpu_utilization), max(cpu_utilization), stddev(cpu_utilization)
            FROM utilization
            GROUP BY server_id
          """).show(3)

+---------+--------------------+--------------------+-----------------------+
|server_id|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+---------+--------------------+--------------------+-----------------------+
|      112|                0.52|                0.92|    0.11528867845082576|
|      113|                0.58|                0.98|    0.11544345150353694|
|      130|                0.35|                0.75|    0.11568834774245991|
+---------+--------------------+--------------------+-----------------------+
only showing top 3 rows



In [76]:
# Bucket CPU utilization into 10-percentage-point bins: 0.64 -> 64 -> 6.4 -> bucket 6.
spark.sql('SELECT server_id, FLOOR(cpu_utilization*100/10) AS bucket FROM utilization').show(3)

+---------+------+
|server_id|bucket|
+---------+------+
|      122|     6|
|      122|     6|
|      122|     5|
+---------+------+
only showing top 3 rows



In [77]:
# Window query: attach each server's mean cpu_utilization to every one of its rows.
# The window has PARTITION BY but no ORDER BY or frame, so the average covers
# the server's whole partition and is identical on all of that server's rows.
sql_window = """
                SELECT 
                event_datetime, 
                server_id, 
                cpu_utilization,
                avg(cpu_utilization) OVER (PARTITION BY server_id) AS avg_server_util
                FROM utilization
            """

In [79]:
# Run the window query held in sql_window and preview three rows.
spark.sql(sql_window).show(3)

+-------------------+---------+---------------+------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|
+-------------------+---------+---------------+------------------+
|03/05/2019 08:06:34|      112|           0.71|0.7153870000000067|
|03/05/2019 08:11:34|      112|           0.78|0.7153870000000067|
|03/05/2019 08:16:34|      112|           0.87|0.7153870000000067|
+-------------------+---------+---------------+------------------+
only showing top 3 rows



In [80]:
# Extend the window query with delta_server_util: how far each reading is from
# its server's average (positive = above average, negative = below).
sql_window = """
                SELECT 
                event_datetime, 
                server_id, 
                cpu_utilization,
                avg(cpu_utilization) OVER (PARTITION BY server_id) AS avg_server_util,
                cpu_utilization - avg(cpu_utilization) OVER (PARTITION BY server_id) AS delta_server_util
                FROM utilization
            """
spark.sql(sql_window).show(3)

+-------------------+---------+---------------+------------------+--------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|   delta_server_util|
+-------------------+---------+---------------+------------------+--------------------+
|03/05/2019 08:06:34|      112|           0.71|0.7153870000000067|-0.00538700000000...|
|03/05/2019 08:11:34|      112|           0.78|0.7153870000000067| 0.06461299999999337|
|03/05/2019 08:16:34|      112|           0.87|0.7153870000000067| 0.15461299999999334|
+-------------------+---------+---------------+------------------+--------------------+
only showing top 3 rows



In [81]:
# Rolling average per server over a three-row frame: the previous reading, the
# current one and the next one (fewer rows at the partition edges).
# event_datetime is stored as a 'MM/dd/yyyy HH:mm:ss' string. Sorting that text
# is chronological only while all rows fall in the same calendar year, so the
# window orders on the parsed timestamp instead, which stays correct when the
# data spans a year boundary.
sql_window = """
                SELECT
                event_datetime,
                server_id,
                cpu_utilization,
                avg(cpu_utilization) OVER(
                                            PARTITION BY server_id
                                            ORDER BY to_timestamp(event_datetime, 'MM/dd/yyyy HH:mm:ss')
                                            ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
                                        ) AS avg_server_util
                FROM utilization
            """
spark.sql(sql_window).show(3)

+-------------------+---------+---------------+------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|
+-------------------+---------+---------------+------------------+
|03/05/2019 08:06:34|      112|           0.71|             0.745|
|03/05/2019 08:11:34|      112|           0.78|0.7866666666666666|
|03/05/2019 08:16:34|      112|           0.87|0.8233333333333333|
+-------------------+---------+---------------+------------------+
only showing top 3 rows



In [85]:
# Small hand-built frame for the missing-value examples.
# It is bound to `df` because that is the name the following cells read
# (df.show(), df.withColumn(...)). Binding it to `sdf` left `df` undefined on a
# fresh kernel and also overwrote the utilization frame held in `sdf`.
df = spark.sparkContext.parallelize(
    [
        Row(server_name='101 Server', cpu_utilization=85, session_count=80),
        Row(server_name='101 Server', cpu_utilization=80, session_count=90),
        Row(server_name='102 Server', cpu_utilization=85, session_count=40),
        Row(server_name='103 Server', cpu_utilization=70, session_count=80),
        Row(server_name='104 Server', cpu_utilization=60, session_count=80),
    ]
).toDF()

In [86]:
# Show every row of df; the frame is small, so no row limit is passed.
df.show()

+-----------+---------------+-------------+
|server_name|cpu_utilization|session_count|
+-----------+---------------+-------------+
| 101 Server|             85|           80|
| 101 Server|             80|           90|
| 102 Server|             85|           40|
| 103 Server|             70|           80|
| 104 Server|             60|           80|
+-----------+---------------+-------------+



In [87]:
# Add na_col, a string-typed column that is NULL on every row, to have missing values to work with.
# The cast gives the NULL literal an explicit string type.
df_na = df.withColumn('na_col', lit(None).cast(StringType()))

In [90]:
# Confirm the new column is present and entirely null.
df_na.show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|  null|
| 101 Server|             80|           90|  null|
| 102 Server|             85|           40|  null|
| 103 Server|             70|           80|  null|
| 104 Server|             60|           80|  null|
+-----------+---------------+-------------+------+



In [89]:
# Replace nulls with the string 'A'; in this frame only na_col contains nulls.
# fillna returns a new frame -- df_na itself is left unchanged.
df_na.fillna('A').show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|     A|
| 101 Server|             80|           90|     A|
| 102 Server|             85|           40|     A|
| 103 Server|             70|           80|     A|
| 104 Server|             60|           80|     A|
+-----------+---------------+-------------+------+



In [91]:
# Stack the filled copy on top of the original, so na_col holds 'A' on the
# first half of the rows and NULL on the second half.
df2 = df_na.fillna('A').union(df_na)

In [92]:
# Show the combined frame: filled rows followed by the rows that still have nulls.
df2.show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|     A|
| 101 Server|             80|           90|     A|
| 102 Server|             85|           40|     A|
| 103 Server|             70|           80|     A|
| 104 Server|             60|           80|     A|
| 101 Server|             85|           80|  null|
| 101 Server|             80|           90|  null|
| 102 Server|             85|           40|  null|
| 103 Server|             70|           80|  null|
| 104 Server|             60|           80|  null|
+-----------+---------------+-------------+------+



In [93]:
# Drop the rows that contain a null, leaving only the rows where na_col was filled.
df2.na.drop().show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|     A|
| 101 Server|             80|           90|     A|
| 102 Server|             85|           40|     A|
| 103 Server|             70|           80|     A|
| 104 Server|             60|           80|     A|
+-----------+---------------+-------------+------+



In [94]:
# Register the combined frame as the temp view "na_table" for the SQL null checks below.
df2.createOrReplaceTempView("na_table")

In [95]:
# Baseline: every row of na_table, with and without nulls.
spark.sql("SELECT * FROM na_table").show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|     A|
| 101 Server|             80|           90|     A|
| 102 Server|             85|           40|     A|
| 103 Server|             70|           80|     A|
| 104 Server|             60|           80|     A|
| 101 Server|             85|           80|  null|
| 101 Server|             80|           90|  null|
| 102 Server|             85|           40|  null|
| 103 Server|             70|           80|  null|
| 104 Server|             60|           80|  null|
+-----------+---------------+-------------+------+



In [96]:
# Select only the rows whose na_col is missing; SQL tests for NULL with IS NULL, not "=".
spark.sql("SELECT * FROM na_table WHERE na_col IS NULL").show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|  null|
| 101 Server|             80|           90|  null|
| 102 Server|             85|           40|  null|
| 103 Server|             70|           80|  null|
| 104 Server|             60|           80|  null|
+-----------+---------------+-------------+------+



In [97]:
# The complement: only the rows where na_col has a value.
spark.sql("SELECT * FROM na_table WHERE na_col IS NOT NULL").show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|     A|
| 101 Server|             80|           90|     A|
| 102 Server|             85|           40|     A|
| 103 Server|             70|           80|     A|
| 104 Server|             60|           80|     A|
+-----------+---------------+-------------+------+

