In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

In [2]:
# Build the Spark session used by every cell below; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Spark SQL Query Dataframes")
    .getOrCreate()
)

In [3]:
# Root directory holding the course data files. Set SPARK_SQL_DATA_DIR to
# point the notebook at another location without editing this cell; the
# original author's local path remains the fallback so existing setups keep
# working unchanged.
data_path = os.environ.get(
    "SPARK_SQL_DATA_DIR",
    '/Users/danielsullivan/LinkedIn Learning/Spark SQL/data',
)

In [4]:
# Read the utilization records from the JSON file under data_path.
json_df_path = f"{data_path}/utilization.json"
df_util = spark.read.json(json_df_path)

In [5]:
# Preview the first ten utilization rows.
df_util.show(n=10)

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.77|03/16/2019 17:21:40|       0.22|      115|           58|
|           0.53|03/16/2019 17:26:40|       0.23|      115|           64|
|            0.6|03/16/2019 17:31:40|       0.19|      115|           82|
|           0.46|03/16/2019 17:36:40|       0.32|      115|           60|
|           0.77|03/16/2019 17:41:40|       0.49|      115|           84|
|           0.62|03/16/2019 17:46:40|       0.31|      115|           73|
|           0.71|03/16/2019 17:51:40|       0.54|      115|           67|
|           0.67|03/16/2019 17:56:40|       0.54|      115|           83|
|           0.72|03/16/2019 18:01:40|       0.26|      115|           68|
|           0.62|03/16/2019 18:06:40|       0.52|      115|           60|
+---------------+-------------------+-----------+---------+-------------+

In [6]:
# Expose df_util to Spark SQL under the table name "utilization".
df_util.createOrReplaceTempView(name="utilization")

In [7]:
# Read the server-name lookup CSV; its first line supplies the column names.
csv_df_path = f"{data_path}/server_name.csv"
df_server = spark.read.option("header", True).csv(csv_df_path)

In [8]:
# Preview the lookup table (20 rows is show()'s default limit).
df_server.show(n=20)

+---------+-----------+
|server_id|server_name|
+---------+-----------+
|      100| 100 Server|
|      101| 101 Server|
|      102| 102 Server|
|      103| 103 Server|
|      104| 104 Server|
|      105| 105 Server|
|      106| 106 Server|
|      107| 107 Server|
|      108| 108 Server|
|      109| 109 Server|
|      110| 110 Server|
|      111| 111 Server|
|      112| 112 Server|
|      113| 113 Server|
|      114| 114 Server|
|      115| 115 Server|
|      116| 116 Server|
|      117| 117 Server|
|      118| 118 Server|
|      119| 119 Server|
+---------+-----------+
only showing top 20 rows



In [9]:
# Expose df_server to Spark SQL under the table name "server_name".
df_server.createOrReplaceTempView(name="server_name")

In [10]:
# List each server_id that appears in the utilization view, in sorted order.
# Despite its name, df_count holds the distinct ids themselves, not a count.
df_count = spark.sql(
    "SELECT DISTINCT server_id FROM utilization ORDER BY server_id"
)
df_count.show()

+---------+
|server_id|
+---------+
|      100|
|      101|
|      102|
|      103|
|      104|
|      105|
|      106|
|      107|
|      108|
|      109|
|      110|
|      111|
|      112|
|      113|
|      114|
|      115|
|      116|
|      117|
|      118|
|      119|
+---------+
only showing top 20 rows



In [11]:
# Report the smallest and largest server_id in the utilization view.
spark.sql(
    "SELECT min(server_id), max(server_id) FROM utilization"
).show()

+--------------+--------------+
|min(server_id)|max(server_id)|
+--------------+--------------+
|           100|           149|
+--------------+--------------+



In [12]:
# Query the server_name view through SQL to confirm it is registered.
spark.sql(
    "SELECT * FROM server_name"
).show()

+---------+-----------+
|server_id|server_name|
+---------+-----------+
|      100| 100 Server|
|      101| 101 Server|
|      102| 102 Server|
|      103| 103 Server|
|      104| 104 Server|
|      105| 105 Server|
|      106| 106 Server|
|      107| 107 Server|
|      108| 108 Server|
|      109| 109 Server|
|      110| 110 Server|
|      111| 111 Server|
|      112| 112 Server|
|      113| 113 Server|
|      114| 114 Server|
|      115| 115 Server|
|      116| 116 Server|
|      117| 117 Server|
|      118| 118 Server|
|      119| 119 Server|
+---------+-----------+
only showing top 20 rows



In [13]:
# Attach each utilization reading to its server's display name. The inner
# join keeps only readings whose server_id also appears in server_name.
# The query lives in a triple-quoted string so it needs no backslash
# continuations inside the literal (those silently embed the indentation in
# the SQL text and break if any whitespace follows the backslash).
df_join = spark.sql("""
    SELECT u.server_id, sn.server_name, u.session_count
    FROM utilization u
    INNER JOIN server_name sn
        ON sn.server_id = u.server_id
""")
df_join.show()

+---------+-----------+-------------+
|server_id|server_name|session_count|
+---------+-----------+-------------+
|      115| 115 Server|           58|
|      115| 115 Server|           64|
|      115| 115 Server|           82|
|      115| 115 Server|           60|
|      115| 115 Server|           84|
|      115| 115 Server|           73|
|      115| 115 Server|           67|
|      115| 115 Server|           83|
|      115| 115 Server|           68|
|      115| 115 Server|           60|
|      115| 115 Server|           60|
|      115| 115 Server|           62|
|      115| 115 Server|           78|
|      115| 115 Server|           66|
|      115| 115 Server|           89|
|      115| 115 Server|           76|
|      115| 115 Server|           87|
|      115| 115 Server|           62|
|      115| 115 Server|           67|
|      115| 115 Server|           58|
+---------+-----------+-------------+
only showing top 20 rows

