# Querying Dataframes with SQL

In the previous module, we used the Spark API to manipulate the data. In this module, we will use SQL.

In [1]:
from pyspark.sql import SparkSession

In [4]:
# Build the Spark session (or reuse an existing one) under a descriptive application name.
spark = (
    SparkSession.builder
    .appName('Spark SQL Query DataFrames')
    .getOrCreate()
)

In [5]:
# Bare expression as the last line of the cell, so the notebook displays the session object.
spark

-------

In [10]:
# Location of the input data, relative to this notebook.
data_path = '../Data'
# Full path of the JSON file that is loaded into a DataFrame.
file_path = f'{data_path}/utilization.json'

In [11]:
# Load the JSON file at `file_path` into a DataFrame.
df = (
    spark.read
    .format('json')
    .load(file_path)
)

In [14]:
# Preview the first five rows to check the columns and values.
df.show(n=5)

+----+-------------------+-----------+---------+-------------+
| cpu|     event_datetime|free_memory|server_id|session_count|
+----+-------------------+-----------+---------+-------------+
|0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|0.35|03/05/2019 08:26:14|       0.46|      100|           43|
+----+-------------------+-----------+---------+-------------+
only showing top 5 rows



In [15]:
# Number of rows in the DataFrame.
df.count()

10

# Working with SQL in Spark
- 1) To work with SQL in Spark, we first need to create a temporary view.
- 2) Then we can use SQL statements to query that view.

In [20]:
# Register the DataFrame as a temporary view named 'vw_utilization' so it can be referenced from SQL.

df.createOrReplaceTempView('vw_utilization')

In [25]:
# Query the 'vw_utilization' temporary view with plain SQL.
# Only two columns are selected, and the result is limited to five rows.
results = spark.sql(
    'SELECT server_id, session_count FROM vw_utilization LIMIT 5;'
)

In [26]:
# Print the rows returned by the query.
results.show()

+---------+-------------+
|server_id|session_count|
+---------+-------------+
|      100|           47|
|      100|           43|
|      100|           62|
|      100|           50|
|      100|           43|
+---------+-------------+



In [27]:
# Same query, with each column renamed through a SQL alias (AS).
results = spark.sql(
    'SELECT server_id AS sid, session_count AS sc FROM vw_utilization LIMIT 5;'
)

In [28]:
# Print the result; the column headers now carry the aliases from the query.
results.show()

+---+---+
|sid| sc|
+---+---+
|100| 47|
|100| 43|
|100| 62|
|100| 50|
|100| 43|
+---+---+

