# C10 
# Spark SQL 

In [3]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook under the application name "c10".
spark = (
    SparkSession.builder
    .appName("c10")
    .getOrCreate()
)

# Show the session's configuration as a list of (key, value) pairs.
spark.sparkContext.getConf().getAll()


[('spark.app.id', 'local-1597293540960'),
 ('spark.rdd.compress', 'True'),
 ('spark.app.name', 'c10'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', 'informix-test-01.phs.local'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.driver.port', '33922')]

In [24]:
# Create Table
# Register the flight-data JSON directory as the table `flights`.

# Location of the JSON source files. Held in a single named constant so the
# cell can be pointed at a different checkout by editing one line instead of
# hunting for the path inside the SQL text.
FLIGHTS_JSON_PATH = "/root/golive/Spark-The-Definitive-Guide/data/flight-data/json"

# Drop any earlier definition first so this cell can be re-run.
spark.sql("""DROP TABLE IF Exists flights""")

# The path is substituted into the OPTIONS clause; with the default constant
# the statement sent to Spark is the same text as before.
create_json_table = """
CREATE TABLE flights ( DEST_COUNTRY_NAME STRING, 
                       ORIGIN_COUNTRY_NAME STRING, 
                       count LONG) 
USING JSON OPTIONS (path '{path}')""".format(path=FLIGHTS_JSON_PATH)

spark.sql(create_json_table)

# Preview two rows of the new table.
spark.sql("select * from flights").show(2)

# List the tables visible in the current database.
spark.sql("show tables").show()


+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default|  flights|      false|
+--------+---------+-----------+



In [28]:
# Column with Comments
# Register the 2015 summary CSV as the table `flights_csv`; the
# ORIGIN_COUNTRY_NAME column is declared with a COMMENT.

# Location of the CSV source file. Held in a single named constant so the
# cell can be pointed at a different checkout by editing one line instead of
# hunting for the path inside the SQL text.
FLIGHTS_CSV_PATH = "/root/golive/Spark-The-Definitive-Guide/data/flight-data/csv/2015-summary.csv"

# The path is substituted into the OPTIONS clause; with the default constant
# the statement sent to Spark is the same text as before.
create_csv_table = """CREATE TABLE flights_csv ( DEST_COUNTRY_NAME STRING, 
                          ORIGIN_COUNTRY_NAME STRING COMMENT "remember, the US will be most prevalent", 
                          count LONG) 
USING csv OPTIONS (header true, path '{path}')""".format(path=FLIGHTS_CSV_PATH)

# Drop any earlier definition first so this cell can be re-run.
spark.sql("""DROP TABLE IF Exists flights_csv""")


spark.sql(create_csv_table)

# Preview two rows of the new table.
spark.sql("select * from flights_csv").show(2)

# List the tables visible in the current database.
spark.sql("show tables").show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+--------+-----------+-----------+
|database|  tableName|isTemporary|
+--------+-----------+-----------+
| default|    flights|      false|
| default|flights_csv|      false|
+--------+-----------+-----------+



In [31]:
# Create a parquet table partitioned by DEST_COUNTRY_NAME from five rows of `flights`.
# Because of IF NOT Exists, an already existing partitioned_flights table is left as it is.
# NOTE(review): the LIMIT has no ORDER BY, so which five rows get copied is
# presumably not fixed between runs - confirm if the exact rows matter.
create_parquet = (
    "\n"
    "CREATE TABLE  IF NOT Exists partitioned_flights\n"
    "USING parquet \n"
    "PARTITIONED BY (DEST_COUNTRY_NAME) \n"
    "AS SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count FROM flights LIMIT 5"
)
spark.sql(create_parquet)

# Preview two rows of the partitioned table.
spark.sql("select * from partitioned_flights").show(n=2)

# List the tables visible in the current database.
spark.sql("show tables").show()



+-------------------+-----+-----------------+
|ORIGIN_COUNTRY_NAME|count|DEST_COUNTRY_NAME|
+-------------------+-----+-----------------+
|      United States|   15|            Egypt|
|            Romania|   15|    United States|
+-------------------+-----+-----------------+
only showing top 2 rows

+--------+-------------------+-----------+
|database|          tableName|isTemporary|
+--------+-------------------+-----------+
| default|            flights|      false|
| default|        flights_csv|      false|
| default|partitioned_flights|      false|
+--------+-------------------+-----------+



In [34]:
# Describe the table, then list its partitions.
table_description = "desc table partitioned_flights "
spark.sql(table_description).show()

# Print at most two partitions, without truncating the partition strings.
spark.sql("show partitions partitioned_flights ").show(n=2, truncate=False)

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
| ORIGIN_COUNTRY_NAME|   string|   null|
|               count|   bigint|   null|
|   DEST_COUNTRY_NAME|   string|   null|
|# Partition Infor...|         |       |
|          # col_name|data_type|comment|
|   DEST_COUNTRY_NAME|   string|   null|
+--------------------+---------+-------+

+-------------------------------+
|partition                      |
+-------------------------------+
|DEST_COUNTRY_NAME=Egypt        |
|DEST_COUNTRY_NAME=United States|
+-------------------------------+



In [37]:
# Manage partitions
# Statements demonstrated: REFRESH TABLE t1 and MSCK REPAIR TABLE t1

# Issue both maintenance statements against partitioned_flights, in order.
for maintenance_statement in (
    "REFRESH table partitioned_flights ",
    "MSCK repair table  partitioned_flights ",
):
    spark.sql(maintenance_statement)

# List at most two partitions afterwards, without truncating the partition strings.
spark.sql("show partitions partitioned_flights ").show(n=2, truncate=False)

+-------------------------------+
|partition                      |
+-------------------------------+
|DEST_COUNTRY_NAME=Egypt        |
|DEST_COUNTRY_NAME=United States|
+-------------------------------+



In [38]:
# CACHE and UNCACHE

# Cache the table ...
spark.sql(
    "CACHE table  partitioned_flights "
)
# ... and release it again. This is the cell's last expression, so its
# result is what the notebook displays.
spark.sql(
    "UNCACHE table  partitioned_flights "
)

DataFrame[]

In [40]:
# VIEWS
# VIEWS - Registered in the DB
# TEMP VIEWS - Not registered
# GLOBAL TEMP VIEWS - Not registered. Available to Spark sessions globally

In [46]:
# DATABASES
# Create database db1 unless it already exists, then list all databases.
create_db = (
    "\n"
    "create database IF NOT Exists db1\n"
)
spark.sql(create_db)

spark.sql("show databases").show()

+---------+
|namespace|
+---------+
|      db1|
|  default|
+---------+



In [49]:
# Switch the session to database db1.
spark.sql(
    "use db1 "
)

# List the tables of the now-current database.
spark.sql(
    "show tables "
).show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [None]:
# FULL SQL SYNTAX 
# https://docs.databricks.com/spark/latest/spark-sql/language-manual/select.html


# COMPLEX DATA TYPES 

In [56]:
# Struct, map, functions
# Switch back to the default database before working with `flights`.
spark.sql("use default ")


# STRUCT
# Create the view nested_data (unless it exists already): the two country
# columns are wrapped in one parenthesised expression aliased `country`,
# selected alongside `count`.
create_view = (
    "\n"
    "CREATE VIEW IF NOT EXISTS nested_data \n"
    "AS \n"
    "SELECT (DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME) as country, \n"
    "count \n"
    "FROM flights\n"
)
spark.sql(create_view)

# Show the first five rows of the view without truncating cell contents.
sql = (
    "\n"
    "select * from nested_data\n"
)
spark.sql(sql).show(n=5, truncate=False)

+------------------------+-----+
|country                 |count|
+------------------------+-----+
|[United States, Romania]|15   |
|[United States, Croatia]|1    |
|[United States, Ireland]|344  |
|[Egypt, United States]  |15   |
|[United States, India]  |62   |
+------------------------+-----+
only showing top 5 rows



In [57]:
# Using the "." operator
# Read the ORIGIN_COUNTRY_NAME field out of the `country` column of nested_data.
sql = (
    "\n"
    "select country.ORIGIN_COUNTRY_NAME , count  from nested_data\n"
)

# Show the first five rows without truncating cell contents.
spark.sql(sql).show(n=5, truncate=False)

+-------------------+-----+
|ORIGIN_COUNTRY_NAME|count|
+-------------------+-----+
|Romania            |15   |
|Croatia            |1    |
|Ireland            |344  |
|United States      |15   |
|India              |62   |
+-------------------+-----+
only showing top 5 rows



In [61]:
# LIST
# Functions of interest: COLLECT_LIST, COLLECT_SET, ARRAY

# Show the column layout of `flights` first.
sql = "\ndesc flights \n"
spark.sql(sql).show()

# For each destination country, gather the counts with COLLECT_LIST and the
# origin countries with COLLECT_SET.
sql = (
    "\n"
    "select DEST_COUNTRY_NAME,\n"
    "COLLECT_LIST(count) as list_count,\n"
    "COLLECT_SET(ORIGIN_COUNTRY_NAME) as set_country\n"
    "from flights \n"
    "group by DEST_COUNTRY_NAME\n"
    "\n"
)

spark.sql(sql).show(n=3)

# Next step: access the first element in the list


+-------------------+---------+-------+
|           col_name|data_type|comment|
+-------------------+---------+-------+
|  DEST_COUNTRY_NAME|   string|   null|
|ORIGIN_COUNTRY_NAME|   string|   null|
|              count|   bigint|   null|
+-------------------+---------+-------+

+-----------------+--------------------+---------------+
|DEST_COUNTRY_NAME|          list_count|    set_country|
+-----------------+--------------------+---------------+
|             Chad|                 [1]|[United States]|
|         Anguilla|[21, 41, 22, 21, ...|[United States]|
|         Paraguay|[90, 60, 75, 85, ...|[United States]|
+-----------------+--------------------+---------------+
only showing top 3 rows



In [62]:
# First element of a list, set or array
# Same aggregation as before, but list_count is now COLLECT_LIST(count)[0],
# i.e. the collected list indexed with [0].
sql = (
    "\n"
    "select DEST_COUNTRY_NAME,\n"
    "COLLECT_LIST(count)[0] as list_count,\n"
    "COLLECT_SET(ORIGIN_COUNTRY_NAME) as set_country\n"
    "from flights \n"
    "group by DEST_COUNTRY_NAME\n"
    "\n"
)
spark.sql(sql).show(n=3)

+-----------------+----------+---------------+
|DEST_COUNTRY_NAME|list_count|    set_country|
+-----------------+----------+---------------+
|             Chad|         1|[United States]|
|         Anguilla|        21|[United States]|
|         Paraguay|        90|[United States]|
+-----------------+----------+---------------+
only showing top 3 rows



In [64]:
# Create or replace the temp view flights_agg: one row per destination
# country, with that country's counts gathered into `collected_counts`.
create_view = (
    "\n"
    "CREATE OR REPLACE TEMP VIEW flights_agg \n"
    "AS \n"
    "SELECT DEST_COUNTRY_NAME, \n"
    "collect_list( count) as collected_counts \n"
    "FROM flights \n"
    "GROUP BY DEST_COUNTRY_NAME\n"
    "\n"
)

# Last expression of the cell, so its result is what the notebook displays.
spark.sql(create_view)


DataFrame[]

In [65]:
# Preview three rows of the flights_agg view.
sql = (
    "\n"
    "select * from flights_agg\n"
)
spark.sql(sql).show(n=3)


+-----------------+--------------------+
|DEST_COUNTRY_NAME|    collected_counts|
+-----------------+--------------------+
|             Chad|                 [1]|
|         Anguilla|[21, 41, 22, 21, ...|
|         Paraguay|[90, 60, 75, 85, ...|
+-----------------+--------------------+
only showing top 3 rows



In [66]:
# Explode
# Apply explode() to collected_counts and select it next to DEST_COUNTRY_NAME.
sql = (
    "\n"
    "select explode(collected_counts),\n"
    "DEST_COUNTRY_NAME\n"
    "From flights_agg\n"
)

# show() with no arguments uses its default row limit.
spark.sql(sql).show()

+---+-----------------+
|col|DEST_COUNTRY_NAME|
+---+-----------------+
|  1|             Chad|
| 21|         Anguilla|
| 41|         Anguilla|
| 22|         Anguilla|
| 21|         Anguilla|
| 34|         Anguilla|
| 19|         Anguilla|
| 90|         Paraguay|
| 60|         Paraguay|
| 75|         Paraguay|
| 85|         Paraguay|
| 90|         Paraguay|
| 85|         Paraguay|
|152|           Russia|
|176|           Russia|
|194|           Russia|
|199|           Russia|
|213|           Russia|
|183|           Russia|
|  1|            Yemen|
+---+-----------------+
only showing top 20 rows



# Functions 

In [67]:
# List the functions Spark SQL reports; show() with no arguments uses its
# default row limit.
sql = "\nshow functions\n"
spark.sql(sql).show()

+--------+
|function|
+--------+
|       !|
|      !=|
|       %|
|       &|
|       *|
|       +|
|       -|
|       /|
|       <|
|      <=|
|     <=>|
|      <>|
|       =|
|      ==|
|       >|
|      >=|
|       ^|
|     abs|
|    acos|
|   acosh|
+--------+
only showing top 20 rows

