### Exploring SQL query options

In [1]:
from pyspark.sql.types import Row
from datetime import datetime
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.session import SparkSession

In [2]:
# Reuse the SparkContext that is already running when this cell is re-executed.
# A second plain SparkContext() call in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
sc

21/12/20 16:20:56 WARN Utils: Your hostname, srimac.local resolves to a loopback address: 127.0.0.1; using 192.168.1.10 instead (on interface en0)
21/12/20 16:20:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/20 16:20:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Obtain the session through the public builder API instead of calling the
# SparkSession constructor directly; getOrCreate() attaches to the SparkContext
# created above and returns the existing session when the cell is re-run.
spark = SparkSession.builder.getOrCreate()

# SQLContext is deprecated since Spark 3.0 (spark.sql(...) is the replacement);
# it is kept here because the following cells call sqlContext.sql(...).
# Passing `spark` explicitly guarantees both handles use the same session, which
# matters because temporary views are only visible inside the session that
# registered them.
sqlContext = SQLContext(sc, spark)

#### Creating a dataframe with different data types

In [5]:
# Two sample movies. Every Row carries an int, a string, a boolean, a list,
# a dict and a datetime so that the dataframe built from them has columns of
# several different data types.
iron_man = Row(
    certificate_id=2195194,
    movie_name="Iron Man",
    hit=True,
    category=["action", "thriller"],
    rating={"IMDb": 7.9, "rotten tomatoes": 7.7},
    release_time=datetime(2008, 5, 1, 13, 1, 5),
)
baywatch = Row(
    certificate_id=2195204,
    movie_name="Baywatch",
    hit=False,
    category=["comedy", "action"],
    rating={"IMDb": 5.6, "rotten tomatoes": 4.0},
    release_time=datetime(2017, 5, 12, 14, 2, 5),
)

# Distribute the rows as an RDD; the next cell converts it to a dataframe.
movies_record = sc.parallelize([iron_man, baywatch])

In [6]:
# Convert the RDD of Rows into a dataframe. No schema is passed, so the column
# names and types are derived from the Row fields.
movies_record_df = movies_record.toDF()
# Preview the rows; show() shortens long cell values (see the `rating` column).
movies_record_df.show()

                                                                                

+--------------+----------+-----+------------------+--------------------+-------------------+
|certificate_id|movie_name|  hit|          category|              rating|       release_time|
+--------------+----------+-----+------------------+--------------------+-------------------+
|       2195194|  Iron Man| true|[action, thriller]|{IMDb -> 7.9, rot...|2008-05-01 13:01:05|
|       2195204|  Baywatch|false|  [comedy, action]|{IMDb -> 5.6, rot...|2017-05-12 14:02:05|
+--------------+----------+-----+------------------+--------------------+-------------------+



#### Register the dataframe as a temporary view

* The view is scoped to the Spark session that created it and is dropped when that session ends
* Registering the view gives the dataframe a table name that SQL queries can refer to

In [7]:
# Expose the dataframe to SQL under the table name 'records'. The "OrReplace"
# variant overwrites an existing view of that name, so the cell can be re-run.
movies_record_df.createOrReplaceTempView('records')

In [8]:
# Read every column of every row back through the 'records' view.
select_all_query = 'SELECT * FROM records'
all_movie_records_df = sqlContext.sql(select_all_query)
all_movie_records_df.show()

+--------------+----------+-----+------------------+--------------------+-------------------+
|certificate_id|movie_name|  hit|          category|              rating|       release_time|
+--------------+----------+-----+------------------+--------------------+-------------------+
|       2195194|  Iron Man| true|[action, thriller]|{IMDb -> 7.9, rot...|2008-05-01 13:01:05|
|       2195204|  Baywatch|false|  [comedy, action]|{IMDb -> 5.6, rot...|2017-05-12 14:02:05|
+--------------+----------+-----+------------------+--------------------+-------------------+



In [9]:
# category[1] selects the second element of the array column (indexing starts
# at 0); rating["IMDb"] looks up a single value in the map column by its key.
nested_fields_query = 'SELECT certificate_id, category[1], rating["IMDb"] \
                FROM records'
sqlContext.sql(nested_fields_query).show()

+--------------+-----------+------------+
|certificate_id|category[1]|rating[IMDb]|
+--------------+-----------+------------+
|       2195194|   thriller|         7.9|
|       2195204|     action|         5.6|
+--------------+-----------+------------+



In [9]:
# Project the logical negation of the boolean `hit` column next to each id.
inverted_hit_query = 'SELECT certificate_id, NOT hit \
                FROM records'
sqlContext.sql(inverted_hit_query).show()

+--------------+---------+
|certificate_id|(NOT hit)|
+--------------+---------+
|       2195194|    false|
|       2195204|     true|
+--------------+---------+



### Conditional statements in SQL 

In [10]:
# Keep only the rows whose boolean `hit` column is false.
non_hits_query = 'SELECT * FROM records \
                WHERE NOT hit'
sqlContext.sql(non_hits_query).show()

+----------------+--------------+-----+----------+--------------------+-------------------+
|        category|certificate_id|  hit|movie_name|              rating|       release_time|
+----------------+--------------+-----+----------+--------------------+-------------------+
|[comedy, action]|       2195204|false|  Baywatch|[IMDb -> 5.6, rot...|2017-05-12 14:02:05|
+----------------+--------------+-----+----------+--------------------+-------------------+



In [11]:
# Filter on a value stored inside the `rating` map: IMDb score below 6.0.
low_imdb_query = 'SELECT * FROM records \
                WHERE rating["IMDb"] < 6.0'
sqlContext.sql(low_imdb_query).show()

+----------------+--------------+-----+----------+--------------------+-------------------+
|        category|certificate_id|  hit|movie_name|              rating|       release_time|
+----------------+--------------+-----+----------+--------------------+-------------------+
|[comedy, action]|       2195204|false|  Baywatch|[IMDb -> 5.6, rot...|2017-05-12 14:02:05|
+----------------+--------------+-----+----------+--------------------+-------------------+



In [12]:
# Keep the movies released on or after 1 May 2010.
# The bound is written as a typed TIMESTAMP literal in canonical
# 'yyyy-MM-dd HH:mm:ss' form, so the filter compares timestamp with timestamp
# instead of relying on implicit coercion of the loosely formatted string
# '2010-05-01 0:0:0'. A double-quoted Python string avoids escaping the SQL quotes.
released_from_may_2010_query = "SELECT * FROM records \
                WHERE release_time >= TIMESTAMP '2010-05-01 00:00:00'"
sqlContext.sql(released_from_may_2010_query).show()

+----------------+--------------+-----+----------+--------------------+-------------------+
|        category|certificate_id|  hit|movie_name|              rating|       release_time|
+----------------+--------------+-----+----------+--------------------+-------------------+
|[comedy, action]|       2195204|false|  Baywatch|[IMDb -> 5.6, rot...|2017-05-12 14:02:05|
+----------------+--------------+-----+----------+--------------------+-------------------+



#### Global temporary view

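* A temporary view that is shared across all sessions of the same Spark application
* It is kept alive until the Spark application terminates
* It is registered in the system database `global_temp`, so queries must use the qualified name, e.g. `global_temp.global_records`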

In [11]:
# Register the dataframe as an application-wide (global) temporary view.
# createOrReplaceGlobalTempView is used instead of createGlobalTempView so that
# re-running this cell replaces the view rather than failing because
# 'global_records' already exists, consistent with how the session-scoped
# 'records' view is registered earlier in this notebook.
movies_record_df.createOrReplaceGlobalTempView('global_records')

In [12]:
# The global view is addressed through the `global_temp` qualifier.
global_view_query = 'SELECT * FROM global_temp.global_records'
sqlContext.sql(global_view_query).show()

+--------------+----------+-----+------------------+--------------------+-------------------+
|certificate_id|movie_name|  hit|          category|              rating|       release_time|
+--------------+----------+-----+------------------+--------------------+-------------------+
|       2195194|  Iron Man| true|[action, thriller]|{IMDb -> 7.9, rot...|2008-05-01 13:01:05|
|       2195204|  Baywatch|false|  [comedy, action]|{IMDb -> 5.6, rot...|2017-05-12 14:02:05|
+--------------+----------+-----+------------------+--------------------+-------------------+



21/12/21 02:05:25 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 13710245 ms exceeds timeout 120000 ms
21/12/21 02:05:25 WARN SparkContext: Killing executors is not supported by current scheduler.
