### Exploring SQL query options

In [1]:
from pyspark.sql.types import Row
from datetime import datetime
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.session import SparkSession

In [2]:
# Reuse the SparkContext that is already running in this kernel, or start one
# if none exists. Calling SparkContext() directly raises an error when the
# cell is re-run, because only one context may be active per process.
sc = SparkContext.getOrCreate()
sc

In [3]:
# Obtain the session through the builder (the documented entry point); it
# attaches to the SparkContext that is already running and is safe to re-run.
spark = SparkSession.builder.getOrCreate()
# Legacy SQL entry point, kept because later cells reference `sqlContext`.
# Passing `spark` explicitly guarantees both handles share one session, so
# temporary views registered through either are visible to the other.
sqlContext = SQLContext(sc, spark)

#### Creating a dataframe with different data types

In [4]:
# Two sample movies. Every row carries an int, a string, a bool, a list,
# a dict and a datetime so the resulting DataFrame has a mix of column types.
iron_man = Row(
    certificate_id=2195194,
    movie_name="Iron Man",
    hit=True,
    category=['action', 'thriller'],
    rating={"IMDb": 7.9, 'rotten tomatoes': 7.7},
    release_time=datetime(2008, 5, 1, 13, 1, 5),
)
baywatch = Row(
    certificate_id=2195204,
    movie_name="Baywatch",
    hit=False,
    category=['comedy', 'action'],
    rating={"IMDb": 5.6, 'rotten tomatoes': 4.0},
    release_time=datetime(2017, 5, 12, 14, 2, 5),
)

# Distribute the rows as an RDD.
movies_record = sc.parallelize([iron_man, baywatch])

In [5]:
# Turn the RDD of Rows into a DataFrame (the schema is inferred from the rows)
# and print a preview of it.
movies_record_df = spark.createDataFrame(movies_record)
movies_record_df.show()

+------------------+--------------+-----+----------+--------------------+-------------------+
|          category|certificate_id|  hit|movie_name|              rating|       release_time|
+------------------+--------------+-----+----------+--------------------+-------------------+
|[action, thriller]|       2195194| true|  Iron Man|[IMDb -> 7.9, rot...|2008-05-01 13:01:05|
|  [comedy, action]|       2195204|false|  Baywatch|[IMDb -> 5.6, rot...|2017-05-12 14:02:05|
+------------------+--------------+-----+----------+--------------------+-------------------+



#### Register the dataframe as a temporary view

* The view is scoped to the Spark session that created it and is dropped when that session ends
* Registering a view is required before the dataframe can be queried with SQL

In [6]:
# Make the DataFrame queryable from SQL under the name 'records',
# replacing any view previously registered under that name.
movies_record_df.createOrReplaceTempView(name='records')

In [7]:
# Run the query through the SparkSession rather than the legacy SQLContext;
# the result is an ordinary DataFrame.
all_movie_records_df = spark.sql('SELECT * FROM records')

all_movie_records_df.show()

+------------------+--------------+-----+----------+--------------------+-------------------+
|          category|certificate_id|  hit|movie_name|              rating|       release_time|
+------------------+--------------+-----+----------+--------------------+-------------------+
|[action, thriller]|       2195194| true|  Iron Man|[IMDb -> 7.9, rot...|2008-05-01 13:01:05|
|  [comedy, action]|       2195204|false|  Baywatch|[IMDb -> 5.6, rot...|2017-05-12 14:02:05|
+------------------+--------------+-----+----------+--------------------+-------------------+



In [8]:
# Array elements are addressed by 0-based index (category[1] is the second
# entry) and map values by key. A triple-quoted string keeps the SQL readable
# without backslash line continuations inside the literal.
spark.sql("""
    SELECT certificate_id, category[1], rating["IMDb"]
    FROM records
""").show()

+--------------+-----------+------------+
|certificate_id|category[1]|rating[IMDb]|
+--------------+-----------+------------+
|       2195194|   thriller|         7.9|
|       2195204|     action|         5.6|
+--------------+-----------+------------+



In [9]:
# Boolean columns can be negated directly in the SELECT list.
# A triple-quoted string avoids backslash continuations inside the literal.
spark.sql("""
    SELECT certificate_id, NOT hit
    FROM records
""").show()

+--------------+---------+
|certificate_id|(NOT hit)|
+--------------+---------+
|       2195194|    false|
|       2195204|     true|
+--------------+---------+



### Filtering rows with `WHERE` conditions in SQL

In [10]:
# Keep only the rows whose boolean `hit` column is false.
# A triple-quoted string avoids backslash continuations inside the literal.
spark.sql("""
    SELECT *
    FROM records
    WHERE NOT hit
""").show()

+----------------+--------------+-----+----------+--------------------+-------------------+
|        category|certificate_id|  hit|movie_name|              rating|       release_time|
+----------------+--------------+-----+----------+--------------------+-------------------+
|[comedy, action]|       2195204|false|  Baywatch|[IMDb -> 5.6, rot...|2017-05-12 14:02:05|
+----------------+--------------+-----+----------+--------------------+-------------------+



In [11]:
# Filter on a value looked up inside the `rating` map column.
# A triple-quoted string avoids backslash continuations inside the literal.
spark.sql("""
    SELECT *
    FROM records
    WHERE rating["IMDb"] < 6.0
""").show()

+----------------+--------------+-----+----------+--------------------+-------------------+
|        category|certificate_id|  hit|movie_name|              rating|       release_time|
+----------------+--------------+-----+----------+--------------------+-------------------+
|[comedy, action]|       2195204|false|  Baywatch|[IMDb -> 5.6, rot...|2017-05-12 14:02:05|
+----------------+--------------+-----+----------+--------------------+-------------------+



In [12]:
# Filter on the timestamp column. The bound is written as a zero-padded
# literal and cast explicitly to TIMESTAMP so the comparison is made between
# timestamps. NOTE(review): depending on the Spark version, comparing a
# timestamp column with a bare string may be evaluated as a string
# comparison, where a non-padded value such as '0:0:0' sorts incorrectly.
# The triple-quoted string also removes the need to escape the quotes.
spark.sql("""
    SELECT *
    FROM records
    WHERE release_time >= CAST('2010-05-01 00:00:00' AS TIMESTAMP)
""").show()

+----------------+--------------+-----+----------+--------------------+-------------------+
|        category|certificate_id|  hit|movie_name|              rating|       release_time|
+----------------+--------------+-----+----------+--------------------+-------------------+
|[comedy, action]|       2195204|false|  Baywatch|[IMDb -> 5.6, rot...|2017-05-12 14:02:05|
+----------------+--------------+-----+----------+--------------------+-------------------+



#### Global temporary view

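* A temporary view shared across all sessions of the same Spark application
* It is registered in the system database `global_temp`, so queries must use the qualified name `global_temp.<view name>`
* Kept alive until the Spark application terminates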

In [13]:
# Register (or replace) the application-wide view. The plain
# createGlobalTempView fails if the name already exists, which makes the cell
# error out when it is run a second time in the same Spark application.
movies_record_df.createOrReplaceGlobalTempView('global_records')

In [14]:
# Global temporary views must be addressed through the `global_temp` database.
# Query through the SparkSession rather than the legacy SQLContext.
spark.sql('SELECT * FROM global_temp.global_records').show()

+------------------+--------------+-----+----------+--------------------+-------------------+
|          category|certificate_id|  hit|movie_name|              rating|       release_time|
+------------------+--------------+-----+----------+--------------------+-------------------+
|[action, thriller]|       2195194| true|  Iron Man|[IMDb -> 7.9, rot...|2008-05-01 13:01:05|
|  [comedy, action]|       2195204|false|  Baywatch|[IMDb -> 5.6, rot...|2017-05-12 14:02:05|
+------------------+--------------+-----+----------+--------------------+-------------------+

