In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [8]:
### Configure the session builder with this notebook's application name,
### then obtain the session (an already-running one is reused by getOrCreate).
session_builder = SparkSession.builder.appName("departuresDBcreate")
spark = session_builder.getOrCreate()

## Creating SQL Databases & Tables

In [9]:
### First we'll create the database itself.
### IF NOT EXISTS keeps this cell re-runnable: a plain CREATE DATABASE raises
### once learn_spark_db is already registered in the catalog.
spark.sql("CREATE DATABASE IF NOT EXISTS learn_spark_db")
### Make it the current database so the unqualified table names used later resolve to it.
spark.sql("USE learn_spark_db")

DataFrame[]

#### Creating a Managed Table

In [4]:
### Method 1: Issue a SQL CREATE TABLE statement. The statement only declares the table
### name and its columns; it loads no rows. Left commented out because Method 2 is used instead.
#spark.sql("CREATE TABLE managed_us_delay_flights_tbl (date STRING, delay INT, distance INT, origin STRING, destination STRING)")

In [10]:
### Method 2: Using the DataFrame API

### Relative path instead of a user-specific absolute one, so the notebook is not tied
### to a single machine. Assumes the kernel's working directory is the `notebooks`
### folder that sits next to `datasets` -- adjust if the notebook is launched elsewhere.
csv = "../datasets/departuredelays.csv"
schema = "date STRING, delay INT, distance INT, origin STRING, destination STRING"

### NOTE(review): no header option is passed, so if the file starts with a header line
### it would be parsed as a data row -- confirm, and pass header=True if needed.
flights_df = spark.read.csv(csv, schema=schema)

### saveAsTable fails by default when the table already exists; overwrite mode lets
### this cell be re-run when the table is already registered.
### No path is given, so Spark stores the data itself (a managed table).
flights_df.write.mode("overwrite").saveAsTable("managed_us_delay_flights_tbl")

#### Creating an Unmanaged Table

In [12]:
### Supplying an explicit "path" is what distinguishes this from the managed table:
### the data files are written to that location rather than the warehouse directory.
### mode("overwrite") replaces the default error-if-exists behaviour so the cell
### can be re-run when the table is already registered.
(flights_df
    .write
    .mode("overwrite")
    .option("path", "/tmp/data/us_flights_delay")
    .saveAsTable("us_delay_flights_tbl")
    )

## Temporary Views

Temporary views in Spark SQL are session-scoped and disappear when the session that created them terminates. If we want a temporary view that is shared among all sessions and is kept alive until the Spark application terminates, we can create a global temporary view.

#### Creating Views

In [13]:
### Step 1 of creating views: build the DataFrames that will back them.
### Each query projects date, delay and origin from us_delay_flights_tbl for one origin airport.

sfo_query = """SELECT date, delay, origin
                      FROM us_delay_flights_tbl WHERE origin = 'SFO'
                      """

jfk_query = """SELECT date, delay, origin
                      FROM us_delay_flights_tbl WHERE origin = 'JFK'
                      """

df_sfo = spark.sql(sfo_query)
df_jfk = spark.sql(jfk_query)

In [14]:
### Register each DataFrame under a view name:
###   - the SFO rows as a global temp view
###   - the JFK rows as an ordinary (session-scoped) temp view
sfo_global_view = "us_origin_airport_SFO_global_tmp_view"
jfk_view = "us_origin_airport_JFK_tmp_view"

df_sfo.createOrReplaceGlobalTempView(sfo_global_view)
df_jfk.createOrReplaceTempView(jfk_view)

In [22]:
### Views are queried with standard SQL.
### A global temp view has to be addressed with the 'global_temp.' prefix.
sfo_rows = spark.sql("SELECT * FROM global_temp.us_origin_airport_SFO_global_tmp_view")
sfo_rows.show(5)

+--------+-----+------+
|    date|delay|origin|
+--------+-----+------+
|01011250|   55|   SFO|
|01012230|    0|   SFO|
|01010705|   -7|   SFO|
|01010620|   -3|   SFO|
|01010915|   -3|   SFO|
+--------+-----+------+
only showing top 5 rows



In [23]:
### An ordinary temp view needs no 'global_temp.' prefix.
jfk_rows = spark.sql("SELECT * FROM us_origin_airport_JFK_tmp_view")
jfk_rows.show(5)

+--------+-----+------+
|    date|delay|origin|
+--------+-----+------+
|02010900|   -1|   JFK|
|02011200|   -5|   JFK|
|02011030|   -6|   JFK|
|02011900|   -1|   JFK|
|02011700|   -3|   JFK|
+--------+-----+------+
only showing top 5 rows



In [24]:
### The DataFrame API can read the view as well, without writing any SQL.
jfk_view_df = spark.read.table("us_origin_airport_JFK_tmp_view")
jfk_view_df.show(5)

+--------+-----+------+
|    date|delay|origin|
+--------+-----+------+
|02010900|   -1|   JFK|
|02011200|   -5|   JFK|
|02011030|   -6|   JFK|
|02011900|   -1|   JFK|
|02011700|   -3|   JFK|
+--------+-----+------+
only showing top 5 rows



#### Dropping Views

In [25]:
### Views are dropped much like tables. In SQL this would be (the global temp view
### lives in the global_temp database, so it needs that qualifier):
###   DROP VIEW IF EXISTS global_temp.us_origin_airport_SFO_global_tmp_view;
###   DROP VIEW IF EXISTS us_origin_airport_JFK_tmp_view
### Here the catalog API is used instead. The value displayed under the cell is the
### return value of the last call.

catalog = spark.catalog
catalog.dropGlobalTempView("us_origin_airport_SFO_global_tmp_view")
catalog.dropTempView("us_origin_airport_JFK_tmp_view")

True

To reiterate the difference between global temp views and temp views: a temp view is tied to a single Spark session, while a global temp view is visible across multiple Spark sessions within the same Spark application.

## Viewing Metadata

In [31]:
### List the databases in the catalog; each entry carries a name, description and location URI.
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/C:/Users/sean.cornillie/Education/LearningSparkV2/Spark_Dev/notebooks/spark-warehouse'),
 Database(name='learn_spark_db', description='', locationUri='file:/C:/Users/sean.cornillie/Education/LearningSparkV2/Spark_Dev/notebooks/spark-warehouse/learn_spark_db.db')]

In [32]:
### List the tables with their database, table type (MANAGED / EXTERNAL) and whether they are temporary.
### No database argument is passed -- presumably the current database is used; confirm.
spark.catalog.listTables()

[Table(name='managed_us_delay_flights_tbl', database='learn_spark_db', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='us_delay_flights_tbl', database='learn_spark_db', description=None, tableType='EXTERNAL', isTemporary=False)]

In [33]:
### Column metadata for one table: name, data type, nullability, and partition/bucket flags.
spark.catalog.listColumns("us_delay_flights_tbl")

[Column(name='date', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='delay', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='distance', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='origin', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='destination', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False)]

## Reading Tables into DataFrames

In [35]:
### Two ways to load a table into a DataFrame:
###   1. run a SQL query against it
###   2. ask the DataFrame API for the table by name
select_all_flights = "SELECT * FROM us_delay_flights_tbl"
us_flights_df = spark.sql(select_all_flights)

us_flights_df2 = spark.table("us_delay_flights_tbl")

In [36]:
### Preview the first 5 rows of the DataFrame loaded through the SQL query.
us_flights_df.show(5)

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01201755|    0|     449|   ORF|        ATL|
|01201610|   52|     449|   ORF|        ATL|
|01201441|    0|     449|   ORF|        ATL|
|01211755|  -15|     449|   ORF|        ATL|
|01210941|   -5|     449|   ORF|        ATL|
+--------+-----+--------+------+-----------+
only showing top 5 rows



In [37]:
### Preview the first 5 rows of the DataFrame loaded through spark.table.
us_flights_df2.show(5)

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01201755|    0|     449|   ORF|        ATL|
|01201610|   52|     449|   ORF|        ATL|
|01201441|    0|     449|   ORF|        ATL|
|01211755|  -15|     449|   ORF|        ATL|
|01210941|   -5|     449|   ORF|        ATL|
+--------+-----+--------+------+-----------+
only showing top 5 rows

