In [1]:
# Spark Session
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .appName("Spark SQL")
    .master("local[*]")
    .enableHiveSupport()
    .config("spark.sql.warehouse.dir", "/data/output/spark-warehouse")
    .getOrCreate()
)

spark

In [9]:
# Read Employee data
_schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"

emp = spark.read.format("csv").schema(_schema).option("header", True).load("/data/input/employee_records_skewed.csv")

In [10]:
# Read DEPT CSV data
_dept_schema = "department_id int, department_name string, description string, city string, state string, country string"

dept = spark.read.format("csv").schema(_dept_schema).option("header", True).load("/data/input/department_data.csv")

In [2]:
# Spark Catalog (Metadata) - in-memory/hive

spark.conf.get("spark.sql.catalogImplementation")

'hive'

In [3]:
# Show databases
db = spark.sql("show databases")
db.show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [4]:
spark.sql("show tables in default").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|emp_final|      false|
+---------+---------+-----------+



In [12]:
# Register dataframes are temp views

emp.createOrReplaceTempView("emp_view")

dept.createOrReplaceTempView("dept_view")


In [None]:
# Show tables/view in catalog



In [16]:
# View data from table

emp_filtered = spark.sql("""
    select * from emp_view
    where department_id = 1
""")

In [17]:
emp_filtered.show()

+-----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+
| first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|
+-----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+
|       Carl|  Peterson|         Proofreader|1984-11-23|andrew20@example.net|   241-871-9102x3835|287728.0|            1|
|   Kristina|    Martin|       IT consultant|1964-02-23|autumn05@example.com|       (625)327-0615|563768.0|            1|
|   Benjamin|     Lopez|Agricultural engi...|1966-01-20|  ryan46@example.org| +1-256-376-8069x339|891725.0|            1|
|     Leslie| Rodriguez|Horticulturist, c...|1973-06-16|thomassutton@exam...|001-630-539-4136x...|875940.0|            1|
|     Angela|    Martin|   Company secretary|1979-07-07| qholmes@example.org|001-267-831-8987x...|485302.0|            1|
|      Julia|     Gomez|

In [21]:
# Create a new column dob_year and register as temp view

emp_temp = spark.sql("""
    select e.*, date_format(dob, 'yyyy') as dob_year from emp_view e
""")


In [22]:
emp_temp.createOrReplaceTempView("emp_temp_view")

In [24]:
spark.sql("select * from emp_temp_view").show()

+----------+---------+--------------------+----------+--------------------+--------------------+--------+-------------+--------+
|first_name|last_name|           job_title|       dob|               email|               phone|  salary|department_id|dob_year|
+----------+---------+--------------------+----------+--------------------+--------------------+--------+-------------+--------+
|  Samantha|    Brown|Diagnostic radiog...|1966-06-11| jwatson@example.com|       (428)806-5154|439679.0|            3|    1966|
|    Justin|Castaneda|Human resources o...|1996-11-11|  sdavis@example.org|    001-581-642-9621| 97388.0|            4|    1996|
|      Carl| Peterson|         Proofreader|1984-11-23|andrew20@example.net|   241-871-9102x3835|287728.0|            1|    1984|
| Catherine|     Lane|    Location manager|1966-06-21|elizabethalexande...|   470.866.4415x0739|174151.0|            3|    1966|
|     Aaron|  Delgado|Teacher, secondar...|1972-10-11|uwilliams@example...|   384.336.5759x4831|2

In [13]:
# Join emp and dept - HINTs

emp_final = spark.sql("""
    select /*+ BROADCAST(d) */
    e.* , d.department_name
    from emp_view e left outer join dept_view d
    on e.department_id = d.department_id
""")

In [9]:
# Show emp data

emp_final.show()

+----------+---------+--------------------+----------+--------------------+--------------------+--------+-------------+--------------------+
|first_name|last_name|           job_title|       dob|               email|               phone|  salary|department_id|     department_name|
+----------+---------+--------------------+----------+--------------------+--------------------+--------+-------------+--------------------+
|  Samantha|    Brown|Diagnostic radiog...|1966-06-11| jwatson@example.com|       (428)806-5154|439679.0|            3|Pittman, Hess and...|
|    Justin|Castaneda|Human resources o...|1996-11-11|  sdavis@example.org|    001-581-642-9621| 97388.0|            4|Smith, Snyder and...|
|      Carl| Peterson|         Proofreader|1984-11-23|andrew20@example.net|   241-871-9102x3835|287728.0|            1|         Bryan-James|
| Catherine|     Lane|    Location manager|1966-06-21|elizabethalexande...|   470.866.4415x0739|174151.0|            3|Pittman, Hess and...|
|     Aaron| 

In [14]:
# Write the data as Table

emp_final.write.format("parquet").saveAsTable("emp_final")

In [5]:
# Read the data from Table

emp_new = spark.sql("select * from emp_final")

In [6]:
emp_new.show()

+----------+---------+--------------------+----------+--------------------+--------------------+--------+-------------+--------------------+
|first_name|last_name|           job_title|       dob|               email|               phone|  salary|department_id|     department_name|
+----------+---------+--------------------+----------+--------------------+--------------------+--------+-------------+--------------------+
|  Samantha|    Brown|Diagnostic radiog...|1966-06-11| jwatson@example.com|       (428)806-5154|439679.0|            3|Pittman, Hess and...|
|    Justin|Castaneda|Human resources o...|1996-11-11|  sdavis@example.org|    001-581-642-9621| 97388.0|            4|Smith, Snyder and...|
|      Carl| Peterson|         Proofreader|1984-11-23|andrew20@example.net|   241-871-9102x3835|287728.0|            1|         Bryan-James|
| Catherine|     Lane|    Location manager|1966-06-21|elizabethalexande...|   470.866.4415x0739|174151.0|            3|Pittman, Hess and...|
|     Aaron| 

In [None]:
# Persist metadata



In [8]:
# Show details of metadata

spark.sql("describe extended emp_final").show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|          first_name|              string|   null|
|           last_name|              string|   null|
|           job_title|              string|   null|
|                 dob|              string|   null|
|               email|              string|   null|
|               phone|              string|   null|
|              salary|              double|   null|
|       department_id|                 int|   null|
|     department_name|              string|   null|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|            Database|             default|       |
|               Table|           emp_final|       |
|               Owner|                root|       |
|        Created Time|Sat Jan 06 10:03:...|       |
|         Last Access|             UNKNOWN|       |
|          C