In [1]:
from pyspark.sql import SparkSession
import os

# Point the Hadoop / Hive tooling at the local installations.
# The Hive lib and bin folders live directly under the Hive home, so they are derived from it.
hive_home = "C:\\Users\\SkJain\\Documents\\BigDataStackWorkspace\\SparkLearn\\apache-hive-3.1.3-bin"

os.environ.update({
    "HADOOP_HOME": "C:\\Users\\SkJain\\Downloads\\Compressed\\winutils-master\\hadoop-3.2.2",
    "HIVE_HOME": hive_home,
    "HIVE_LIB": hive_home + "\\lib",
    "HIVE_BIN": hive_home + "\\bin",
    "HADOOP_USER_CLASSPATH_FIRST": "true",
})

In [None]:
# Create the SparkSession for this notebook, or reuse one that is already running.
# Hive support is enabled so that the SQL statements below go through the Hive metastore.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")  # NOTE(review): "0" presumably lets a free UI port be picked - confirm
    .enableHiveSupport()
    .appName('SparkSql')
    .master('local')
    .getOrCreate()
)

### Creating table

In [4]:
# Prerequisites before creating a table: create the database if it doesn't exist, switch to the
# correct database, and make sure a table with the same name is not already there.
# "SHOW TABLES" lists all the tables in the current database.

# Create the table.
# IF NOT EXISTS keeps this cell re-runnable: with a plain CREATE TABLE, running the cell a second
# time against the same metastore fails because ORDERS already exists.
create_order_query = """ CREATE TABLE IF NOT EXISTS ORDERS (
    order_id INT,
    order_date STRING,
    order_cust_id INT,
    order_status STRING
    ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
"""
# TEXTFILE is also the default storage format, so the STORED AS line can be skipped
spark.sql(create_order_query)

# to insert data into the table, we can use the insert or load command
# the insert command is used to manually add one or a few records. In real-world scenarios it is used less often,
# since we mostly upload whole files, which is done using the load command

# to insert single record using insert command
# INSERT INTO orders VALUES (col1_value, col2_value ...)

#to insert multiple records we can just write them comma separated
# INSERT INTO orders VALUES (col1_value1, col2_value1 ...), (col1_value2, col2_value2 ...)

# in spark sql we can't provide values for only some of the columns like in a traditional RDBMS.
# But this feature is present in hive

# if we provide values in the wrong order, or in any other scenario where the data type of the column and the
# value provided do not match, it will not throw an error but simply store null in place of that value. If the
# value can be typecast it will be stored, so if we give an int where a string is expected, it will be typecast

### Data Types in spark sql
 - Hive and Spark SQL are almost the same, so most things here apply to Hive as well

In [None]:
# Data types, from the hive manual: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types

# The CREATE TABLE section of the manual explains every clause we write in the query:
# https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTableCreate/Drop/TruncateTable

# A more complex query: the data is stored tab-separated (tsv instead of csv) and the table
# contains an array and a struct (object) as column values.
# ROW FORMAT is written before STORED AS, which is the clause order the hive grammar expects.
query = """ CREATE TABLE students(
    student_id INT,
    student_first_name STRING,
    student_last_name STRING,
    student_phone_numbers ARRAY<STRING>,
    student_address STRUCT<street: STRING, city: STRING, state: STRING, zip: STRING>
    ) ROW FORMAT
    DELIMITED FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY ','
    STORED AS TEXTFILE"""

# a new file is created for each insert statement
# in that file, the array values and the struct field values are separated by commas,
# as specified by COLLECTION ITEMS TERMINATED BY in the query

In [None]:
# We can add a comment for each column while creating a table.
# These comments are visible when we use the command "DESCRIBE tablename".
# Comments are added using the COMMENT keyword.
# A comment can be given at table level as well as at column level.
# To see the table level comment we can use "DESCRIBE FORMATTED tablename".

# e.g. every column carries its own COMMENT, and the table-level COMMENT follows the column list
create_order_query = """ CREATE TABLE ORDERS (
    order_id INT COMMENT 'Unique order id',
    order_date STRING COMMENT 'date on which order was placed',
    order_cust_id INT COMMENT 'customer who placed this order',
    order_status STRING COMMENT 'current status of the order'
    ) COMMENT 'Table to store details of orders' 
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
"""

In [None]:
# Loading data into tables
# The file format we specify in the CREATE TABLE query is about how the data will be saved in hive.
# It is preferred to load data from an input file in that same format: otherwise it won't throw
# errors, but it will start storing incorrect results.

# Load data from the local file system (LOCAL keyword)
load_query_local = "LOAD DATA LOCAL INPATH 'filepathOrFolderPath' INTO TABLE ORDERS"

# Load data from the HDFS file system (without the LOCAL keyword)
# - the user should have write permission on the source
# - the data will be MOVED into the table's storage (deleted from the source)
load_query_hdfs = "LOAD DATA INPATH 'filepathOrFolderPath' INTO TABLE ORDERS"

# `load_query` used to be assigned twice, so the LOCAL example above was silently overwritten.
# The name is kept with the value it ended up with, so anything that still reads it keeps working.
load_query = load_query_hdfs

# While loading into an existing table we can either append the data or overwrite all the current data.
# The queries used so far append the data.
load_query_append = "LOAD DATA INPATH 'filepathOrFolderPath' INTO TABLE ORDERS"
# If a file with the same name as the one being loaded is already present in the table,
# _copy_1, _copy_2 and so on is appended to the filename.

# To overwrite the table we need to specify the OVERWRITE keyword before INTO
load_query_overwrite = "LOAD DATA INPATH 'filepathOrFolderPath' OVERWRITE INTO TABLE ORDERS"

In [None]:
# External tables in hive
# We need to add the EXTERNAL keyword after CREATE while creating a new table.
# We also need to specify LOCATION at the end, which contains the path of the folder in hdfs.
# The data will not be stored in the spark metastore but at the mentioned location.
create_order_query = """ CREATE EXTERNAL TABLE ORDERS (
    order_id INT,
    order_date STRING,
    order_cust_id INT,
    order_status STRING
    ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
    LOCATION 'FOLDERpATH'
"""

# When we drop a managed table, it deletes all the metadata from the spark metastore
# as well as the actual data of this table from hdfs.
# For external tables, only the metadata is deleted from the spark metastore; the actual data is preserved at that path.
# Typically external tables are used when the same data is processed by multiple frameworks like pig, spark etc.
# We cannot run the TRUNCATE command against external tables.

In [None]:
# File formats in hive
# file_format:
#   : SEQUENCEFILE (outdated)
#   | TEXTFILE    -- (Default, depending on hive.default.fileformat configuration)
#   | RCFILE      -- (Note: Available in Hive 0.6.0 and later)
#   | ORC         -- (Note: Available in Hive 0.11.0 and later)
#   | PARQUET     -- (Note: Available in Hive 0.13.0 and later)
#   | AVRO        -- (Note: Available in Hive 0.14.0 and later)
#   | JSONFILE    -- (Note: Available in Hive 4.0.0 and later)
#   | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname

# ORC, parquet and avro are the most popular among all the file formats

# The students table definition is the same for both formats; only the STORED AS value differs,
# so the shared part of the statement is written once and the format name is appended to it.
students_ddl_prefix = """ CREATE TABLE students(
    student_id INT,
    student_first_name STRING,
    student_last_name STRING,
    student_phone_numbers ARRAY<STRING>,
    student_address STRUCT<street: STRING, city: STRING, state: STRING, zip: STRING>
    ) STORED AS """

query_avro = students_ddl_prefix + "AVRO"

query_parquet = students_ddl_prefix + "PARQUET"

# Parquet files use snappy compression by default, so the file extension will be .snappy.parquet

In [None]:
# To drop a table
# DROP TABLE tablename
# DROP TABLE IF EXISTS tablename

# To drop a database
# DROP DATABASE IF EXISTS dbname
# DROP DATABASE IF EXISTS dbname CASCADE

# TRUNCATE TABLE removes the data but preserves the structure of the table.
# It only works for managed tables.
# The statement is held in a string like the other queries in this notebook: written bare in a
# Python cell it is a SyntaxError and the cell cannot run.
truncate_query = "TRUNCATE TABLE tablename"