In [1]:
import os
import sys
from pyspark.sql import SparkSession

# Location of the winutils Hadoop distribution. This is machine-specific:
# edit this one constant when the notebook is run somewhere else.
HADOOP_HOME = "C:\\Users\\SkJain\\Downloads\\Compressed\\winutils-master\\hadoop-3.2.2"
HADOOP_BIN = HADOOP_HOME + "\\bin"

os.environ["HADOOP_HOME"] = HADOOP_HOME

# Kept from the original cell; guarded so re-running the cell does not add
# the same entry to sys.path again.
if HADOOP_BIN not in sys.path:
    sys.path.append(HADOOP_BIN)

# sys.path only affects Python imports. Native binaries shipped in bin
# (e.g. hadoop.dll) are resolved through PATH, so expose the directory there too.
if HADOOP_BIN not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = HADOOP_BIN + os.pathsep + os.environ.get("PATH", "")

In [2]:
# Build (or reuse) the notebook-wide session: Hive support enabled, master
# 'local', application name 'SparkSql', and spark.ui.port set to "0".
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .enableHiveSupport()
    .appName('SparkSql')
    .master('local')
    .getOrCreate()
)

In [3]:
def executeQuery(query):
    """Run one SQL statement on the notebook's ``spark`` session.

    Args:
        query: SQL text, passed unchanged to ``spark.sql``.

    Returns:
        The object returned by ``spark.sql`` for that statement.
    """
    result = spark.sql(query)
    return result

In [48]:
# --- Names and locations used by the order examples ---
orders_table = "orders"
orders_partitioned_table = "orders_partitioned"
orders_data_path = "./datasets/orders"

# --- Database-level statements ---
see_databases = "SHOW DATABASES"
createdb = "CREATE DATABASE IF NOT EXISTS siddhantdb"
select_database = "USE siddhantdb"
check_current_db = "SELECT current_database()"

# --- Table-level statements ('{}' is filled with a table name via str.format) ---
see_tables = "SHOW TABLES"
drop_table = "DROP TABLE IF EXISTS {}"

# DDL for the plain orders table: four columns, stored as comma-delimited
# text. IF NOT EXISTS makes the statement safe to run again.
create_order_table = """ CREATE TABLE IF NOT EXISTS orders (
    order_id INT,
    order_date STRING,
    order_cust_id INT,
    order_status STRING
    ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
"""

# Same four data columns as above, plus a partition column order_month
# declared in PARTITIONED BY (it is not part of the column list).
create_order_table_partitioned = """ CREATE TABLE IF NOT EXISTS orders_partitioned (
    order_id INT,
    order_date STRING,
    order_cust_id INT,
    order_status STRING
    ) PARTITIONED BY (order_month STRING) 
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
"""

# Template: load a local path into a table. Placeholders are (path, table name).
load_data = "LOAD DATA LOCAL INPATH '{}' INTO TABLE {}"

# Illustrative statements for filling one static partition of orders_partitioned.
# 'filepath/order_07_file' is a placeholder for a file that already contains only
# July 2013 rows. The path has to be a quoted string literal in LOAD DATA, which
# the earlier version of this string was missing.
# (The variable names keep their original spelling because the notes refer to them.)
load_into_orders_patition = "LOAD DATA LOCAL INPATH 'filepath/order_07_file' INTO TABLE orders_partitioned PARTITION (order_month = '2013-07')"
insert_into_orders_patition = "INSERT INTO TABLE orders_partitioned PARTITION(order_month = '2013-11') SELECT * FROM orders WHERE order_date LIKE '2013-11%'"

# Inspection / maintenance templates. Placeholders: (table, row limit), (table), (table).
see_top_n_data = "SELECT * FROM {} LIMIT {}"
records_count = "SELECT COUNT(1) FROM {}"
remove_data = "TRUNCATE TABLE {}"

In [5]:
executeQuery(see_databases).show()

+----------+
| namespace|
+----------+
|   default|
|    nysedb|
|siddhantdb|
+----------+



In [6]:
executeQuery(check_current_db).show()

+------------------+
|current_database()|
+------------------+
|           default|
+------------------+



In [7]:
executeQuery(createdb).show()

++
||
++
++



In [8]:
executeQuery(select_database).show()

++
||
++
++



In [9]:
executeQuery(check_current_db).show()

+------------------+
|current_database()|
+------------------+
|        siddhantdb|
+------------------+



In [11]:
executeQuery(see_tables).show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [17]:
executeQuery(drop_table.format(orders_table)).show()

++
||
++
++



In [19]:
executeQuery(create_order_table).show()

++
||
++
++



In [20]:
executeQuery(see_tables).show()

+----------+---------+-----------+
|  database|tableName|isTemporary|
+----------+---------+-----------+
|siddhantdb|   orders|      false|
+----------+---------+-----------+



In [27]:
# Load the local files under orders_data_path into the orders table.
# The statement has no OVERWRITE clause, so it appends: running this cell
# again without dropping/truncating the table first would duplicate the rows.
executeQuery(load_data.format(orders_data_path, orders_table)).show()

++
||
++
++



In [31]:
executeQuery(see_top_n_data.format(orders_table, '10')).show()

+--------+--------------------+-------------+---------------+
|order_id|          order_date|order_cust_id|   order_status|
+--------+--------------------+-------------+---------------+
|       1|2013-07-25 00:00:...|        11599|         CLOSED|
|       2|2013-07-25 00:00:...|          256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|        12111|       COMPLETE|
|       4|2013-07-25 00:00:...|         8827|         CLOSED|
|       5|2013-07-25 00:00:...|        11318|       COMPLETE|
|       6|2013-07-25 00:00:...|         7130|       COMPLETE|
|       7|2013-07-25 00:00:...|         4530|       COMPLETE|
|       8|2013-07-25 00:00:...|         2911|     PROCESSING|
|       9|2013-07-25 00:00:...|         5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|         5648|PENDING_PAYMENT|
+--------+--------------------+-------------+---------------+



In [33]:
executeQuery(records_count.format(orders_table)).show()

+--------+
|count(1)|
+--------+
|   68883|
+--------+



## Partitioning

- each partition is equal to a particular value of a given column
- spark sql doesn't support range partitioning and bucketing. bucketing is supported in hive
- once table is created, we can add static partitions and load data into them
- spark metastore and hive both support dynamic partitioning as well, where partition will be created based on value of partition column
- partitioned table can be both managed or external

## Load vs Insert
- LOAD will copy files by dividing them into blocks
- LOAD is fastest way of getting data into spark metastore but there's minimum validation at file level
- no transformations or validations can be done at data level
- if any transformation is required while getting data into the spark metastore, we need to use the INSERT command
- Usage scenario of insert
    - change delimiters of i/p file
    - change file format
    - load data into partitioned tables
    - apply any other transformation at data level

- if we load file of any other format (say text file) to a table which is stored as some other format (say parquet), the load command will run successfully without throwing any errors (since no validation)
- in the table folder in hdfs, we can see that file actually got copied also
- but when we run the select command, it will fail, saying that the file is not in parquet format

### Inserting data using a stage table (to resolve the above issue)
- create a stage table(temp) with the file format same as data (textfile in this eg.)
- get all the data to this stage table (using load command since same file format)
- then put all the data from stage table to main table (using insert command)
    > INSERT INTO order_items SELECT * FROM order_items_stage

## Create Partitioned tables
- to check the query see value of this variable: create_order_table_partitioned
- since input data has 4 column and final table will have 5 (order_month will also be included) we cannot directly use LOAD command
- full example below

In [34]:
executeQuery(see_databases).show()

+----------+
| namespace|
+----------+
|   default|
|    nysedb|
|siddhantdb|
+----------+



In [35]:
executeQuery(check_current_db).show()

+------------------+
|current_database()|
+------------------+
|        siddhantdb|
+------------------+



In [36]:
executeQuery(see_tables).show()

+----------+---------+-----------+
|  database|tableName|isTemporary|
+----------+---------+-----------+
|siddhantdb|   orders|      false|
+----------+---------+-----------+



In [37]:
executeQuery(see_top_n_data.format(orders_table, '10')).show()

+--------+--------------------+-------------+---------------+
|order_id|          order_date|order_cust_id|   order_status|
+--------+--------------------+-------------+---------------+
|       1|2013-07-25 00:00:...|        11599|         CLOSED|
|       2|2013-07-25 00:00:...|          256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|        12111|       COMPLETE|
|       4|2013-07-25 00:00:...|         8827|         CLOSED|
|       5|2013-07-25 00:00:...|        11318|       COMPLETE|
|       6|2013-07-25 00:00:...|         7130|       COMPLETE|
|       7|2013-07-25 00:00:...|         4530|       COMPLETE|
|       8|2013-07-25 00:00:...|         2911|     PROCESSING|
|       9|2013-07-25 00:00:...|         5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|         5648|PENDING_PAYMENT|
+--------+--------------------+-------------+---------------+



In [51]:
# Remove any earlier orders_partitioned table before it is created again
# below; the result of the DROP is not displayed.
executeQuery(drop_table.format(orders_partitioned_table))
# List the tables that remain in the current database.
executeQuery(see_tables).show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [52]:
executeQuery(create_order_table_partitioned).show()

++
||
++
++



In [44]:
executeQuery(see_tables).show()

+----------+------------------+-----------+
|  database|         tableName|isTemporary|
+----------+------------------+-----------+
|siddhantdb|orders_partitioned|      false|
+----------+------------------+-----------+



In [53]:
executeQuery(f"DESCRIBE FORMATTED {orders_partitioned_table}").show(200, False)

+----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                                                                        |comment|
+----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|order_id                    |int                                                                                                                                              |null   |
|order_date                  |string                                                                                                                                           |null   |
|order_cust_id               |int                                          

### Adding partitions to tables

In [54]:
# Add a single static partition with ALTER TABLE.
# (These statements are only built as strings in this cell; nothing is executed here.)
add_static_part = f"ALTER TABLE {orders_partitioned_table} ADD PARTITION (order_month='2013-07')"

# Variant for the case where order_month is an INT column: several
# partitions are added in one statement.
add_static_part_int = f"ALTER TABLE {orders_partitioned_table} ADD PARTITION (order_month=201307) PARTITION (order_month=201308) PARTITION (order_month=201309)"

### loading data into partitioned tables
- format and delimiter should match. We need to pre-partition the file according to the partition logic if we want to use the LOAD command
- assuming we split the orders file into 3 files, one for each month (07, 08, 09), using any custom logic (manually, with Python, or any other way)
- now we can load this data into a partition using the query in the 'load_into_orders_patition' variable

## Inserting data into partitions
- not always practical to directly LOAD data into tables
- raw data would usually require some initial transformation before inserting into the table, for which we use the stage table approach.
- now we can load this data into a partition using the query in the 'insert_into_orders_patition' variable

## Dynamic Partitioning
- we don't need to pre-create the partitions. They'll be automatically created when we run an Insert command in table with dynamic partition
- we need to set property 'hive.exec.dynamic.partition' to true
- and we need to set 'hive.exec.dynamic.partition.mode' to nonstrict

In [55]:
spark.sql("SELECT current_database()").show()

+------------------+
|current_database()|
+------------------+
|        siddhantdb|
+------------------+



In [56]:
spark.sql("SHOW tables").show()

+----------+------------------+-----------+
|  database|         tableName|isTemporary|
+----------+------------------+-----------+
|siddhantdb|orders_partitioned|      false|
+----------+------------------+-----------+



In [57]:
# Remove orders_partitioned so the dynamic-partitioning walkthrough can
# recreate it from scratch further down.
spark.sql("DROP TABLE IF EXISTS orders_partitioned").show()

++
||
++
++



In [58]:
spark.sql(create_order_table).show()

++
||
++
++



In [59]:
spark.sql("SHOW tables").show()

+----------+---------+-----------+
|  database|tableName|isTemporary|
+----------+---------+-----------+
|siddhantdb|   orders|      false|
+----------+---------+-----------+



In [60]:
spark.sql(load_data.format(orders_data_path, orders_table)).show()

++
||
++
++



In [63]:
spark.sql("SELECT * FROM orders LIMIT 10").show(truncate=False)

+--------+---------------------+-------------+---------------+
|order_id|order_date           |order_cust_id|order_status   |
+--------+---------------------+-------------+---------------+
|1       |2013-07-25 00:00:00.0|11599        |CLOSED         |
|2       |2013-07-25 00:00:00.0|256          |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00.0|12111        |COMPLETE       |
|4       |2013-07-25 00:00:00.0|8827         |CLOSED         |
|5       |2013-07-25 00:00:00.0|11318        |COMPLETE       |
|6       |2013-07-25 00:00:00.0|7130         |COMPLETE       |
|7       |2013-07-25 00:00:00.0|4530         |COMPLETE       |
|8       |2013-07-25 00:00:00.0|2911         |PROCESSING     |
|9       |2013-07-25 00:00:00.0|5657         |PENDING_PAYMENT|
|10      |2013-07-25 00:00:00.0|5648         |PENDING_PAYMENT|
+--------+---------------------+-------------+---------------+



In [64]:
spark.sql("SELECT count(1) FROM orders").show()

+--------+
|count(1)|
+--------+
|   68883|
+--------+



In [72]:
spark.sql(create_order_table_partitioned).show()

++
||
++
++



In [73]:
spark.sql("SHOW tables").show()

+----------+------------------+-----------+
|  database|         tableName|isTemporary|
+----------+------------------+-----------+
|siddhantdb|            orders|      false|
|siddhantdb|orders_partitioned|      false|
+----------+------------------+-----------+



### Dynamic partition part

In [65]:
spark.sql("SET hive.exec.dynamic.partition").show() #to check the current value

+--------------------+-----------+
|                 key|      value|
+--------------------+-----------+
|hive.exec.dynamic...|<undefined>|
+--------------------+-----------+



In [66]:
spark.sql("SET hive.exec.dynamic.partition.mode").show()

+--------------------+-----------+
|                 key|      value|
+--------------------+-----------+
|hive.exec.dynamic...|<undefined>|
+--------------------+-----------+



In [67]:
# Turn dynamic partitioning on and switch its mode to nonstrict, as required
# by the notes above for inserts that give no static partition value.
# Only the second statement's result is echoed as the cell output.
spark.sql("SET hive.exec.dynamic.partition=true")
spark.sql("SET hive.exec.dynamic.partition.mode=nonstrict")

DataFrame[key: string, value: string]

In [69]:
spark.sql("SET hive.exec.dynamic.partition").show()

+--------------------+-----+
|                 key|value|
+--------------------+-----+
|hive.exec.dynamic...| true|
+--------------------+-----+



In [70]:
spark.sql("SET hive.exec.dynamic.partition.mode").show()

+--------------------+---------+
|                 key|    value|
+--------------------+---------+
|hive.exec.dynamic...|nonstrict|
+--------------------+---------+



In [75]:
# Dynamic-partition insert: the PARTITION clause names order_month without a
# value, so each row's partition comes from the last SELECT column, which is
# order_date formatted as 'yyyy-MM'.
# INSERT INTO appends; running this cell again would insert the same rows a
# second time.
insert_query= """ INSERT INTO TABLE orders_partitioned PARTITION (order_month) 
    SELECT o.*, DATE_FORMAT(order_date, 'yyyy-MM') order_month from orders o"""

spark.sql(insert_query).show()

++
||
++
++



In [78]:
spark.sql("SELECT * FROM orders_partitioned LIMIT 20").show(200, truncate=False)

+--------+---------------------+-------------+---------------+-----------+
|order_id|order_date           |order_cust_id|order_status   |order_month|
+--------+---------------------+-------------+---------------+-----------+
|15488   |2013-11-01 00:00:00.0|8987         |PENDING_PAYMENT|2013-11    |
|15489   |2013-11-01 00:00:00.0|5359         |PENDING_PAYMENT|2013-11    |
|15490   |2013-11-01 00:00:00.0|10149        |COMPLETE       |2013-11    |
|15491   |2013-11-01 00:00:00.0|10635        |ON_HOLD        |2013-11    |
|15492   |2013-11-01 00:00:00.0|7784         |PENDING_PAYMENT|2013-11    |
|15493   |2013-11-01 00:00:00.0|1104         |ON_HOLD        |2013-11    |
|15494   |2013-11-01 00:00:00.0|7313         |PROCESSING     |2013-11    |
|15495   |2013-11-01 00:00:00.0|7067         |CLOSED         |2013-11    |
|15496   |2013-11-01 00:00:00.0|12153        |PENDING_PAYMENT|2013-11    |
|15497   |2013-11-01 00:00:00.0|11115        |PENDING_PAYMENT|2013-11    |
|15498   |2013-11-01 00:0

In [79]:
spark.sql("SELECT count(1) FROM orders_partitioned").show()

+--------+
|count(1)|
+--------+
|   68883|
+--------+



# ASSIGNMENT

In [80]:
# Inputs for the assignment: location of the NYSE data files, the database
# to work in, and the name of the partitioned table to create.
dataset_path = '../Downloads/Compressed/data-master/nyse_all/nyse_data'
database_name = 'nysedb'
table_name = 'nyse_eod_part'

# Task: partition on a field named tradeyear, of type INT.
# Task: populate the table with an INSERT in dynamic partition mode.

In [82]:
spark.sql("SHOW DATABASES").show()

+----------+
| namespace|
+----------+
|   default|
|    nysedb|
|siddhantdb|
+----------+



In [83]:
spark.sql("USE nysedb").show()

++
||
++
++



In [84]:
spark.sql("SELECT current_database()").show()

+------------------+
|current_database()|
+------------------+
|            nysedb|
+------------------+



In [85]:
spark.sql("SHOW TABLES").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|  nysedb| nyse_eod|      false|
+--------+---------+-----------+



In [86]:
# DDL for the partitioned NYSE table named by table_name: seven data columns,
# partitioned by tradeyear (INT), stored as comma-delimited text.
create_nyse_part_table =  f""" CREATE TABLE IF NOT EXISTS {table_name}(
    stockticker STRING,
    tradeDate STRING,
    openprice FLOAT,
    highprice FLOAT,
    lowprice FLOAT,
    closeprice FLOAT,
    volume BIGINT
) PARTITIONED BY (tradeyear INT) 
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
"""
# Create the table; the returned DataFrame is echoed as the cell output.
spark.sql(create_nyse_part_table)

DataFrame[]

In [87]:
spark.sql("SHOW TABLES").show()

+--------+-------------+-----------+
|database|    tableName|isTemporary|
+--------+-------------+-----------+
|  nysedb|     nyse_eod|      false|
|  nysedb|nyse_eod_part|      false|
+--------+-------------+-----------+



In [89]:
spark.sql("SELECT * FROM nyse_eod LIMIT 10").show()

+-----------+---------+---------+---------+--------+----------+------+
|stockticker|tradeDate|openprice|highprice|lowprice|closeprice|volume|
+-----------+---------+---------+---------+--------+----------+------+
|         AA| 19970101|    47.82|    47.82|   47.82|     47.82|     0|
|        ABC| 19970101|     6.03|     6.03|    6.03|      6.03|     0|
|        ABM| 19970101|     9.25|     9.25|    9.25|      9.25|     0|
|        ABT| 19970101|    25.37|    25.37|   25.37|     25.37|     0|
|        ABX| 19970101|    28.75|    28.75|   28.75|     28.75|     0|
|        ACP| 19970101|     9.12|     9.12|    9.12|      9.12|     0|
|        ACV| 19970101|     16.0|     16.0|    16.0|      16.0|     0|
|        ADC| 19970101|    21.37|    21.37|   21.37|     21.37|     0|
|        ADM| 19970101|    17.24|    17.24|   17.24|     17.24|     0|
|        ADX| 19970101|    13.16|    13.16|   13.16|     13.16|     0|
+-----------+---------+---------+---------+--------+----------+------+



In [94]:
# Preview the derived partition value before inserting: the first four
# characters of tradeDate, cast to INT, exposed as tradeyear.
spark.sql("SELECT n.*, cast(substr(tradeDate, 1, 4) as int) tradeyear FROM nyse_eod n LIMIT 10").show()

+-----------+---------+---------+---------+--------+----------+------+---------+
|stockticker|tradeDate|openprice|highprice|lowprice|closeprice|volume|tradeyear|
+-----------+---------+---------+---------+--------+----------+------+---------+
|         AA| 19970101|    47.82|    47.82|   47.82|     47.82|     0|     1997|
|        ABC| 19970101|     6.03|     6.03|    6.03|      6.03|     0|     1997|
|        ABM| 19970101|     9.25|     9.25|    9.25|      9.25|     0|     1997|
|        ABT| 19970101|    25.37|    25.37|   25.37|     25.37|     0|     1997|
|        ABX| 19970101|    28.75|    28.75|   28.75|     28.75|     0|     1997|
|        ACP| 19970101|     9.12|     9.12|    9.12|      9.12|     0|     1997|
|        ACV| 19970101|     16.0|     16.0|    16.0|      16.0|     0|     1997|
|        ADC| 19970101|    21.37|    21.37|   21.37|     21.37|     0|     1997|
|        ADM| 19970101|    17.24|    17.24|   17.24|     17.24|     0|     1997|
|        ADX| 19970101|    1

In [95]:
# Dynamic-partition insert into the table created from `table_name` above.
# The target now uses the same variable as the CREATE statement, so the two
# cannot drift apart if the name is changed.
# tradeyear has no value in the PARTITION clause; each row's partition comes
# from the last SELECT column (first four characters of tradeDate, cast to INT).
# INSERT INTO appends, so re-running this cell would insert the rows again.
insert_data_query = (
    f"INSERT INTO TABLE {table_name} PARTITION (tradeyear) "
    "SELECT n.*, cast(substr(tradeDate, 1, 4) as int) tradeyear FROM nyse_eod n"
)
spark.sql(insert_data_query)

DataFrame[]

In [96]:
spark.sql("SELECT COUNT(1) FROM nyse_eod").show()

+--------+
|count(1)|
+--------+
| 9384739|
+--------+



In [97]:
spark.sql("SELECT COUNT(1) FROM nyse_eod_part").show()

+--------+
|count(1)|
+--------+
| 9384739|
+--------+



In [None]:
# Both tables have the same row count, and in the file manager I checked that one folder per partition was created, named 'tradeyear=1997' up to 'tradeyear=2017'.