In [9]:
from pyspark.sql import SparkSession
import os

# Register the local Hadoop (winutils) and Hive install locations as environment
# variables in a single update. HIVE_LIB and HIVE_BIN are the lib/bin
# sub-directories of HIVE_HOME.
# NOTE(review): these are machine-specific absolute paths; adjust per workstation.
os.environ.update({
    "HADOOP_HOME": "C:\\Users\\SkJain\\Downloads\\Compressed\\winutils-master\\hadoop-3.2.2",
    "HIVE_HOME": "C:\\Users\\SkJain\\Documents\\BigDataStackWorkspace\\SparkLearn\\apache-hive-3.1.3-bin",
    "HIVE_LIB": "C:\\Users\\SkJain\\Documents\\BigDataStackWorkspace\\SparkLearn\\apache-hive-3.1.3-bin\\lib",
    "HIVE_BIN": "C:\\Users\\SkJain\\Documents\\BigDataStackWorkspace\\SparkLearn\\apache-hive-3.1.3-bin\\bin",
    "HADOOP_USER_CLASSPATH_FIRST": "true",
})

In [18]:
# Build (or reuse) a SparkSession named 'SparkSql' on the 'local' master with
# Hive support enabled.
# NOTE(review): "spark.ui.port" = "0" presumably lets Spark pick a free UI port — confirm.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .enableHiveSupport()
    .appName('SparkSql')
    .master('local')
    .getOrCreate()
)

In [22]:
# spark = SparkSession. \
#     builder. \
#     config("spark.ui.port", "0"). \     
#     enableHiveSupport(). \
#     appName('SparkSql'). \
#     master('local'). \
#     getOrCreate()
# # Optional builder settings that can be added to the chain above if needed:
# #     config("spark.sql.warehouse.dir","./spark-warehouse"). \
# #     config("spark.sql.catalogImplementation","hive"). \

# Problem Statement
**Get Daily product revenue**
- This is the complete use case; it uses most of the major types of Spark SQL operations mentioned in the Spark notes file

## Prepare Tables
- tables needed:
    - orders
    - order_items

In [19]:
# Show which database the session is currently using.
(
    spark
    .sql("SELECT current_database()")
    .show()
)

+------------------+
|current_database()|
+------------------+
|           default|
+------------------+



In [21]:
# Create the working database only if it does not exist yet ...
spark.sql("CREATE DATABASE IF NOT EXISTS siddhantdb")
# ... then list all databases to confirm it is there.
(
    spark
    .sql("SHOW databases")
    .show()
)

In [None]:
# Switch the session to the working database ...
spark.sql("USE siddhantdb")
# ... and display the current database to verify the switch.
(
    spark
    .sql("SELECT current_database()")
    .show()
)

### Creating Orders Table

In [None]:
# Remove any previous Orders table so the CREATE TABLE cell below can be re-run.
(
    spark
    .sql("DROP TABLE IF EXISTS Orders")
)

In [None]:
# DDL for the ORDERS table: four columns, stored as comma-delimited text rows.
# The statement is assembled from adjacent string literals, one SQL line each.
create_order_query = (
    " CREATE TABLE ORDERS (\n"
    "    order_id INT,\n"
    "    order_date STRING,\n"
    "    order_cust_id INT,\n"
    "    order_status STRING\n"
    "    ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n"
)

spark.sql(create_order_query)

In [26]:
# Load the local order files (glob pattern) into the orders table created above.
ordersFilePath = 'datasets/orders/*'
load_data_query = (
    f"LOAD DATA LOCAL INPATH '{ordersFilePath}' "
    f"INTO TABLE orders"
)
spark.sql(load_data_query)

In [None]:
# Preview the first 10 rows of ORDERS to sanity-check the load.
(
    spark
    .sql("SELECT * FROM ORDERS LIMIT 10")
    .show()
)

In [None]:
# Row count of ORDERS.
(
    spark
    .sql("SELECT count(1) FROM ORDERS")
    .show()
)

### Creating Order Items Table

In [28]:
# Remove any previous order_items table so the CREATE TABLE cell below can be re-run.
(
    spark
    .sql("DROP TABLE IF EXISTS order_items")
)

In [30]:
# DDL for the order_items table: six columns, stored as comma-delimited text rows.
# The statement is assembled from adjacent string literals, one SQL line each.
create_order_item_query = (
    " CREATE TABLE order_items (\n"
    "    order_item_id INT,\n"
    "    order_item_order_id INT,\n"
    "    order_item_prod_id INT,\n"
    "    order_item_quantity INT,\n"
    "    order_item_subtotal FLOAT,\n"
    "    order_item_prod_price FLOAT\n"
    "    ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n"
)

spark.sql(create_order_item_query)

In [None]:
# Load the local order-item files (glob pattern) into the order_items table.
orderItemsFilePath = 'datasets/order_items/*'
load_data_query = (
    f"LOAD DATA LOCAL INPATH '{orderItemsFilePath}' "
    f"INTO TABLE order_items"
)
spark.sql(load_data_query)

In [None]:
# Preview the first 10 rows of order_items to sanity-check the load.
(
    spark
    .sql("SELECT * FROM order_items LIMIT 10")
    .show()
)

In [None]:
# Row count of order_items.
(
    spark
    .sql("SELECT count(1) FROM order_items")
    .show()
)

### Projection
- selecting only the required columns

In [None]:
# List the columns (and their types) available in the orders table.
(
    spark
    .sql("DESCRIBE orders")
    .show()
)

In [None]:
# Select only the subset of columns we need.
# These columns (order_cust_id, order_date, order_status) are defined on the
# `orders` table, not on `order_items`, so the query must read from `orders`.
spark.sql("SELECT order_cust_id, order_date, order_status FROM orders").show()

In [None]:
# Derived column: for this use case we only need the year-month part of the
# order date, so format order_date as 'yyyy-MM' and expose it as order_month.
# The referenced columns belong to the `orders` table.

query = """ SELECT order_cust_id,
    date_format(order_date, 'yyyy-MM') as order_month,
    order_status FROM orders """
spark.sql(query).show()

In [None]:
# Get only the distinct values of a column. order_status is a column of
# `orders`, so that is the table to query.
spark.sql("SELECT DISTINCT order_status FROM orders").show()
# More generally, DISTINCT * eliminates rows that are complete duplicates.
spark.sql("SELECT DISTINCT * FROM order_items").show()

### Filtering
- selecting only the required data from entire dataset

In [None]:
# Keep only the records whose order status is 'COMPLETE'.
# order_status lives on the `orders` table.
spark.sql("SELECT * FROM orders WHERE order_status='COMPLETE'").show()

In [None]:
# Match against several order statuses at once with IN.
spark.sql("SELECT * FROM orders WHERE order_status IN ('COMPLETE', 'CLOSED')").show()
# Equivalent form using OR. OR is normally used when the conditions are on
# different columns; for this use case 'IN' is preferred.
# (The statement has no trailing ')' — an unbalanced parenthesis is a SQL parse error.)
spark.sql("SELECT * FROM orders WHERE order_status = 'COMPLETE' OR order_status = 'CLOSED'").show()

In [None]:
# Multiple conditions and pattern matching:
# all orders with the statuses below that were placed in Jan-2014.
query = """ SELECT * FROM orders WHERE order_status IN ('COMPLETE', 'CLOSED')
    AND order_date LIKE '2014-01-%'"""
# Run the query built above (passing an empty string would not execute it).
spark.sql(query).show()

# Same filter, expressed with a derived column in the condition.
query = """ SELECT * FROM orders WHERE order_status IN ('COMPLETE', 'CLOSED')
    AND date_format(order_date, 'yyyy-MM')='2014-01'"""
spark.sql(query).show()

# use is null and is not null to check for null values