In [1]:
from pyspark.sql import SparkSession
import getpass
# The Hive warehouse directory below is namespaced by the OS user running the notebook.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled SparkSession that targets YARN.
spark = (
    SparkSession.builder
    # Port '0' presumably lets Spark bind the UI to any free port - TODO confirm.
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Bare expression: the notebook renders the SparkSession object as the cell output.
spark

In [3]:
# Load every file matched by the orders_wh glob as CSV through the generic
# format()/option()/load() reader chain. The "header" option takes column names
# from the first line; "inferSchema" asks Spark to derive column types from the data.
orders_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "True")
    .load("/public/trendytech/orders_wh/*")
)

In [4]:
# Same CSV path and options as the format()/load() form, expressed with the
# csv() shortcut reader. Rebinds orders_df.
orders_df = spark.read.csv(
    "/public/trendytech/orders_wh/*",
    header="true",
    inferSchema="true",
)

In [5]:
# Print the first 4 rows of the CSV-backed frame as a quick sanity check.
orders_df.show(4)

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
+--------+--------------------+-----------+---------------+
only showing top 4 rows



In [6]:
# Rebind orders_df to a frame read from the JSON file; the CSV-backed frame
# is no longer reachable under this name after this cell runs.
orders_df = spark.read.json("/public/trendytech/datasets/orders.json")

In [7]:
# Print the first 4 rows of the JSON-backed frame. Note that in the output the
# columns appear as customer_id, order_date, order_id, order_status, which
# differs from the column order shown for the CSV read.
orders_df.show(4)

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
+-----------+--------------------+--------+---------------+
only showing top 4 rows



In [8]:
# Rebind orders_df to a frame read from the ORC dataset at this path.
orders_df = spark.read.orc("/public/trendytech/datasets/ordersorc")

In [9]:
# Print the first 4 rows of the ORC-backed frame.
orders_df.show(4)

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
+-----------+--------------------+--------+---------------+
only showing top 4 rows



In [10]:
# Rebind orders_df to a frame read from the Parquet dataset at this path.
# This is the frame the filtering and temp-view cells below operate on.
orders_df = spark.read.parquet("/public/trendytech/datasets/ordersparquet")

In [11]:
# Print the first 4 rows of the Parquet-backed frame.
orders_df.show(4)

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
+-----------+--------------------+--------+---------------+
only showing top 4 rows



In [12]:
# Keep only the orders belonging to customer 11599, using a SQL-style
# predicate string passed to where().
filtered_df = orders_df.where("customer_id = 11599")

In [13]:
# Show the filtered rows with truncation disabled so long values such as the
# full order_date timestamp are printed in their entirety.
filtered_df.show(truncate=False)

+-----------+---------------------+--------+------------+
|customer_id|order_date           |order_id|order_status|
+-----------+---------------------+--------+------------+
|11599      |2013-07-25 00:00:00.0|1       |CLOSED      |
|11599      |2013-10-03 00:00:00.0|11397   |COMPLETE    |
|11599      |2013-12-20 00:00:00.0|23908   |COMPLETE    |
|11599      |2014-06-27 00:00:00.0|53545   |PENDING     |
|11599      |2013-10-17 00:00:00.0|59911   |PROCESSING  |
+-----------+---------------------+--------+------------+



In [14]:
# Register orders_df as the temp view "orders" (replacing any existing view of
# that name) so it can be queried by name through spark.sql().
orders_df.createOrReplaceTempView("orders")

In [15]:
# Select the CLOSED orders from the "orders" view with Spark SQL.
# NOTE: this rebinds filtered_df, which previously held the customer 11599 filter.
filtered_df = spark.sql("select * from orders where order_status = 'CLOSED'")

In [16]:
# Display the CLOSED orders returned by the SQL query (show() with its default row limit).
filtered_df.show()

+-----------+--------------------+--------+------------+
|customer_id|          order_date|order_id|order_status|
+-----------+--------------------+--------+------------+
|      11599|2013-07-25 00:00:...|       1|      CLOSED|
|       8827|2013-07-25 00:00:...|       4|      CLOSED|
|       1837|2013-07-25 00:00:...|      12|      CLOSED|
|       1205|2013-07-25 00:00:...|      18|      CLOSED|
|      11441|2013-07-25 00:00:...|      24|      CLOSED|
|       9503|2013-07-25 00:00:...|      25|      CLOSED|
|       5863|2013-07-25 00:00:...|      37|      CLOSED|
|      12271|2013-07-25 00:00:...|      51|      CLOSED|
|       7073|2013-07-25 00:00:...|      57|      CLOSED|
|       4791|2013-07-25 00:00:...|      61|      CLOSED|
|       9111|2013-07-25 00:00:...|      62|      CLOSED|
|       3065|2013-07-25 00:00:...|      87|      CLOSED|
|       9131|2013-07-25 00:00:...|      90|      CLOSED|
|       5116|2013-07-25 00:00:...|     101|      CLOSED|
|       8763|2013-07-26 00:00:.

In [20]:
# Register orders_df as a global temp view named "orders". Per the Spark SQL
# documentation, global temp views are kept in the reserved `global_temp`
# database, so queries must refer to this view as `global_temp.orders`.
orders_df.createOrReplaceGlobalTempView("orders")

In [21]:
# Select the CLOSED orders from the global temp view registered by
# createOrReplaceGlobalTempView("orders"). Global temp views live in Spark's
# reserved `global_temp` database, so the name must be qualified: an
# unqualified `orders` resolves to the session-scoped temp view of the same
# name (or fails if none exists) and never touches the global view.
df1 = spark.sql("select * from global_temp.orders where order_status = 'CLOSED'")

In [22]:
# Display the CLOSED orders selected into df1 (show() with its default row limit).
df1.show()

+-----------+--------------------+--------+------------+
|customer_id|          order_date|order_id|order_status|
+-----------+--------------------+--------+------------+
|      11599|2013-07-25 00:00:...|       1|      CLOSED|
|       8827|2013-07-25 00:00:...|       4|      CLOSED|
|       1837|2013-07-25 00:00:...|      12|      CLOSED|
|       1205|2013-07-25 00:00:...|      18|      CLOSED|
|      11441|2013-07-25 00:00:...|      24|      CLOSED|
|       9503|2013-07-25 00:00:...|      25|      CLOSED|
|       5863|2013-07-25 00:00:...|      37|      CLOSED|
|      12271|2013-07-25 00:00:...|      51|      CLOSED|
|       7073|2013-07-25 00:00:...|      57|      CLOSED|
|       4791|2013-07-25 00:00:...|      61|      CLOSED|
|       9111|2013-07-25 00:00:...|      62|      CLOSED|
|       3065|2013-07-25 00:00:...|      87|      CLOSED|
|       9131|2013-07-25 00:00:...|      90|      CLOSED|
|       5116|2013-07-25 00:00:...|     101|      CLOSED|
|       8763|2013-07-26 00:00:.