In [1]:
from pyspark.sql import SparkSession

import getpass
# Current OS user; used below to namespace the warehouse directory and app name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')  # NOTE(review): presumably lets Spark pick a free UI port - confirm
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Data Processing - Overview')
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Load the orders CSV files with an explicit schema rather than relying on
# inferred column names and types.
orders = (
    spark.read
    .csv(
        '/public/retail_db/orders',
        schema='''
            order_id INT, 
            order_date STRING, 
            order_customer_id INT, 
            order_status STRING
        '''
    )
)

In [5]:
# Preview the first five rows without truncating long column values.
orders.show(n=5, truncate=False)

+--------+---------------------+-----------------+---------------+
|order_id|order_date           |order_customer_id|order_status   |
+--------+---------------------+-----------------+---------------+
|1       |2013-07-25 00:00:00.0|11599            |CLOSED         |
|2       |2013-07-25 00:00:00.0|256              |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00.0|12111            |COMPLETE       |
|4       |2013-07-25 00:00:00.0|8827             |CLOSED         |
|5       |2013-07-25 00:00:00.0|11318            |COMPLETE       |
+--------+---------------------+-----------------+---------------+
only showing top 5 rows



In [6]:
# Row count of the source orders data; a baseline to compare against when
# validating what gets written out below.
orders.count()

68883

In [7]:
# Write orders as uncompressed Parquet under the user's retail_db directory,
# replacing whatever is already at the target path.
orders.write.parquet(
    f'/user/{username}/retail_db/orders',
    mode='overwrite',
    compression='none',
)

In [8]:
%%sh

# Validation: the write above used compression 'none', so the part file should
# end in plain ".parquet" with no codec name (such as "snappy") in its name.
# NOTE(review): the Python cells derive the user via getpass.getuser(); this
# cell assumes $USER names the same account.
# The path is quoted to avoid word splitting, and ${USER:?...} aborts with a
# clear message instead of listing /user//retail_db/orders when USER is unset.
hdfs dfs -ls "/user/${USER:?USER is not set}/retail_db/orders"

Found 2 items
-rw-r--r--   3 itv736079 supergroup          0 2021-07-11 01:33 /user/itv736079/retail_db/orders/_SUCCESS
-rw-r--r--   3 itv736079 supergroup     495238 2021-07-11 01:33 /user/itv736079/retail_db/orders/part-00000-ec11ef19-8976-451a-b5d5-288dd46c30ba-c000.parquet


In [9]:
# Alternative approach - using option
orders. \
    write. \
    mode('overwrite'). \
    option('compression', 'none'). \
    parquet(f'/user/{username}/retail_db/orders')

___When you pass settings via `option`, a misspelled option name is silently ignored rather than causing a failure. Be careful, and always validate the output to confirm that the option actually took effect.___

In [10]:
%%sh

# Validation: the write above set the compression option to 'none', so the part
# file should end in plain ".parquet" with no codec name (such as "snappy").
# NOTE(review): the Python cells derive the user via getpass.getuser(); this
# cell assumes $USER names the same account.
# The path is quoted to avoid word splitting, and ${USER:?...} aborts with a
# clear message instead of listing /user//retail_db/orders when USER is unset.
hdfs dfs -ls "/user/${USER:?USER is not set}/retail_db/orders"

Found 2 items
-rw-r--r--   3 itv736079 supergroup          0 2021-07-11 01:34 /user/itv736079/retail_db/orders/_SUCCESS
-rw-r--r--   3 itv736079 supergroup     495238 2021-07-11 01:34 /user/itv736079/retail_db/orders/part-00000-641ef73c-b39a-4f36-90d4-a654447f23a8-c000.parquet


In [11]:
# Alternative approach - using format
orders. \
    write. \
    mode('overwrite'). \
    option('compression', 'none'). \
    format('parquet'). \
    save(f'/user/{username}/retail_db/orders')

In [12]:
%%sh

# Validation: the format/save write above set compression to 'none', so the part
# file should end in plain ".parquet" with no codec name (such as "snappy").
# NOTE(review): the Python cells derive the user via getpass.getuser(); this
# cell assumes $USER names the same account.
# The path is quoted to avoid word splitting, and ${USER:?...} aborts with a
# clear message instead of listing /user//retail_db/orders when USER is unset.
hdfs dfs -ls "/user/${USER:?USER is not set}/retail_db/orders"

Found 2 items
-rw-r--r--   3 itv736079 supergroup          0 2021-07-11 01:35 /user/itv736079/retail_db/orders/_SUCCESS
-rw-r--r--   3 itv736079 supergroup     495238 2021-07-11 01:35 /user/itv736079/retail_db/orders/part-00000-8b1e052b-6294-4063-a1f5-0675c2f31b5b-c000.parquet


* Read order_items data from /public/retail_db_json/order_items and write it as pipe-delimited files with gzip compression. Target location: /user/[YOUR_USER_NAME]/retail_db/order_items. Make sure to validate the output.

* If the target location already exists, skip the write instead of failing with an error (save mode `ignore`). Also make sure to write the output into only one file; we can use `coalesce` for that.

In [13]:
# Load order_items from the JSON copy of the retail dataset; no schema is
# supplied to the reader here.
order_items = spark.read.json('/public/retail_db_json/order_items')

In [14]:
# Preview the first five order_items rows.
order_items.show(n=5)

+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_product_price|order_item_quantity|order_item_subtotal|
+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|            1|                  1|                  957|                  299.98|                  1|             299.98|
|            2|                  2|                 1073|                  199.99|                  1|             199.99|
|            3|                  2|                  502|                    50.0|                  5|              250.0|
|            4|                  2|                  403|                  129.99|                  1|             129.99|
|            5|                  4|                  897|                   24.99|                  2|              49.98|
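+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
only showing top 5 rows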

In [15]:
# Row count of the source order_items data; a baseline to compare against when
# validating the files written below.
order_items.count()

172198

In [16]:
# Using format
order_items. \
    coalesce(1). \
    write. \
    mode('ignore'). \
    option('compression', 'gzip'). \
    option('sep', '|'). \
    format('csv'). \
    save(f'/user/{username}/retail_db/order_items')

In [17]:
%%sh

# Validation: the write above used coalesce(1) with gzip compression, so expect
# a single part file whose name ends in ".csv.gz".
# NOTE(review): the Python cells derive the user via getpass.getuser(); this
# cell assumes $USER names the same account.
# The path is quoted to avoid word splitting, and ${USER:?...} aborts with a
# clear message instead of listing /user//retail_db/order_items when USER is unset.
hdfs dfs -ls "/user/${USER:?USER is not set}/retail_db/order_items"

Found 2 items
-rw-r--r--   3 itv736079 supergroup          0 2021-07-11 01:38 /user/itv736079/retail_db/order_items/_SUCCESS
-rw-r--r--   3 itv736079 supergroup    1032820 2021-07-11 01:38 /user/itv736079/retail_db/order_items/part-00000-3e25aefd-d353-4fbe-9195-1e48d601e6c3-c000.csv.gz


In [18]:
# Alternative approach - using keyword arguments
order_items. \
    coalesce(1). \
    write. \
    csv(f'/user/{username}/retail_db/order_items',
        sep='|',
        mode='overwrite',
        compression='gzip'
       )

In [19]:
%%sh

# Validation: the overwrite above used coalesce(1) with gzip compression, so
# expect a single, freshly written part file whose name ends in ".csv.gz".
# NOTE(review): the Python cells derive the user via getpass.getuser(); this
# cell assumes $USER names the same account.
# The path is quoted to avoid word splitting, and ${USER:?...} aborts with a
# clear message instead of listing /user//retail_db/order_items when USER is unset.
hdfs dfs -ls "/user/${USER:?USER is not set}/retail_db/order_items"

Found 2 items
-rw-r--r--   3 itv736079 supergroup          0 2021-07-11 01:38 /user/itv736079/retail_db/order_items/_SUCCESS
-rw-r--r--   3 itv736079 supergroup    1032820 2021-07-11 01:38 /user/itv736079/retail_db/order_items/part-00000-54224c69-3b7e-4964-9710-3deeb474d2d6-c000.csv.gz
