In [1]:
from pyspark.sql import SparkSession

import getpass
# The OS user name is used for both the warehouse directory and the app name.
username = getpass.getuser()

# Create (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .master('yarn')
    .appName(f'{username} | Python - Joining Data Sets')
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
from pyspark.sql.functions import *

In [4]:
!hdfs dfs -ls /public/airtraffic_all

Found 3 items
drwxr-xr-x   - hdfs supergroup          0 2021-03-02 19:48 /public/airtraffic_all/airport-codes
drwxr-xr-x   - hdfs supergroup          0 2021-03-02 19:42 /public/airtraffic_all/airtraffic
drwxr-xr-x   - hdfs supergroup          0 2021-03-02 19:48 /public/airtraffic_all/airtraffic-part


In [5]:
# HDFS location of the airport codes data set (one of the directories listed above).
airportCodesPath = "/public/airtraffic_all/airport-codes"

In [6]:
# Airport codes are stored as tab-delimited text with a header row;
# column types are inferred while reading.
airportCodes = spark.read.csv(
    airportCodesPath,
    sep="\t",
    header=True,
    inferSchema=True,
)

In [7]:
# Check the column names and the types Spark inferred for the airport codes.
airportCodes.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- IATA: string (nullable = true)



In [8]:
# Preview the first 10 airport code records.
airportCodes.show(10)

+-----------+-----+-------+----+
|       City|State|Country|IATA|
+-----------+-----+-------+----+
| Abbotsford|   BC| Canada| YXX|
|   Aberdeen|   SD|    USA| ABR|
|    Abilene|   TX|    USA| ABI|
|      Akron|   OH|    USA| CAK|
|    Alamosa|   CO|    USA| ALS|
|     Albany|   GA|    USA| ABY|
|     Albany|   NY|    USA| ALB|
|Albuquerque|   NM|    USA| ABQ|
| Alexandria|   LA|    USA| AEX|
|  Allentown|   PA|    USA| ABE|
+-----------+-----+-------+----+
only showing top 10 rows



In [9]:
# Total number of rows; compared against the distinct IATA count in the next step.
airportCodes.count()

526

In [18]:
# Count the distinct IATA codes. A result lower than the total row count
# means at least one code appears on more than one row.
(
    airportCodes
    .select('IATA')
    .distinct()
    .count()
)

524

In [20]:
# Identify the IATA codes that occur on more than one row, together with
# how many times each occurs. Nothing is removed from airportCodes here.
duplicateRecords = (
    airportCodes
    .groupBy('IATA')
    .agg(count(lit(1)).alias("countOfDuplicates"))
    .where("countOfDuplicates > 1")
)

In [21]:
# Display the repeated IATA codes and their occurrence counts.
duplicateRecords.show()

+----+-----------------+
|IATA|countOfDuplicates|
+----+-----------------+
| Big|                3|
+----+-----------------+



In [22]:
# Inspect every row that carries the repeated 'Big' code.
airportCodes.where("IATA = 'Big'").show()

+-----------+------+-------+----+
|       City| State|Country|IATA|
+-----------+------+-------+----+
|       Hilo|    HI|    USA| Big|
|Kailua-Kona|Hawaii|    USA| Big|
|    Kamuela|Hawaii|    USA| Big|
+-----------+------+-------+----+



In [25]:
# Preview the data with the rows where State is 'Hawaii' and IATA is 'Big'
# excluded; the 'HI' / 'Big' row is not matched by this predicate and stays.
airportCodes.where("!(State = 'Hawaii' AND IATA = 'Big')").show()

+-----------+-----+-------+----+
|       City|State|Country|IATA|
+-----------+-----+-------+----+
| Abbotsford|   BC| Canada| YXX|
|   Aberdeen|   SD|    USA| ABR|
|    Abilene|   TX|    USA| ABI|
|      Akron|   OH|    USA| CAK|
|    Alamosa|   CO|    USA| ALS|
|     Albany|   GA|    USA| ABY|
|     Albany|   NY|    USA| ALB|
|Albuquerque|   NM|    USA| ABQ|
| Alexandria|   LA|    USA| AEX|
|  Allentown|   PA|    USA| ABE|
|   Alliance|   NE|    USA| AIA|
|     Alpena|   MI|    USA| APN|
|    Altoona|   PA|    USA| AOO|
|   Amarillo|   TX|    USA| AMA|
|Anahim Lake|   BC| Canada| YAA|
|  Anchorage|   AK|    USA| ANC|
|   Appleton|   WI|    USA| ATW|
|     Arviat|  NWT| Canada| YEK|
|  Asheville|   NC|    USA| AVL|
|      Aspen|   CO|    USA| ASE|
+-----------+-----+-------+----+
only showing top 20 rows



##### Get the number of airports (IATA codes) for each state in the US, sorted in descending order by count.

In [35]:
# Number of airports per US state, largest count first.
# Rows where State is 'Hawaii' and IATA is 'Big' are excluded before counting.
airportCountByState = (
    airportCodes
    .where("Country = 'USA'")
    .where("!(State = 'Hawaii' AND IATA = 'Big')")
    .groupBy('Country', 'State')
    .agg(count(lit(1)).alias("IATACount"))
    .orderBy(col("IATACount").desc())
)

In [36]:
# Show the 10 states with the most airports.
airportCountByState.show(10)

+-------+-----+---------+
|Country|State|IATACount|
+-------+-----+---------+
|    USA|   CA|       29|
|    USA|   TX|       26|
|    USA|   AK|       25|
|    USA|   NY|       18|
|    USA|   MI|       18|
|    USA|   FL|       18|
|    USA|   MT|       14|
|    USA|   PA|       13|
|    USA|   CO|       12|
|    USA|   IL|       12|
+-------+-----+---------+
only showing top 10 rows



In [15]:
# Load only the flightmonth=200801 directory of the partitioned air traffic data.
airtraffic = spark.read.parquet(
    "/public/airtraffic_all/airtraffic-part/flightmonth=200801"
)

In [16]:
# Inspect the column names and types of the air traffic data.
airtraffic.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Car

In [10]:
# Load the retail orders data set from its JSON files.
orders = spark.read.format('json').load('/public/retail_db_json/orders')

In [11]:
# Inspect the column names and types of orders.
orders.printSchema()

root
 |-- order_customer_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [13]:
# Preview 10 orders; truncate=False keeps long values such as order_date fully visible.
orders.show(10, truncate=False)

+-----------------+---------------------+--------+---------------+
|order_customer_id|order_date           |order_id|order_status   |
+-----------------+---------------------+--------+---------------+
|11599            |2013-07-25 00:00:00.0|1       |CLOSED         |
|256              |2013-07-25 00:00:00.0|2       |PENDING_PAYMENT|
|12111            |2013-07-25 00:00:00.0|3       |COMPLETE       |
|8827             |2013-07-25 00:00:00.0|4       |CLOSED         |
|11318            |2013-07-25 00:00:00.0|5       |COMPLETE       |
|7130             |2013-07-25 00:00:00.0|6       |COMPLETE       |
|4530             |2013-07-25 00:00:00.0|7       |COMPLETE       |
|2911             |2013-07-25 00:00:00.0|8       |PROCESSING     |
|5657             |2013-07-25 00:00:00.0|9       |PENDING_PAYMENT|
|5648             |2013-07-25 00:00:00.0|10      |PENDING_PAYMENT|
+-----------------+---------------------+--------+---------------+
only showing top 10 rows



In [14]:
# Total number of order records.
orders.count()

68883

In [37]:
# Load the retail order items data set from its JSON files.
order_items = spark.read.format('json').load('/public/retail_db_json/order_items')

In [38]:
# Inspect the column names and types of order_items.
order_items.printSchema()

root
 |-- order_item_id: long (nullable = true)
 |-- order_item_order_id: long (nullable = true)
 |-- order_item_product_id: long (nullable = true)
 |-- order_item_product_price: double (nullable = true)
 |-- order_item_quantity: long (nullable = true)
 |-- order_item_subtotal: double (nullable = true)



In [39]:
# Use 2 shuffle partitions instead of the default 200 for the joins that follow.

spark.conf.set("spark.sql.shuffle.partitions", "2")

##### Inner join

In [40]:
# Inner join: pair each order with its order items; only rows with a match
# on both sides are kept.
joinType = "inner"
joinCondition = orders['order_id'] == order_items['order_item_order_id']

joinedOrders = orders.join(order_items, on=joinCondition, how=joinType)

In [41]:
# Preview 5 joined rows; an order is repeated once per matching order item.
joinedOrders.show(5)

+-----------------+--------------------+--------+---------------+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|order_customer_id|          order_date|order_id|   order_status|order_item_id|order_item_order_id|order_item_product_id|order_item_product_price|order_item_quantity|order_item_subtotal|
+-----------------+--------------------+--------+---------------+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|            11599|2013-07-25 00:00:...|       1|         CLOSED|            1|                  1|                  957|                  299.98|                  1|             299.98|
|              256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|            2|                  2|                 1073|                  199.99|                  1|             199.99|
|              256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|

##### Project all the fields from orders, plus order_item_subtotal from order_items.

In [43]:
# Inner join orders to order_items, then keep every orders column plus the
# item subtotal, and preview 10 rows.
(
    orders
    .join(order_items, orders.order_id == order_items.order_item_order_id, 'inner')
    .select(orders['*'], order_items['order_item_subtotal'])
    .show(10)
)

+-----------------+--------------------+--------+---------------+-------------------+
|order_customer_id|          order_date|order_id|   order_status|order_item_subtotal|
+-----------------+--------------------+--------+---------------+-------------------+
|            11599|2013-07-25 00:00:...|       1|         CLOSED|             299.98|
|              256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|             199.99|
|              256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|              250.0|
|              256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|             129.99|
|             8827|2013-07-25 00:00:...|       4|         CLOSED|              49.98|
|             8827|2013-07-25 00:00:...|       4|         CLOSED|             299.95|
|             8827|2013-07-25 00:00:...|       4|         CLOSED|              150.0|
|             8827|2013-07-25 00:00:...|       4|         CLOSED|             199.92|
|            11318|2013-07-25 00:00:...|       5|     