In [1]:
import os
import sys
from pyspark.sql import SparkSession

os.environ["HADOOP_HOME"] = "C:\\Users\\SkJain\\Downloads\\Compressed\\winutils-master\\hadoop-3.2.2"
os.environ["PYSPARK_PYTHON"] = "python"
sys.path.append('C:\\Users\\SkJain\\Downloads\\Compressed\\winutils-master\\hadoop-3.2.2\\bin')

In [2]:
spark = SparkSession. \
    builder. \
    config("spark.ui.port", "0"). \
    enableHiveSupport(). \
    appName('Pyspark - Part 3'). \
    master('local'). \
    getOrCreate()

In [3]:
from pyspark.sql.functions import *

In [4]:
airtrafic_data_path = 'C:/Users/SkJain/Downloads/Compressed/data-master/airtraffic_all'

In [8]:
airtraffic_df = spark.read.parquet(airtrafic_data_path, headers=True, inferSchema=True)

In [9]:
airtraffic_df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Car

In [14]:
airtraffic_df.select('year', 'month', 'dayOfMonth').distinct().orderBy('dayOfMonth').show(40)

+----+-----+----------+
|year|month|dayOfMonth|
+----+-----+----------+
|2008|    1|         1|
|2008|    1|         2|
|2008|    1|         3|
|2008|    1|         4|
|2008|    1|         5|
|2008|    1|         6|
|2008|    1|         7|
|2008|    1|         8|
|2008|    1|         9|
|2008|    1|        10|
|2008|    1|        11|
|2008|    1|        12|
|2008|    1|        13|
|2008|    1|        14|
|2008|    1|        15|
|2008|    1|        16|
|2008|    1|        17|
|2008|    1|        18|
|2008|    1|        19|
|2008|    1|        20|
|2008|    1|        21|
|2008|    1|        22|
|2008|    1|        23|
|2008|    1|        24|
|2008|    1|        25|
|2008|    1|        26|
|2008|    1|        27|
|2008|    1|        28|
|2008|    1|        29|
|2008|    1|        30|
|2008|    1|        31|
+----+-----+----------+



In [13]:
airtraffic_df.count()

605659

## Basic Filtering of data
- we can use either filter or where. Both are synonymous and have no difference
- we can pass condition sql style or dataframe style

### Get count of cancelled flights

In [18]:
airtraffic_df.select('Cancelled').distinct().show(5)

+---------+
|Cancelled|
+---------+
|        1|
|        0|
+---------+



In [20]:
# df style 1
airtraffic_df.filter(airtraffic_df['Cancelled'] == 1).count()

17293

In [21]:
# df style 2
airtraffic_df.filter(airtraffic_df.Cancelled == 1).count()

17293

In [22]:
# sql style
airtraffic_df.filter('Cancelled == 1').count()

17293

### Number of flghts scheduled for departure from SFO airport

In [30]:
airtraffic_df.filter(airtraffic_df['Origin'] == 'SFO').count()

11573

### number of flights that are departed from airport without any delay

In [32]:
airtraffic_df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Car

In [45]:
airtraffic_df.filter("IsDepDelayed = 'NO'").count()

340461

## filtering using dates

### get number of flights departed late on sundays

In [59]:
airtraffic_df. \
select('year', 'month', 'dayOfMonth', 'isDepDelayed'). \
withColumn('fullDate', to_date(concat_ws('-', 'year', 'month', 'dayOfMonth'))). \
filter((date_format('fullDate', 'EEEE') == 'Sunday') & (col('isDepDelayed') == 'YES')). \
count()

34708

## Boolean operators
- each condition should be enclosed in circular brackets

### get count of flights which departed late at origin but reached early at destination

In [60]:
airtraffic_df. \
select('IsDepDelayed', 'IsArrDelayed', 'Cancelled'). \
distinct(). \
show()

+------------+------------+---------+
|IsDepDelayed|IsArrDelayed|Cancelled|
+------------+------------+---------+
|          NO|          NO|        0|
|         YES|         YES|        1|
|          NO|         YES|        0|
|         YES|          NO|        0|
|         YES|         YES|        0|
+------------+------------+---------+



In [62]:
airtraffic_df. \
filter((airtraffic_df['IsDepDelayed'] == 'YES') & 
       (airtraffic_df['IsArrDelayed']=='NO') & 
       (airtraffic_df['Cancelled']==0)). \
count()

54233

### get count of flights which departed early or on time but arrived atleast 15 mins late

In [64]:
airtraffic_df.filter("IsDepDelayed = 'NO' AND ArrDelay>=15").count()

20705

### get number of flights which departed late on sundays as well as saturdays

In [68]:
airtraffic_df. \
select('year', 'month', 'dayOfMonth', 'isDepDelayed'). \
withColumn('fullDate', to_date(concat_ws('-', 'year', 'month', 'dayOfMonth'))). \
filter(
    ((date_format('fullDate', 'EEEE') == 'Sunday') |  
    (date_format('fullDate', 'EEEE') == 'Saturday'))
    & 
    (col('isDepDelayed') == 'YES')
    &
    (col('Cancelled') == 0)
). \
count()

57873

## IN operator

In [69]:
airtraffic_df. \
select('year', 'month', 'dayOfMonth', 'isDepDelayed'). \
withColumn('fullDate', to_date(concat_ws('-', 'year', 'month', 'dayOfMonth'))). \
filter(
    (date_format('fullDate', 'EEEE').isin('Sunday', 'Saturday'))
    & 
    (col('isDepDelayed') == 'YES')
    &
    (col('Cancelled') == 0)
). \
count()

57873

## LIKE operator

In [70]:
employees_with_bonus = [(1, "Scott", "Tiger", 1000.0, 10,
                      "united states", "+1 123 456 7890", "123 45 6789"
                     ),
                     (2, "Henry", "Ford", 1250.0, None,
                      "India", "+91 234 567 8901", "456 78 9123"
                     ),
                     (3, "Nick", "Junior", 750.0, '',
                      "united KINGDOM", "+44 111 111 1111", "222 33 4444"
                     ),
                     (4, "Bill", "Gomes", 1500.0, 10,
                      "AUSTRALIA", "+61 987 654 3210", "789 12 6118"
                     )
            ]

emp_bonus_df = spark.createDataFrame(employees_with_bonus, schema = """ 
    emp_id INT,f_name STRING,l_name STRING,sal FLOAT,
    bonus STRING, country STRING,ph_num STRING,ssn STRING
    """)

In [71]:
emp_bonus_df.show()

+------+------+------+------+-----+--------------+----------------+-----------+
|emp_id|f_name|l_name|   sal|bonus|       country|          ph_num|        ssn|
+------+------+------+------+-----+--------------+----------------+-----------+
|     1| Scott| Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|     2| Henry|  Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|     3|  Nick|Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|     4|  Bill| Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+------+------+------+------+-----+--------------+----------------+-----------+



In [72]:
emp_bonus_df.filter(col('f_name').like('Sco%')).show()

+------+------+------+------+-----+-------------+---------------+-----------+
|emp_id|f_name|l_name|   sal|bonus|      country|         ph_num|        ssn|
+------+------+------+------+-----+-------------+---------------+-----------+
|     1| Scott| Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+------+------+------+------+-----+-------------+---------------+-----------+



In [73]:
# case insensitive
emp_bonus_df.filter(upper(col('f_name')).like('SCO%')).show()

+------+------+------+------+-----+-------------+---------------+-----------+
|emp_id|f_name|l_name|   sal|bonus|      country|         ph_num|        ssn|
+------+------+------+------+-----+-------------+---------------+-----------+
|     1| Scott| Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+------+------+------+------+-----+-------------+---------------+-----------+



In [74]:
# name contains ott
emp_bonus_df.filter(lower(col('f_name')).like('%ott%')).show()

+------+------+------+------+-----+-------------+---------------+-----------+
|emp_id|f_name|l_name|   sal|bonus|      country|         ph_num|        ssn|
+------+------+------+------+-----+-------------+---------------+-----------+
|     1| Scott| Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+------+------+------+------+-----+-------------+---------------+-----------+



In [76]:
# phone number not starting with +44
# ~ is the not operator
emp_bonus_df.filter(~col('f_name').like('+44%')).show()

+------+------+------+------+-----+--------------+----------------+-----------+
|emp_id|f_name|l_name|   sal|bonus|       country|          ph_num|        ssn|
+------+------+------+------+-----+--------------+----------------+-----------+
|     1| Scott| Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|     2| Henry|  Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|     3|  Nick|Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|     4|  Bill| Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+------+------+------+------+-----+--------------+----------------+-----------+



## BETWEEN operator

### count of flights that departed late b/w 1st and 9th Jan 2008

In [82]:
airtraffic_df. \
select('year', 'month', 'dayOfMonth', 'isDepDelayed'). \
withColumn('fullDate', to_date(concat_ws('-', 'year', 'month', 'dayOfMonth'))). \
filter(
    (date_format('fullDate', 'd').between(1, 9)) &
    (col('isDepDelayed') == 'YES') &
    (col('Cancelled') == 0)
). \
count()

86180

In [85]:
emp_bonus_df.filter(~col('bonus').isNull()).show()

+------+------+------+------+-----+--------------+----------------+-----------+
|emp_id|f_name|l_name|   sal|bonus|       country|          ph_num|        ssn|
+------+------+------+------+-----+--------------+----------------+-----------+
|     1| Scott| Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|     3|  Nick|Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|     4|  Bill| Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+------+------+------+------+-----+--------------+----------------+-----------+



In [87]:
emp_bonus_df.filter(col('bonus').isNotNull()).show()

+------+------+------+------+-----+--------------+----------------+-----------+
|emp_id|f_name|l_name|   sal|bonus|       country|          ph_num|        ssn|
+------+------+------+------+-----+--------------+----------------+-----------+
|     1| Scott| Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|     3|  Nick|Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|     4|  Bill| Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+------+------+------+------+-----+--------------+----------------+-----------+



## Aggregations

In [88]:
#count
airtraffic_df.select(count('*')).show()

+--------+
|count(1)|
+--------+
|  605659|
+--------+



In [89]:
#describe
airtraffic_df. \
select('year', 'month', 'dayOfMonth', 'isDepDelayed'). \
describe(). \
show()

+-------+------+------+------------------+------------+
|summary|  year| month|        dayOfMonth|isDepDelayed|
+-------+------+------+------------------+------------+
|  count|605659|605659|            605659|      605659|
|   mean|2008.0|   1.0|15.908469947610785|        null|
| stddev|   0.0|   0.0| 8.994294747375292|        null|
|    min|  2008|     1|                 1|          NO|
|    max|  2008|     1|                31|         YES|
+-------+------+------+------------------+------------+



In [90]:
#summary
airtraffic_df. \
select('year', 'month', 'dayOfMonth', 'isDepDelayed'). \
summary(). \
show()

+-------+------+------+------------------+------------+
|summary|  year| month|        dayOfMonth|isDepDelayed|
+-------+------+------+------------------+------------+
|  count|605659|605659|            605659|      605659|
|   mean|2008.0|   1.0|15.908469947610785|        null|
| stddev|   0.0|   0.0| 8.994294747375292|        null|
|    min|  2008|     1|                 1|          NO|
|    25%|  2008|     1|                 8|        null|
|    50%|  2008|     1|                16|        null|
|    75%|  2008|     1|                24|        null|
|    max|  2008|     1|                31|         YES|
+-------+------+------+------------------+------------+



In [91]:
#count distinct
airtraffic_df. \
select(countDistinct('year', 'month', 'dayOfMonth').alias('dates')). \
show()

+-----+
|dates|
+-----+
|   31|
+-----+



In [97]:
# sum of all bonuses

emp_bonus_df. \
select(sum(coalesce(col('bonus').cast('int'), lit(0)))). \
show()

+------------------------------------+
|sum(coalesce(CAST(bonus AS INT), 0))|
+------------------------------------+
|                                  20|
+------------------------------------+



## Aggregations
- groupby
- rollup
- cube

## Group by

### number of flights scheduled each day

In [104]:
airtraffic_df. \
withColumn('FlightDate', to_date(concat_ws('-', 'year', lpad('month', 2, '0'), lpad('dayOfMonth', 2, '0')))). \
groupBy('FlightDate'). \
agg(count('*').alias('numOfFlights')). \
orderBy(col('numOfFlights').desc()).\
show(32, False)

+----------+------------+
|recordDate|numOfFlights|
+----------+------------+
|2008-01-02|20953       |
|2008-01-03|20937       |
|2008-01-04|20929       |
|2008-01-11|20349       |
|2008-01-18|20347       |
|2008-01-07|20341       |
|2008-01-25|20313       |
|2008-01-10|20297       |
|2008-01-17|20273       |
|2008-01-31|20260       |
|2008-01-24|20257       |
|2008-01-14|20176       |
|2008-01-28|20147       |
|2008-01-21|20133       |
|2008-01-06|19893       |
|2008-01-09|19820       |
|2008-01-23|19769       |
|2008-01-30|19766       |
|2008-01-16|19764       |
|2008-01-08|19603       |
|2008-01-22|19504       |
|2008-01-15|19503       |
|2008-01-29|19485       |
|2008-01-01|19175       |
|2008-01-13|18946       |
|2008-01-27|18903       |
|2008-01-20|18653       |
|2008-01-05|18066       |
|2008-01-12|16572       |
|2008-01-26|16276       |
|2008-01-19|16249       |
+----------+------------+



### count of flights departed, total departure delay and average departure delay for each day

In [109]:
airtraffic_df. \
filter(col('Cancelled') == 0). \
withColumn('FlightDate', to_date(concat_ws('-', 'year', lpad('month', 2, '0'), lpad('dayOfMonth', 2, '0')))). \
groupBy('FlightDate'). \
agg(
    count('*').alias("flightsDeparted"),
    sum('DepDelay').alias("totalDepartureDelay"),
    round(avg('DepDelay'),2).alias("avgDepartureDelay")
).\
show(32)

+----------+---------------+-------------------+-----------------+
|FlightDate|flightsDeparted|totalDepartureDelay|avgDepartureDelay|
+----------+---------------+-------------------+-----------------+
|2008-01-20|          18406|           117460.0|             6.38|
|2008-01-25|          19787|           229850.0|            11.62|
|2008-01-26|          15860|            92129.0|             5.81|
|2008-01-17|          19401|           341271.0|            17.59|
|2008-01-01|          18623|           354108.0|            19.01|
|2008-01-09|          19443|            89595.0|             4.61|
|2008-01-02|          20442|           452979.0|            22.16|
|2008-01-14|          19267|            98261.0|              5.1|
|2008-01-30|          19072|           129345.0|             6.78|
|2008-01-24|          19935|           158134.0|             7.93|
|2008-01-06|          19210|           323214.0|            16.83|
|2008-01-12|          16346|            24876.0|             1

## Rollup

In [110]:
orders_df = spark.read.json('C:/Users/SkJain/Downloads/Compressed/data-master/retail_db_json/orders')

In [113]:
orders_df.show(3, False)

+-----------------+---------------------+--------+---------------+
|order_customer_id|order_date           |order_id|order_status   |
+-----------------+---------------------+--------+---------------+
|11599            |2013-07-25 00:00:00.0|1       |CLOSED         |
|256              |2013-07-25 00:00:00.0|2       |PENDING_PAYMENT|
|12111            |2013-07-25 00:00:00.0|3       |COMPLETE       |
+-----------------+---------------------+--------+---------------+
only showing top 3 rows



In [121]:
orders_df .\
groupby('order_date'). \
agg(count('*').alias('order_count')). \
orderBy(col('order_count').desc()). \
show()

+--------------------+-----------+
|          order_date|order_count|
+--------------------+-----------+
|2013-11-03 00:00:...|        347|
|2013-11-24 00:00:...|        292|
|2013-11-14 00:00:...|        287|
|2013-10-04 00:00:...|        287|
|2013-12-26 00:00:...|        286|
|2014-07-20 00:00:...|        285|
|2014-01-11 00:00:...|        281|
|2013-11-05 00:00:...|        278|
|2014-02-01 00:00:...|        278|
|2013-09-25 00:00:...|        277|
|2013-10-13 00:00:...|        277|
|2013-09-14 00:00:...|        276|
|2013-09-06 00:00:...|        276|
|2014-06-19 00:00:...|        276|
|2014-07-15 00:00:...|        274|
|2013-08-10 00:00:...|        270|
|2014-06-09 00:00:...|        269|
|2013-07-26 00:00:...|        269|
|2014-02-19 00:00:...|        268|
|2014-01-05 00:00:...|        266|
+--------------------+-----------+
only showing top 20 rows



In [120]:
orders_df .\
rollup('order_date'). \
agg(count('*').alias('order_count')). \
orderBy(col('order_count').desc()). \
show()

+--------------------+-----------+
|          order_date|order_count|
+--------------------+-----------+
|                null|      68883|
|2013-11-03 00:00:...|        347|
|2013-11-24 00:00:...|        292|
|2013-10-04 00:00:...|        287|
|2013-11-14 00:00:...|        287|
|2013-12-26 00:00:...|        286|
|2014-07-20 00:00:...|        285|
|2014-01-11 00:00:...|        281|
|2014-02-01 00:00:...|        278|
|2013-11-05 00:00:...|        278|
|2013-09-25 00:00:...|        277|
|2013-10-13 00:00:...|        277|
|2013-09-06 00:00:...|        276|
|2014-06-19 00:00:...|        276|
|2013-09-14 00:00:...|        276|
|2014-07-15 00:00:...|        274|
|2013-08-10 00:00:...|        270|
|2013-07-26 00:00:...|        269|
|2014-06-09 00:00:...|        269|
|2014-02-19 00:00:...|        268|
+--------------------+-----------+
only showing top 20 rows



- roll up give 1 additional record where all the days are also added together
- will be more clear with eg of multiple columns

In [122]:
orders_df .\
groupby(date_format('order_date', 'yyyyMM'), 'order_date'). \
agg(count('*').alias('order_count')). \
orderBy(col('order_count').desc()). \
show()

+-------------------------------+--------------------+-----------+
|date_format(order_date, yyyyMM)|          order_date|order_count|
+-------------------------------+--------------------+-----------+
|                         201311|2013-11-03 00:00:...|        347|
|                         201311|2013-11-24 00:00:...|        292|
|                         201311|2013-11-14 00:00:...|        287|
|                         201310|2013-10-04 00:00:...|        287|
|                         201312|2013-12-26 00:00:...|        286|
|                         201407|2014-07-20 00:00:...|        285|
|                         201401|2014-01-11 00:00:...|        281|
|                         201402|2014-02-01 00:00:...|        278|
|                         201311|2013-11-05 00:00:...|        278|
|                         201309|2013-09-25 00:00:...|        277|
|                         201310|2013-10-13 00:00:...|        277|
|                         201309|2013-09-14 00:00:...|        

In [123]:
orders_df .\
rollup(date_format('order_date', 'yyyyMM'), 'order_date'). \
agg(count('*').alias('order_count')). \
orderBy(col('order_count').desc()). \
show()

+-------------------------------+--------------------+-----------+
|date_format(order_date, yyyyMM)|          order_date|order_count|
+-------------------------------+--------------------+-----------+
|                           null|                null|      68883|
|                         201311|                null|       6381|
|                         201401|                null|       5908|
|                         201312|                null|       5892|
|                         201309|                null|       5841|
|                         201403|                null|       5778|
|                         201308|                null|       5680|
|                         201404|                null|       5657|
|                         201402|                null|       5635|
|                         201405|                null|       5467|
|                         201310|                null|       5335|
|                         201406|                null|       5

- in above example
    - there's a grouping till day level, then month level, then entire dataset level

In [124]:
orders_df .\
rollup(year('order_date'), date_format('order_date', 'yyyyMM'), 'order_date'). \
agg(count('*').alias('order_count')). \
orderBy(col('order_count').desc()). \
show()

+----------------+-------------------------------+--------------------+-----------+
|year(order_date)|date_format(order_date, yyyyMM)|          order_date|order_count|
+----------------+-------------------------------+--------------------+-----------+
|            null|                           null|                null|      68883|
|            2014|                           null|                null|      38221|
|            2013|                           null|                null|      30662|
|            2013|                         201311|                null|       6381|
|            2014|                         201401|                null|       5908|
|            2013|                         201312|                null|       5892|
|            2013|                         201309|                null|       5841|
|            2014|                         201403|                null|       5778|
|            2013|                         201308|                null|     

## Cube
- used mostly with BI tools
- if we give three columns then it will give sum for all combinations
    - A null, B and C not null
    - A and B not null, C null (in rollup also)
    - A and C not null, B null
    - A and B null, C not null,
    - A and C null, B not null,
    - B and C null, A not null (in rollup also)
    - A, B and C null (in rollup also)
    - A, B, C not null (in rollup also)
 
- generates double the number of records of rollup for 2 columns
- in rollup it will find for column A also, considering column B (mentioned second in rollup statement) as null
- in cube it will do the above, but also give for column b considering column A as null
- that means if say B is present in A with value C and D then it will add all those records and give the count

## Sorting the data
- we can use orderBy or sort (both are same)

In [126]:
airtraffic_df. \
filter(col('Cancelled') == 0). \
withColumn('FlightDate', to_date(concat_ws('-', 'year', lpad('month', 2, '0'), lpad('dayOfMonth', 2, '0')))). \
groupBy('FlightDate'). \
agg(count('*').alias('flightCount')). \
orderBy(col('flightCount').desc(), col('FlightDate')). \
show()

+----------+-----------+
|FlightDate|flightCount|
+----------+-----------+
|2008-01-03|      20462|
|2008-01-02|      20442|
|2008-01-04|      20160|
|2008-01-18|      20117|
|2008-01-10|      19956|
|2008-01-24|      19935|
|2008-01-11|      19825|
|2008-01-25|      19787|
|2008-01-07|      19762|
|2008-01-21|      19658|
|2008-01-28|      19493|
|2008-01-09|      19443|
|2008-01-17|      19401|
|2008-01-14|      19267|
|2008-01-23|      19239|
|2008-01-16|      19232|
|2008-01-06|      19210|
|2008-01-15|      19204|
|2008-01-31|      19179|
|2008-01-08|      19140|
+----------+-----------+
only showing top 20 rows



In [127]:
emp_bonus_df.show()

+------+------+------+------+-----+--------------+----------------+-----------+
|emp_id|f_name|l_name|   sal|bonus|       country|          ph_num|        ssn|
+------+------+------+------+-----+--------------+----------------+-----------+
|     1| Scott| Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|     2| Henry|  Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|     3|  Nick|Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|     4|  Bill| Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+------+------+------+------+-----+--------------+----------------+-----------+



sort the above data by country but **US should always be on top**

In [129]:
emp_bonus_df. \
withColumn('sortColumn', when((lower(col('country'))=='united states'), 0).otherwise(1)). \
sort('sortColumn', 'country') .\
show()

+------+------+------+------+-----+--------------+----------------+-----------+----------+
|emp_id|f_name|l_name|   sal|bonus|       country|          ph_num|        ssn|sortColumn|
+------+------+------+------+-----+--------------+----------------+-----------+----------+
|     1| Scott| Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|         0|
|     4|  Bill| Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|         1|
|     2| Henry|  Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|         1|
|     3|  Nick|Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|         1|
+------+------+------+------+-----+--------------+----------------+-----------+----------+



In [132]:
employees_with_bonus = [(1, "Scott", "Tiger", 1000.0, 10,
                      "united states", "+1 123 456 7890", "123 45 6789"
                     ),
                     (2, "Henry", "Ford", 1250.0, None,
                      "India", "+91 234 567 8901", "456 78 9123"
                     ),
                     (3, "Nick", "Junior", 750.0, '',
                      "united KINGDOM", "+44 111 111 1111", "222 33 4444"
                     ),
                     (4, "Bill", "Gomes", 1500.0, 2,
                      "AUSTRALIA", "+61 987 654 3210", "789 12 6118"
                     )
            ]

emp_bonus_df = spark.createDataFrame(employees_with_bonus, schema = """ 
    emp_id INT,f_name STRING,l_name STRING,sal FLOAT,
    bonus STRING, country STRING,ph_num STRING,ssn STRING
    """)

In [133]:
# sort by bonus
# null and emptuy string always come at top
# 10 came before 2 because it's sorting as string

emp_bonus_df. \
sort(col('bonus')). \
show()

+------+------+------+------+-----+--------------+----------------+-----------+
|emp_id|f_name|l_name|   sal|bonus|       country|          ph_num|        ssn|
+------+------+------+------+-----+--------------+----------------+-----------+
|     2| Henry|  Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|     3|  Nick|Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|     1| Scott| Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|     4|  Bill| Gomes|1500.0|    2|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+------+------+------+------+-----+--------------+----------------+-----------+



In [136]:
# to get nulls at last

emp_bonus_df. \
sort(col('bonus').cast('int').asc_nulls_last()). \
show()

+------+------+------+------+-----+--------------+----------------+-----------+
|emp_id|f_name|l_name|   sal|bonus|       country|          ph_num|        ssn|
+------+------+------+------+-----+--------------+----------------+-----------+
|     4|  Bill| Gomes|1500.0|    2|     AUSTRALIA|+61 987 654 3210|789 12 6118|
|     1| Scott| Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|     2| Henry|  Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|     3|  Nick|Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
+------+------+------+------+-----+--------------+----------------+-----------+



## Problem 1: Total number of flights which are delayed in departure and num of flights delayed in arrival
- o/p column: FlightCount, DepDelayedCount, ArrDelayedCount

In [157]:
airtraffic_df. \
filter((col('Cancelled') == 0)).\
withColumn(
    "depDelayedBoolean",
    when(((upper(col('IsDepDelayed')) == 'YES') & (col('Cancelled') == 0)), 1).otherwise(0)
). \
withColumn(
    "arrDelayedBoolean",
    when(((upper(col('IsArrDelayed')) == 'YES') & (col('Cancelled') == 0)), 1).otherwise(0)
). \
agg(
    count('*').alias('FlightCount'), 
    sum('depDelayedBoolean').alias('DepDelayedCount'), 
    sum('arrDelayedBoolean').alias('ArrDelayedCount')
). \
show()

+-----------+---------------+---------------+
|FlightCount|DepDelayedCount|ArrDelayedCount|
+-----------+---------------+---------------+
|     588366|         247905|         280663|
+-----------+---------------+---------------+



## Problem 2: Number of flights which are delayed in departure and number of flights delayed in arrival for each day along with number of flights departed for each day
- output cols: FlightDate, FlightCount, DepDelayedCount, ArrDelayedCount
- flight date format: YYYY-MM-dd
- sorted in ascending order by flight date

In [165]:
airtraffic_df. \
filter(col('Cancelled') == 0). \
groupby(to_date(
    concat_ws(
        '-',
        'year',
        lpad('month', 2, '0'),
        lpad('dayOfMonth', 2, '0')
    )
).alias('FlightDate')). \
agg(
    count('*').alias('FlightCount'),
    sum(
        when(col('IsDepDelayed') == 'YES', 1). \
        otherwise(0)
    ).alias('DepDelayedCount'),
    sum(
        when(col('IsArrDelayed') == 'YES', 1). \
        otherwise(0)
    ).alias('ArrDelayedCount'),
). \
sort('FlightDate'). \
show()

+----------+-----------+---------------+---------------+
|FlightDate|FlightCount|DepDelayedCount|ArrDelayedCount|
+----------+-----------+---------------+---------------+
|2008-01-01|      18623|          10501|          11173|
|2008-01-02|      20442|          13294|          13749|
|2008-01-03|      20462|          11819|          12013|
|2008-01-04|      20160|           9406|           9824|
|2008-01-05|      17610|           9051|           9345|
|2008-01-06|      19210|          10542|          10705|
|2008-01-07|      19762|           8122|           8683|
|2008-01-08|      19140|           7483|           8938|
|2008-01-09|      19443|           5962|           6857|
|2008-01-10|      19956|           7033|           8565|
|2008-01-11|      19825|           7255|           8164|
|2008-01-12|      16346|           3902|           4078|
|2008-01-13|      18587|           6634|           7473|
|2008-01-14|      19267|           5921|           7104|
|2008-01-15|      19204|       