## Section 16 Basic Transformations using Spark

### 184, 185 Basic Transformations

In [1]:
from pyspark.sql import SparkSession
import getpass

# OS login of whoever runs the notebook; it namespaces the warehouse
# directory and labels the application name below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse/')
    .config('spark.shuffle.io.connectionTimeout', '6000')
    .config("spark.driver.memory", '4g')
    .config('spark.executor.memory', '4g')
    .enableHiveSupport()
    .appName(f'{username} | Python - Basic Transformations')
    .master('yarn')
    .getOrCreate()
)

#         config('spark.executor.cores', '6'). \
#from pyspark.sql import SparkSession 

#spark = SparkSession.builder.getOrCreate()
#spark.conf.set('spark.executor.cores', '12')
#spark.conf.set("spark.driver.memory", '2g')
#spark.conf.set('spark.executor.memory', '2g')
#spark.conf.set("spark.python.worker.memory", '24g')
#spark.conf.set("spark.sql.shuffle.partitions", '12')
#sc = spark.sparkContext

In [2]:
%%sh

hdfs dfs -ls /public/airlines_all/airlines-part/flightmonth=200801

Found 1 items
-rw-r--r--   2 hdfs supergroup   14654075 2021-01-28 11:28 /public/airlines_all/airlines-part/flightmonth=200801/part-00252-5cde1303-4ebf-4a12-8fad-f5d9f9c9124a.c000.snappy.parquet


In [3]:
airtraffic_path = '/public/airlines_all/airlines-part/flightmonth=200801'

In [4]:
airtraffic = spark.read.parquet(airtraffic_path)

In [5]:
airtraffic.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Car

In [6]:
airtraffic.select('Year','Month', 'DayOfMonth').distinct().sort('DayOfMonth').show(31,truncate=False)

+----+-----+----------+
|Year|Month|DayOfMonth|
+----+-----+----------+
|2008|1    |1         |
|2008|1    |2         |
|2008|1    |3         |
|2008|1    |4         |
|2008|1    |5         |
|2008|1    |6         |
|2008|1    |7         |
|2008|1    |8         |
|2008|1    |9         |
|2008|1    |10        |
|2008|1    |11        |
|2008|1    |12        |
|2008|1    |13        |
|2008|1    |14        |
|2008|1    |15        |
|2008|1    |16        |
|2008|1    |17        |
|2008|1    |18        |
|2008|1    |19        |
|2008|1    |20        |
|2008|1    |21        |
|2008|1    |22        |
|2008|1    |23        |
|2008|1    |24        |
|2008|1    |25        |
|2008|1    |26        |
|2008|1    |27        |
|2008|1    |28        |
|2008|1    |29        |
|2008|1    |30        |
|2008|1    |31        |
+----+-----+----------+



In [7]:
airtraffic.count()

605659

In [9]:
airtraffic.select('Year','Month', 'DayOfMonth').distinct().count()

31

### Basic filtering of data (rows) using `filter` (alias: `where`)

In [10]:
airtraffic.filter("IsArrDelayed = 'YES'").count()

297956

In [11]:
airtraffic.filter("IsArrDelayed = 'YES' and IsDepDelayed='YES'").count()

210965

In [12]:
airtraffic.filter(airtraffic["IsArrDelayed"]=="YES").count()

297956

In [13]:
airtraffic.filter((airtraffic["IsArrDelayed"]=="YES") & (airtraffic["IsDepDelayed"]=="YES")).count()

210965

In [14]:
from pyspark.sql.functions import col

In [15]:
airtraffic.filter(col("IsArrDelayed")=="YES").count()

297956

In [16]:
airtraffic.filter((col("IsArrDelayed")=="YES") & (col("IsDepDelayed")=="YES")).count()

210965

In [17]:
# Project the date, route and delay-flag columns, keep the rows where both
# delay flags are 'YES', and preview the first five.
(
    airtraffic
    .select("Year", "Month", "DayOfMonth", "Origin", "Dest", "IsArrDelayed", "IsDepDelayed")
    .filter((col("IsArrDelayed") == "YES") & (col("IsDepDelayed") == "YES"))
    .show(5)
)

+----+-----+----------+------+----+------------+------------+
|Year|Month|DayOfMonth|Origin|Dest|IsArrDelayed|IsDepDelayed|
+----+-----+----------+------+----+------------+------------+
|2008|    1|        17|   SYR| CVG|         YES|         YES|
|2008|    1|        18|   MCI| CVG|         YES|         YES|
|2008|    1|        21|   DCA| JFK|         YES|         YES|
|2008|    1|        22|   ORD| CVG|         YES|         YES|
|2008|    1|         1|   ATL| BNA|         YES|         YES|
+----+-----+----------+------+----+------------+------------+
only showing top 5 rows



In [18]:
airtraffic.select('Cancelled').distinct().show()

+---------+
|Cancelled|
+---------+
|        1|
|        0|
+---------+



In [19]:
airtraffic.filter(col('Cancelled')==1).count()

17293

In [20]:
airtraffic.filter('Cancelled==1').count()

17293

In [21]:
airtraffic.filter(airtraffic['Cancelled']==1).count()

17293

In [22]:
airtraffic.filter(airtraffic.Cancelled==1).count()

17293

In [23]:
airtraffic.filter("Origin == 'SFO'").count()

11573

In [24]:
airtraffic.filter(airtraffic["Origin"] == 'SFO').count()

11573

In [25]:
airtraffic.select('IsDepDelayed').distinct().show()

+------------+
|IsDepDelayed|
+------------+
|         YES|
|          NO|
+------------+



In [28]:
airtraffic.filter("IsDepDelayed = 'NO'").select('Cancelled').distinct().show()

+---------+
|Cancelled|
+---------+
|        0|
+---------+



In [29]:
airtraffic.filter("IsDepDelayed = 'NO'").count()

340461

In [30]:
airtraffic.filter(airtraffic["IsDepDelayed"] == 'NO').count()

340461

In [31]:
airtraffic.filter(col("IsDepDelayed") == 'NO').count()

340461

In [37]:
airtraffic.filter(airtraffic.IsDepDelayed == 'NO').count()

340461

### 187 Filtering Example using dates on Spark DF

In [87]:
from pyspark.sql import SparkSession
import getpass

# OS login of whoever runs the notebook; it namespaces the warehouse
# directory and labels the application name below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse/')
    .config('spark.shuffle.io.connectionTimeout', '6000')
    .config("spark.driver.memory", '4g')
    .config('spark.executor.memory', '4g')
    .enableHiveSupport()
    .appName(f'{username} | Python - Basic Transformations')
    .master('yarn')
    .getOrCreate()
)

In [39]:
from pyspark.sql.functions import col

In [40]:
airtraffic_path = '/public/airlines_all/airlines-part/flightmonth=200801'

In [41]:
airtraffic = spark.read.parquet(airtraffic_path)

In [42]:
airtraffic.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Car

In [43]:
airtraffic.select('Year','Month','DayOfMonth').distinct().sort('DayOfMonth').show(31)

+----+-----+----------+
|Year|Month|DayOfMonth|
+----+-----+----------+
|2008|    1|         1|
|2008|    1|         2|
|2008|    1|         3|
|2008|    1|         4|
|2008|    1|         5|
|2008|    1|         6|
|2008|    1|         7|
|2008|    1|         8|
|2008|    1|         9|
|2008|    1|        10|
|2008|    1|        11|
|2008|    1|        12|
|2008|    1|        13|
|2008|    1|        14|
|2008|    1|        15|
|2008|    1|        16|
|2008|    1|        17|
|2008|    1|        18|
|2008|    1|        19|
|2008|    1|        20|
|2008|    1|        21|
|2008|    1|        22|
|2008|    1|        23|
|2008|    1|        24|
|2008|    1|        25|
|2008|    1|        26|
|2008|    1|        27|
|2008|    1|        28|
|2008|    1|        29|
|2008|    1|        30|
|2008|    1|        31|
+----+-----+----------+



In [62]:
airtraffic.select('Year','Month','DayOfMonth').distinct().count()

31

In [45]:
airtraffic.count()

605659

In [46]:
l = [('X',)]

In [47]:
df = spark.createDataFrame(l,'dummy STRING')

In [48]:
df.show()

+-----+
|dummy|
+-----+
|    X|
+-----+



In [53]:
from pyspark.sql.functions import current_date

In [54]:
df.select(current_date()).show()

+--------------+
|current_date()|
+--------------+
|    2024-02-12|
+--------------+



In [55]:
from pyspark.sql.functions import date_format

In [56]:
df.select(current_date().alias("current_date"), date_format(current_date(),'EE').alias('day_name')).show()

+------------+--------+
|current_date|day_name|
+------------+--------+
|  2024-02-12|     Mon|
+------------+--------+



In [57]:
df.select(current_date().alias("current_date"), date_format(current_date(),'EEEE').alias('day_name')).show()

+------------+--------+
|current_date|day_name|
+------------+--------+
|  2024-02-12|  Monday|
+------------+--------+



In [58]:
airtraffic.select('Year','Month','DayOfMonth').show()

+----+-----+----------+
|Year|Month|DayOfMonth|
+----+-----+----------+
|2008|    1|        16|
|2008|    1|        17|
|2008|    1|        17|
|2008|    1|        17|
|2008|    1|        17|
|2008|    1|        18|
|2008|    1|        18|
|2008|    1|        19|
|2008|    1|        20|
|2008|    1|        20|
|2008|    1|        21|
|2008|    1|        21|
|2008|    1|        21|
|2008|    1|        21|
|2008|    1|        21|
|2008|    1|        22|
|2008|    1|        22|
|2008|    1|        23|
|2008|    1|        23|
|2008|    1|        23|
+----+-----+----------+
only showing top 20 rows



In [59]:
from pyspark.sql.functions import col, lpad, concat

In [65]:
airtraffic.select(
    concat(
        col("year"),
        lpad(col("Month"),2,"0"),
        lpad(col("DayOfMonth"),2,"0")
    ).alias('FlightDate')
).show()

+----------+
|FlightDate|
+----------+
|  20080116|
|  20080117|
|  20080117|
|  20080117|
|  20080117|
|  20080118|
|  20080118|
|  20080119|
|  20080120|
|  20080120|
|  20080121|
|  20080121|
|  20080121|
|  20080121|
|  20080121|
|  20080122|
|  20080122|
|  20080123|
|  20080123|
|  20080123|
+----------+
only showing top 20 rows



In [66]:
from pyspark.sql.functions import to_date

In [68]:
airtraffic.select(
        concat(
            col("year"),
            lpad(col("Month"),2,"0"),
            lpad(col("DayOfMonth"),2,"0")
        ).alias('FlightDate')
    ).\
    select(to_date(col('FlightDate'), 'yyyyMMdd')). \
show()

+-----------------------------+
|to_date(FlightDate, yyyyMMdd)|
+-----------------------------+
|                   2008-01-16|
|                   2008-01-17|
|                   2008-01-17|
|                   2008-01-17|
|                   2008-01-17|
|                   2008-01-18|
|                   2008-01-18|
|                   2008-01-19|
|                   2008-01-20|
|                   2008-01-20|
|                   2008-01-21|
|                   2008-01-21|
|                   2008-01-21|
|                   2008-01-21|
|                   2008-01-21|
|                   2008-01-22|
|                   2008-01-22|
|                   2008-01-23|
|                   2008-01-23|
|                   2008-01-23|
+-----------------------------+
only showing top 20 rows



In [69]:
airtraffic.select(
        concat(
            col("year"),
            lpad(col("Month"),2,"0"),
            lpad(col("DayOfMonth"),2,"0")
        ).alias('FlightDate')
    ).\
    select(to_date(col('FlightDate'), 'yyyyMMdd').alias('FlightDate')). \
show()

+----------+
|FlightDate|
+----------+
|2008-01-16|
|2008-01-17|
|2008-01-17|
|2008-01-17|
|2008-01-17|
|2008-01-18|
|2008-01-18|
|2008-01-19|
|2008-01-20|
|2008-01-20|
|2008-01-21|
|2008-01-21|
|2008-01-21|
|2008-01-21|
|2008-01-21|
|2008-01-22|
|2008-01-22|
|2008-01-23|
|2008-01-23|
|2008-01-23|
+----------+
only showing top 20 rows



In [72]:
airtraffic.select(
        concat(
            col("year"),
            lpad(col("Month"),2,"0"),
            lpad(col("DayOfMonth"),2,"0")
        ).alias('FlightDate')
    ).\
    selectExpr("to_date(FlightDate, 'yyyyMMdd') AS FlightDate"). \
show()

+----------+
|FlightDate|
+----------+
|2008-01-16|
|2008-01-17|
|2008-01-17|
|2008-01-17|
|2008-01-17|
|2008-01-18|
|2008-01-18|
|2008-01-19|
|2008-01-20|
|2008-01-20|
|2008-01-21|
|2008-01-21|
|2008-01-21|
|2008-01-21|
|2008-01-21|
|2008-01-22|
|2008-01-22|
|2008-01-23|
|2008-01-23|
|2008-01-23|
+----------+
only showing top 20 rows



In [79]:
airtraffic.select(
        concat(
            col("year"),
            lpad(col("Month"),2,"0"),
            lpad(col("DayOfMonth"),2,"0")
        ).alias('FlightDate')
    ).\
    selectExpr("date_format(to_date(FlightDate, 'yyyyMMdd'),'EEEE') AS FlightDate"). \
show()

+----------+
|FlightDate|
+----------+
| Wednesday|
|  Thursday|
|  Thursday|
|  Thursday|
|  Thursday|
|    Friday|
|    Friday|
|  Saturday|
|    Sunday|
|    Sunday|
|    Monday|
|    Monday|
|    Monday|
|    Monday|
|    Monday|
|   Tuesday|
|   Tuesday|
| Wednesday|
| Wednesday|
| Wednesday|
+----------+
only showing top 20 rows



In [82]:
# Count flights flown on a Sunday: glue the integer Year/Month/DayOfMonth
# columns into a yyyyMMdd string, parse it as a date inside the SQL filter,
# and compare its full day name.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .filter("""
        date_format(to_date(FlightDate, 'yyyyMMdd'),'EEEE') = 'Sunday'
        """)
    .count()
)

76395

In [83]:
# Same Sunday test as the previous cell, additionally requiring the
# IsDepDelayed flag to be 'YES'. The condition is a SQL-style string.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .filter("""
        IsDepDelayed = 'YES' AND 
        date_format(to_date(FlightDate, 'yyyyMMdd'),'EEEE') = 'Sunday'
        """)
    .count()
)

34708

In [85]:
from pyspark.sql.functions import col,concat, lpad, \
                                    date_format, to_date

In [86]:
# Column-API equivalent of the SQL-string filter above: count flights whose
# IsDepDelayed flag is 'YES' and whose derived FlightDate falls on a Sunday.
# FlightDate is referenced through col(...) like every other column in this
# notebook, rather than as a bare parenthesised string.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .filter(
        (col("IsDepDelayed") == 'YES')
        & (date_format(to_date(col("FlightDate"), 'yyyyMMdd'), 'EEEE') == 'Sunday')
    )
    .count()
)

34708

### 188 Boolean Operators while filtering from Spark Data Frames

In [1]:
from pyspark.sql import SparkSession
import getpass

# OS login of whoever runs the notebook; it namespaces the warehouse
# directory and labels the application name below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse/')
    .config('spark.shuffle.io.connectionTimeout', '6000')
    .config("spark.driver.memory", '4g')
    .config('spark.executor.memory', '4g')
    .enableHiveSupport()
    .appName(f'{username} | Python - Basic Transformations')
    .master('yarn')
    .getOrCreate()
)

In [2]:
airtraffic_path = '/public/airlines_all/airlines-part/flightmonth=200801'

In [3]:
from pyspark.sql.functions import col

In [4]:
airtraffic_path = '/public/airlines_all/airlines-part/flightmonth=200801'

In [5]:
airtraffic = spark.read.parquet(airtraffic_path)

In [6]:
airtraffic. \
    select('IsDepDelayed','IsArrDelayed','Cancelled'). \
    distinct(). \
    show()

+------------+------------+---------+
|IsDepDelayed|IsArrDelayed|Cancelled|
+------------+------------+---------+
|          NO|          NO|        0|
|         YES|         YES|        1|
|          NO|         YES|        0|
|         YES|          NO|        0|
|         YES|         YES|        0|
+------------+------------+---------+



In [7]:
airtraffic. \
    select('IsDepDelayed','IsArrDelayed','Cancelled'). \
    distinct(). \
    count()

5

In [8]:
airtraffic. \
    filter("IsDepDelayed='YES' AND IsArrDelayed='NO' AND Cancelled=0"). \
    count()

54233

In [9]:
# API style

In [10]:
# Column-API form of the SQL-string filter above: three conditions combined
# with &, each wrapped in parentheses because & binds tighter than ==.
(
    airtraffic
    .filter(
        (col("IsDepDelayed") == 'YES')
        & (col("IsArrDelayed") == 'NO')
        & (col("Cancelled") == 0)
    )
    .count()
)

54233

In [11]:
# Same three-way condition, this time referencing the columns as attributes
# of the DataFrame instead of through col(...).
(
    airtraffic
    .filter(
        (airtraffic.IsDepDelayed == 'YES')
        & (airtraffic.IsArrDelayed == 'NO')
        & (airtraffic.Cancelled == 0)
    )
    .count()
)

54233

In [12]:
airtraffic. \
    select('IsDepDelayed','IsArrDelayed','Cancelled'). \
    distinct(). \
    show()

+------------+------------+---------+
|IsDepDelayed|IsArrDelayed|Cancelled|
+------------+------------+---------+
|          NO|          NO|        0|
|         YES|         YES|        1|
|          NO|         YES|        0|
|         YES|          NO|        0|
|         YES|         YES|        0|
+------------+------------+---------+



In [13]:
# ArrDelay is stored as a string column (see printSchema), so cast it
# explicitly before the numeric comparison instead of relying on Spark's
# implicit string-to-number coercion.
(
    airtraffic
    .filter("IsDepDelayed='NO' AND CAST(ArrDelay AS INT)>=15")
    .count()
)

20705

In [14]:
# Same filter with an extra Cancelled=0 condition. ArrDelay is a string
# column, so it is cast explicitly before being compared with 15.
(
    airtraffic
    .filter("IsDepDelayed='NO' AND CAST(ArrDelay AS INT)>=15 AND Cancelled=0")
    .count()
)

20705

In [15]:
# API style

In [16]:
# Column-API form using attribute access. ArrDelay is a string column, so it
# is cast to int explicitly; the >= 15 comparison is then numeric by
# construction rather than by implicit coercion.
(
    airtraffic
    .filter(
        (airtraffic.IsDepDelayed == 'NO')
        & (airtraffic.ArrDelay.cast('int') >= 15)
    )
    .count()
)

20705

In [17]:
# Column-API form using item access. ArrDelay is a string column, so it is
# cast to int explicitly before the numeric comparison.
(
    airtraffic
    .filter(
        (airtraffic["IsDepDelayed"] == 'NO')
        & (airtraffic["ArrDelay"].cast('int') >= 15)
    )
    .count()
)

20705

In [18]:
from pyspark.sql.functions import concat, lpad, col

In [26]:
# Append a FlightDate string column (Year followed by zero-padded Month and
# DayOfMonth) and preview five rows.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .show(5)
)

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+----------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|IsArrDelayed|IsDepDelayed|FlightDate|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+--------

In [27]:
l = [('X',)]

In [28]:
df = spark.createDataFrame(l,'dummy STRING')

In [29]:
df

dummy
X


In [30]:
from pyspark.sql.functions import current_date

df.select(current_date()).show()

+--------------+
|current_date()|
+--------------+
|    2024-02-12|
+--------------+



In [31]:
from pyspark.sql.functions import date_format

In [32]:
df.select(current_date(),date_format(current_date(),'EE').alias('day_name')).show()

+--------------+--------+
|current_date()|day_name|
+--------------+--------+
|    2024-02-12|     Mon|
+--------------+--------+



In [33]:
df.select(current_date(),date_format(current_date(),'EEEE').alias('day_name')).show()

+--------------+--------+
|current_date()|day_name|
+--------------+--------+
|    2024-02-12|  Monday|
+--------------+--------+



In [39]:
from pyspark.sql.functions import col, concat, lpad, to_date, date_format, current_date

# SQL-string filter: IsDepDelayed is 'YES', Cancelled is 0, and the derived
# FlightDate falls on a Saturday or a Sunday. The OR is parenthesised so it
# is evaluated as a unit against the preceding ANDs.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .filter("""
        IsDepDelayed = 'YES' AND Cancelled=0 AND
        (date_format(to_date(FlightDate,'yyyyMMdd'),'EEEE')='Saturday'
            OR date_format(to_date(FlightDate,'yyyyMMdd'),'EEEE')='Sunday'
            )
    """)
    .count()
)

57873

In [42]:
# Column-API form of the weekend filter: & for AND, | for OR, with every
# comparison wrapped in parentheses.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .filter(
        (col("IsDepDelayed") == 'YES')
        & (col("Cancelled") == 0)
        & (
            (date_format(to_date(col("FlightDate"), 'yyyyMMdd'), 'EEEE') == 'Saturday')
            | (date_format(to_date(col("FlightDate"), 'yyyyMMdd'), 'EEEE') == 'Sunday')
        )
    )
    .count()
)

57873

In [47]:
# Weekend filter again, referencing the pre-existing columns as attributes of
# airtraffic; the derived FlightDate column is only reachable through col().
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .filter(
        (airtraffic.IsDepDelayed == 'YES')
        & (airtraffic.Cancelled == 0)
        & (
            (date_format(to_date(col("FlightDate"), 'yyyyMMdd'), 'EEEE') == 'Saturday')
            | (date_format(to_date(col("FlightDate"), 'yyyyMMdd'), 'EEEE') == 'Sunday')
        )
    )
    .count()
)

57873

In [48]:
### 189 Using the IN operator or the `isin` function while filtering

In [58]:
from pyspark.sql import SparkSession
import getpass

# OS login of whoever runs the notebook; it namespaces the warehouse
# directory and labels the application name below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse/')
    .config('spark.shuffle.io.connectionTimeout', '6000')
    .config("spark.driver.memory", '4g')
    .config('spark.executor.memory', '4g')
    .enableHiveSupport()
    .appName(f'{username} | Python - Basic Transformations')
    .master('yarn')
    .getOrCreate()
)

In [2]:
airtraffic_path = '/public/airlines_all/airlines-part/flightmonth=200801'

In [3]:
from pyspark.sql.functions import col

In [4]:
airtraffic = spark.read.parquet(airtraffic_path)

In [5]:
airtraffic. \
    select('IsDepDelayed','IsArrDelayed','Cancelled'). \
    distinct(). \
    show()

+------------+------------+---------+
|IsDepDelayed|IsArrDelayed|Cancelled|
+------------+------------+---------+
|          NO|          NO|        0|
|         YES|         YES|        1|
|          NO|         YES|        0|
|         YES|          NO|        0|
|         YES|         YES|        0|
+------------+------------+---------+



In [6]:
airtraffic. \
    filter("Origin IN ('ORD','DFW', 'ATL','LAX','SFO')"). \
    count()

118212

In [7]:
airtraffic.count()

605659

In [8]:
from pyspark.sql.functions import col

In [9]:
l=[('X',)]

In [10]:
df = spark.createDataFrame(l,"dummy STRING")

In [11]:
df

dummy
X


In [12]:
# Throwaway Column handle used only to pull up help() for Column.isin in the
# next cell; the name 'X' is never resolved against a DataFrame here.
c = col('X')

In [13]:
help(c.isin)

Help on method isin in module pyspark.sql.column:

isin(*cols) method of pyspark.sql.column.Column instance
    A boolean expression that is evaluated to true if the value of this
    expression is contained by the evaluated values of the arguments.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df[df.name.isin("Bob", "Mike")].collect()
    [Row(age=5, name='Bob')]
    >>> df[df.age.isin([1, 2, 3])].collect()
    [Row(age=2, name='Alice')]



In [14]:
airtraffic. \
    filter(col("Origin").isin('ORD','DFW', 'ATL','LAX','SFO')). \
    count()

118212

In [15]:
airtraffic. \
    filter(airtraffic.Origin.isin('ORD','DFW', 'ATL','LAX','SFO')). \
    count()

118212

In [16]:
from pyspark.sql.functions import col,date_format, to_date, concat,lpad

In [17]:
# SQL-string weekend filter using IN in place of two OR-ed equality tests on
# the day name.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .filter(
        """
            IsDepDelayed='YES' AND Cancelled=0 AND
            date_format(to_date(FlightDate,'yyyyMMdd'),'EEEE') IN ('Saturday','Sunday')
        """
    )
    .count()
)

57873

In [20]:
# Column-API weekend filter: isin(...) on the formatted day name replaces the
# two OR-ed equality comparisons.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .filter(
        (col("IsDepDelayed") == 'YES')
        & (col("Cancelled") == 0)
        & date_format(to_date(col("FlightDate"), 'yyyyMMdd'), 'EEEE').isin('Saturday', 'Sunday')
    )
    .count()
)

57873

In [21]:
### 190 Using the LIKE operator or the `like` function while filtering Spark DataFrames

In [22]:
from pyspark.sql import SparkSession
import getpass

# OS login of whoever runs the notebook; it namespaces the warehouse
# directory and labels the application name below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse/')
    .config('spark.shuffle.io.connectionTimeout', '6000')
    .config("spark.driver.memory", '4g')
    .config('spark.executor.memory', '4g')
    .enableHiveSupport()
    .appName(f'{username} | Python - Basic Transformations')
    .master('yarn')
    .getOrCreate()
)

In [23]:
airtraffic_path = '/public/airlines_all/airlines-part/flightmonth=200801'

In [24]:
from pyspark.sql.functions import col

In [25]:
airtraffic = spark.read.parquet(airtraffic_path)

In [31]:
airtraffic. \
    select('IsDepDelayed','IsArrDelayed','Cancelled'). \
    distinct(). \
    show()

+------------+------------+---------+
|IsDepDelayed|IsArrDelayed|Cancelled|
+------------+------------+---------+
|          NO|          NO|        0|
|         YES|         YES|        1|
|          NO|         YES|        0|
|         YES|          NO|        0|
|         YES|         YES|        0|
+------------+------------+---------+



In [32]:
# Sample employee records, one tuple per employee, in the order
# (employee_id, first_name, last_name, salary, bonus, nationality,
#  phone_number, ssn).
# The bonus slot holds an int, None, or an empty string depending on the row.
employees = [
    (1, "Scott", "Tiger", 1000.0, 10, "united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, None, "India", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, '', "united KINGDOM", "+44 111 111 1111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, 10, "AUSTRALIA", "+61 987 654 3210", "789 12 6118"),
]

In [33]:
# Turn the employees tuples into a DataFrame using a DDL-style schema string.
# bonus is declared STRING even though the source rows carry ints, None and
# an empty string in that slot.
employeesDF = spark.createDataFrame(
    employees,
    schema="""employee_id INT, first_name STRING, 
        last_name STRING, salary FLOAT, bonus STRING, nationality STRING,
        phone_number STRING, ssn STRING""",
)

In [34]:
employeesDF.show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [37]:
employeesDF. \
    filter("first_name LIKE 'Sco%'"). \
    show()

+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|   phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+



In [38]:
employeesDF. \
    filter("UPPER(first_name) LIKE 'SCO%'"). \
    show()

+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|   phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+



In [39]:
employeesDF. \
    filter(col('first_name').like('Sco%')). \
    show()

+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|   phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+



In [60]:
# Throwaway Column handle used only to pull up help() for Column.like in the
# next cell; the name 'X' is never resolved against a DataFrame here.
c = col('X')

In [61]:
help(c.like)

Help on method _ in module pyspark.sql.column:

_(other) method of pyspark.sql.column.Column instance
    SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match.
    
    Parameters
    ----------
    other : str
        a SQL LIKE pattern
    
    See Also
    --------
    pyspark.sql.Column.rlike
    
    Examples
    --------
    >>> df.filter(df.name.like('Al%')).collect()
    [Row(age=2, name='Alice')]



In [42]:
from pyspark.sql.functions import upper

# Column API equivalent of "UPPER(first_name) LIKE 'SCO%'".
(
    employeesDF
    .filter(upper(col('first_name')).like('SCO%'))
    .show()
)

+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|   phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+



In [44]:
# SQL-string predicate: upper-cased first_name contains 'OT' anywhere.
(
    employeesDF
    .filter("UPPER(first_name) LIKE '%OT%'")
    .show()
)

+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|   phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+



In [45]:
# Column API form of the "contains 'OT'" match on the upper-cased first_name.
(
    employeesDF
    .filter(upper(col("first_name")).like("%OT%"))
    .show()
)

+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|   phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+



In [46]:
# SQL-string predicate: rows whose upper-cased first_name does NOT contain 'OT'.
(
    employeesDF
    .filter("UPPER(first_name) NOT LIKE '%OT%'")
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [48]:
# Positive "contains 'OT'" match via the Column API (the next cell negates it).
(
    employeesDF
    .filter(upper(col("first_name")).like('%OT%'))
    .show()
)

+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|   phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+



In [49]:
# `~` negates the boolean Column, giving the Column API counterpart of NOT LIKE.
(
    employeesDF
    .filter(~upper(col("first_name")).like('%OT%'))
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [50]:
# SQL-string predicate: phone numbers that do not contain '44'.
(
    employeesDF
    .filter("phone_number NOT LIKE '%44%'")
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| null|        India|+91 234 567 8901|456 78 9123|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



In [51]:
# Column API: phone numbers that contain '44'.
(
    employeesDF
    .filter(col("phone_number").like('%44%'))
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [54]:
# Column API: negate the LIKE match to keep phone numbers without '44'.
(
    employeesDF
    .filter(~col("phone_number").like('%44%'))
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| null|        India|+91 234 567 8901|456 78 9123|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



In [53]:
## 191 Using the BETWEEN Operator while Filtering Spark Data Frames

In [1]:
from pyspark.sql import SparkSession
import getpass

# The OS user name is reused for the warehouse directory and the application name.
username = getpass.getuser()

# Hive-enabled session on YARN; getOrCreate() hands back the session object.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse/')
    .config('spark.shuffle.io.connectionTimeout', '6000')
    .config("spark.driver.memory", '4g')
    .config('spark.executor.memory', '4g')
    .enableHiveSupport()
    .appName(f'{username} | Python - Basic Transformations')
    .master('yarn')
    .getOrCreate()
)

In [2]:
airtraffic_path = '/public/airlines_all/airlines-part/flightmonth=200801'

In [3]:
from pyspark.sql.functions import col

In [4]:
airtraffic = spark.read.parquet(airtraffic_path)

In [5]:
# List the distinct combinations of the delay / cancellation flags.
(
    airtraffic
    .select('IsDepDelayed', 'IsArrDelayed', 'Cancelled')
    .distinct()
    .show()
)

+------------+------------+---------+
|IsDepDelayed|IsArrDelayed|Cancelled|
+------------+------------+---------+
|          NO|          NO|        0|
|         YES|         YES|        1|
|          NO|         YES|        0|
|         YES|          NO|        0|
|         YES|         YES|        0|
+------------+------------+---------+



In [6]:
spark.sparkContext.getConf().getAll()

[('spark.driver.appUIAddress', 'http://g02.itversity.com:46093'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.sql.repl.eagerEval.enabled', 'true'),
 ('spark.eventLog.dir', 'hdfs:///spark-logs'),
 ('spark.driver.memory', '4g'),
 ('spark.ui.proxyBase', '/proxy/application_1707552082651_2452'),
 ('spark.dynamicAllocation.maxExecutors', '10'),
 ('spark.app.startTime', '1707811962469'),
 ('spark.app.name', 'itv011204 | Python - Basic Transformations'),
 ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',
  'http://m02.itversity.com:19088/proxy/application_1707552082651_2452'),
 ('spark.shuffle.io.connectionTimeout', '6000'),
 ('spark.driver.port', '32903'),
 ('spark.app.id', 'application_1707552082651_2452'),
 ('spark.yarn.historyServer.address', 'm02.itversity.com:18080'),
 ('spark.yarn.jars', ''),
 ('spark.history.provider',
  'org.apache.spark.deploy.history.FsHistoryProvider'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.history.fs.l

In [7]:
from pyspark.sql.functions import lpad, concat, col

# Add FlightDate = Year + zero-padded Month + zero-padded DayOfMonth
# (each padded to width 2 with "0") and preview the first five rows.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .show(5)
)

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+----------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|IsArrDelayed|IsDepDelayed|FlightDate|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+--------

In [8]:
# Count delayed, non-cancelled departures whose FlightDate falls in the
# inclusive range 20080101..20080109, using a SQL BETWEEN predicate.
# NOTE(review): FlightDate is produced by concat() yet compared with integer
# literals; this presumably relies on Spark's implicit cast - confirm.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .filter("""
        IsDepDelayed = 'YES' AND
        Cancelled = 0 AND
        FlightDate BETWEEN 20080101 AND 20080109
        """)
    .count()
)

86180

In [9]:
# Same count as the BETWEEN version, with the range spelled out as an
# explicit >= / <= pair inside the SQL predicate.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .filter("""
        IsDepDelayed = 'YES' AND
        Cancelled = 0 AND
        FlightDate >= 20080101 AND 
        FlightDate <= 20080109
        """)
    .count()
)

86180

In [10]:
c = col('X')

In [11]:
help(c.between)

Help on method between in module pyspark.sql.column:

between(lowerBound, upperBound) method of pyspark.sql.column.Column instance
    A boolean expression that is evaluated to true if the value of this
    expression is between the given columns.
    
    .. versionadded:: 1.3.0
    
    Examples
    --------
    >>> df.select(df.name, df.age.between(2, 4)).show()
    +-----+---------------------------+
    | name|((age >= 2) AND (age <= 4))|
    +-----+---------------------------+
    |Alice|                       true|
    |  Bob|                      false|
    +-----+---------------------------+



In [12]:
# Column API version of the date-range count: Column.between() supplies the
# inclusive lower/upper bound check.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .filter(
        (col("IsDepDelayed") == 'YES')
        & (col("Cancelled") == 0)
        & col("FlightDate").between(20080101, 20080109)
    )
    .count()
)

86180

In [13]:
# Column API version with the range written as two explicit comparisons.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0"),
        ),
    )
    .filter(
        (col("IsDepDelayed") == 'YES')
        & (col("Cancelled") == 0)
        & (col("FlightDate") >= 20080101)
        & (col("FlightDate") <= 20080109)
    )
    .count()
)

86180

In [14]:
# Count flights whose ArrDelay lies between 15 and 60 (Column API).
(
    airtraffic
    .filter(col("ArrDelay").between(15, 60))
    .count()
)

105319

In [16]:
# Count flights whose ArrDelay lies between 15 and 60 (SQL-string predicate).
(
    airtraffic
    .filter("ArrDelay BETWEEN 15 AND 60")
    .count()
)

105319

In [17]:
## 192 Dealing with NULLs while Filtering from Spark DF

In [67]:
from pyspark.sql import SparkSession
import getpass

# The OS user name is reused below for the warehouse directory and the app name.
username = getpass.getuser()

# Build the Hive-enabled session on YARN. Unlike the session cells of the
# neighbouring sections, this one also sets spark.network.timeout and
# spark.executor.heartbeatInterval in addition to the shuffle connection
# timeout and the 4g driver/executor memory settings.
spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir', f'/user/{username}/warehouse/'). \
        config('spark.shuffle.io.connectionTimeout','6000'). \
        config('spark.network.timeout','6000'). \
        config('spark.executor.heartbeatInterval','20s'). \
        config("spark.driver.memory", '4g'). \
        config('spark.executor.memory', '4g'). \
        enableHiveSupport(). \
        appName(f'{username} | Python - Basic Transformations'). \
        master('yarn'). \
        getOrCreate()

In [68]:
airtraffic_path = '/public/airlines_all/airlines-part/flightmonth=200801'

In [69]:
from pyspark.sql.functions import col

In [70]:
airtraffic = spark.read.parquet(airtraffic_path)

In [71]:
# List the distinct combinations of the delay / cancellation flags.
(
    airtraffic
    .select('IsDepDelayed', 'IsArrDelayed', 'Cancelled')
    .distinct()
    .show()
)

+------------+------------+---------+
|IsDepDelayed|IsArrDelayed|Cancelled|
+------------+------------+---------+
|          NO|          NO|        0|
|         YES|         YES|        1|
|          NO|         YES|        0|
|         YES|          NO|        0|
|         YES|         YES|        0|
+------------+------------+---------+



In [72]:
# Sample employee records, one tuple per employee:
# (employee_id, first_name, last_name, salary, bonus, nationality,
#  phone_number, ssn).
# bonus is written as text ('10') rather than as an int so that every value
# matches the STRING type declared for that column when the DataFrame is
# created. The column also carries one None and one empty string, which the
# filters in this section tell apart.
employees = [
    (1, "Scott", "Tiger", 1000.0, "10",
     "united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, None,
     "India", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, '',
     "united KINGDOM", "+44 111 111 1111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, "10",
     "AUSTRALIA", "+61 987 654 3210", "789 12 6118"),
]

In [73]:
# Turn the employees list into a DataFrame with an explicit DDL-style schema.
# bonus is declared STRING, so it can hold both NULL and the empty string.
employeesDF = spark. \
    createDataFrame(employees,
                    schema="""employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, bonus STRING, nationality STRING,
                    phone_number STRING, ssn STRING"""
                   )

In [74]:
employeesDF.show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [75]:
# SQL-string predicate: rows whose bonus is NULL.
(
    employeesDF
    .filter("bonus IS NULL")
    .show()
)

+-----------+----------+---------+------+-----+-----------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-----------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|      India|+91 234 567 8901|456 78 9123|
+-----------+----------+---------+------+-----+-----------+----------------+-----------+



In [76]:
# SQL-string predicate: rows whose bonus is not NULL. The empty-string row
# still appears in the recorded output.
(
    employeesDF
    .filter("bonus IS NOT NULL")
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [77]:
# SQL-string predicate: rows whose bonus is the empty string.
(
    employeesDF
    .filter("bonus = ''")
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [78]:
# SQL-string predicate using <>. In the recorded output the NULL-bonus row is
# absent too, not only the empty-string row.
(
    employeesDF
    .filter("bonus <> ''")
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



In [79]:
# Same inequality written with != instead of <>.
(
    employeesDF
    .filter("bonus != ''")
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



In [80]:
# Spell out both conditions: bonus is neither NULL nor the empty string.
(
    employeesDF
    .filter("bonus IS NOT NULL AND bonus != ''")
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



In [81]:
# Rows whose bonus is missing in either sense: NULL or empty string.
(
    employeesDF
    .filter("bonus IS NULL OR BONUS = ''")
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [82]:
# Negate the "NULL or empty" predicate with a leading `!`.
(
    employeesDF
    .filter("!(bonus IS NULL OR BONUS = '')")
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



In [83]:
from pyspark.sql.functions import col

In [84]:
c=col('X')

In [85]:
c.isNotNull?

Signature: c.isNotNull()
Docstring:
True if the current expression is NOT null.

Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
>>> df.filter(df.height.isNotNull()).collect()
[Row(name='Tom', height=80)]
File:      /opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/sql/column.py
Type:      method


In [86]:
# Column API: rows whose bonus is NULL.
(
    employeesDF
    .filter(col("bonus").isNull())
    .show()
)

+-----------+----------+---------+------+-----+-----------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-----------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|      India|+91 234 567 8901|456 78 9123|
+-----------+----------+---------+------+-----+-----------+----------------+-----------+



In [87]:
# Column API: rows whose bonus equals the empty string.
(
    employeesDF
    .filter(col("bonus") == '')
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [88]:
# Column API: rows whose bonus is not NULL (the empty-string row is kept).
(
    employeesDF
    .filter(col("bonus").isNotNull())
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [89]:
# Column API: bonus is neither NULL nor the empty string. The comparison
# stays parenthesised because `&` binds tighter than `!=`.
(
    employeesDF
    .filter(col("bonus").isNotNull() & (col("bonus") != ''))
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



In [90]:
# Negate the "present and non-empty" condition with `~`; the recorded output
# lists the NULL row and the empty-string row.
(
    employeesDF
    .filter(~(col("bonus").isNotNull() & (col("bonus") != '')))
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [91]:
# SQL-string predicate again: only the NULL-bonus row.
(
    employeesDF
    .filter("bonus IS NULL")
    .show()
)

+-----------+----------+---------+------+-----+-----------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-----------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|      India|+91 234 567 8901|456 78 9123|
+-----------+----------+---------+------+-----+-----------+----------------+-----------+



In [92]:
# SQL-string predicate again: only the empty-string bonus row.
(
    employeesDF
    .filter("bonus = ''")
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [93]:
# Combine both checks with OR to catch NULL and empty-string bonuses together.
(
    employeesDF
    .filter("bonus IS NULL OR bonus=''")
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [94]:
# Single-expression alternative: per the recorded output,
# nullif(bonus, '') IS NULL matches both the NULL row and the empty-string row.
(
    employeesDF
    .filter("nullif(bonus,'') IS NULL")
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [95]:
# Complement of the nullif check: rows with a non-NULL, non-empty bonus.
(
    employeesDF
    .filter("nullif(bonus,'') IS NOT NULL")
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



In [96]:
# Cast bonus to int and test for NULL; the recorded output shows this picks
# up the empty-string row as well as the NULL row.
(
    employeesDF
    .filter(col("bonus").cast('int').isNull())
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [97]:
# Negated cast check: rows whose bonus survives the cast to int.
(
    employeesDF
    .filter(~col("bonus").cast('int').isNull())
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



In [98]:
spark.sparkContext.getConf().getAll()

[('spark.eventLog.enabled', 'true'),
 ('spark.sql.repl.eagerEval.enabled', 'true'),
 ('spark.eventLog.dir', 'hdfs:///spark-logs'),
 ('spark.driver.memory', '4g'),
 ('spark.dynamicAllocation.maxExecutors', '10'),
 ('spark.app.name', 'itv011204 | Python - Basic Transformations'),
 ('spark.shuffle.io.connectionTimeout', '6000'),
 ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',
  'http://m02.itversity.com:19088/proxy/application_1707552082651_2466'),
 ('spark.ui.proxyBase', '/proxy/application_1707552082651_2466'),
 ('spark.yarn.historyServer.address', 'm02.itversity.com:18080'),
 ('spark.driver.appUIAddress', 'http://g02.itversity.com:33795'),
 ('spark.yarn.jars', ''),
 ('spark.history.provider',
  'org.apache.spark.deploy.history.FsHistoryProvider'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.history.fs.logDirectory', 'hdfs:///spark-logs'),
 ('spark.submit.deployMode', 'client'),
 ('spark.history.fs.update.interval', '10s'),
 ('spa

In [99]:
## 193 Total Aggregations on Spark Data Frames

In [100]:
## count, sum, avg, min, max

In [1]:
from pyspark.sql import SparkSession
import getpass

# The OS user name is reused below for the warehouse directory and the app name.
username = getpass.getuser()

# Build the Hive-enabled session on YARN with 4g driver/executor memory and a
# shuffle connection timeout of '6000'.
spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir', f'/user/{username}/warehouse/'). \
        config('spark.shuffle.io.connectionTimeout','6000'). \
        config("spark.driver.memory", '4g'). \
        config('spark.executor.memory', '4g'). \
        enableHiveSupport(). \
        appName(f'{username} | Python - Basic Transformations'). \
        master('yarn'). \
        getOrCreate()

# Optional settings left disabled in this cell (network timeout and executor
# heartbeat interval); they would go inside the builder chain above:
#        config('spark.network.timeout','6000'). \
#        config('spark.executor.heartbeatInterval','20s'). \


In [2]:
airtraffic_path = '/public/airlines_all/airlines-part/flightmonth=200801'

In [3]:
from pyspark.sql.functions import col

In [4]:
airtraffic = spark.read.parquet(airtraffic_path)

In [5]:
# List the distinct combinations of the delay / cancellation flags.
(
    airtraffic
    .select('IsDepDelayed', 'IsArrDelayed', 'Cancelled')
    .distinct()
    .show()
)

+------------+------------+---------+
|IsDepDelayed|IsArrDelayed|Cancelled|
+------------+------------+---------+
|          NO|          NO|        0|
|         YES|         YES|        1|
|          NO|         YES|        0|
|         YES|          NO|        0|
|         YES|         YES|        0|
+------------+------------+---------+



In [6]:
airtraffic.count()

605659

In [8]:
type(airtraffic.count())

int

In [9]:
from pyspark.sql.functions import count

In [10]:
# Total row count as a one-row DataFrame via the count() aggregate function.
(
    airtraffic
    .select(count("*").alias("count"))
    .show()
)

+------+
| count|
+------+
|605659|
+------+



In [11]:
from pyspark.sql.functions import lit

# Counting a literal 1 per row; the recorded output shows the same total as count("*").
(
    airtraffic
    .select(count(lit(1)).alias("count"))
    .show()
)

+------+
| count|
+------+
|605659|
+------+



In [12]:
# describe(): count, mean, stddev, min and max for the three date columns.
(
    airtraffic
    .select('Year', 'Month', 'DayOfMonth')
    .describe()
    .show()
)

+-------+------+------+------------------+
|summary|  Year| Month|        DayOfMonth|
+-------+------+------+------------------+
|  count|605659|605659|            605659|
|   mean|2008.0|   1.0|15.908469947610785|
| stddev|   0.0|   0.0| 8.994294747375292|
|    min|  2008|     1|                 1|
|    max|  2008|     1|                31|
+-------+------+------+------------------+



In [13]:
# summary(): like describe(), and the recorded output additionally shows the
# 25% / 50% / 75% rows.
(
    airtraffic
    .select('Year', 'Month', 'DayOfMonth')
    .summary()
    .show()
)

+-------+------+------+------------------+
|summary|  Year| Month|        DayOfMonth|
+-------+------+------+------------------+
|  count|605659|605659|            605659|
|   mean|2008.0|   1.0|15.908469947610785|
| stddev|   0.0|   0.0| 8.994294747375292|
|    min|  2008|     1|                 1|
|    25%|  2008|     1|                 8|
|    50%|  2008|     1|                16|
|    75%|  2008|     1|                24|
|    max|  2008|     1|                31|
+-------+------+------+------------------+



In [14]:
# Number of distinct (Year, Month, DayOfMonth) combinations, returned as a
# plain Python value by the count() action.
(
    airtraffic
    .select('Year', 'Month', 'DayOfMonth')
    .distinct()
    .count()
)

31

In [15]:
from pyspark.sql.functions import countDistinct

In [16]:
# Same distinct count computed with the countDistinct aggregate and shown as a DataFrame.
(
    airtraffic
    .select(
        countDistinct('Year', 'Month', 'DayOfMonth').alias('countDistinct')
    )
    .show()
)

+-------------+
|countDistinct|
+-------------+
|           31|
+-------------+



In [17]:
from pyspark.sql.functions import lpad, concat

In [19]:
# Count distinct dates after collapsing the three columns into one key:
# Year followed by Month and DayOfMonth, each left-padded with '0' to width 2.
(
    airtraffic
    .select(
        countDistinct(
            concat(
                'Year',
                lpad('Month', 2, '0'),
                lpad('DayOfMonth', 2, '0'),
            )
        ).alias('countDistinct')
    )
    .show()
)

+-------------+
|countDistinct|
+-------------+
|           31|
+-------------+



In [20]:
# Sample employee records, one tuple per employee. The fifth field (bonus)
# holds 10, None, '' and 10 respectively.
employees = [
    (
        1, "Scott", "Tiger", 1000.0, 10,
        "united states", "+1 123 456 7890", "123 45 6789",
    ),
    (
        2, "Henry", "Ford", 1250.0, None,
        "India", "+91 234 567 8901", "456 78 9123",
    ),
    (
        3, "Nick", "Junior", 750.0, '',
        "united KINGDOM", "+44 111 111 1111", "222 33 4444",
    ),
    (
        4, "Bill", "Gomes", 1500.0, 10,
        "AUSTRALIA", "+61 987 654 3210", "789 12 6118",
    ),
]

In [21]:
# Build the employees DataFrame from the in-memory list using an explicit
# DDL-style schema string. Note that bonus is declared as STRING, which is
# why the aggregation cells below cast it to int before doing arithmetic.
employeesDF = spark. \
    createDataFrame(employees,
                    schema="""employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, bonus STRING, nationality STRING,
                    phone_number STRING, ssn STRING"""
                   )

In [22]:
employeesDF.show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [23]:
from pyspark.sql.functions import coalesce, col, sum

In [24]:
# Total bonus = sum(bonus * salary) / 100. bonus is cast to int and any value
# that ends up null is replaced by 0 before multiplying.
(
    employeesDF
    .select(
        (
            sum(coalesce(col('bonus').cast('int'), lit(0)) * col('salary'))
            / lit(100)
        ).alias('total_bonus')
    )
    .show()
)

+-----------+
|total_bonus|
+-----------+
|      250.0|
+-----------+



In [25]:
# The total-bonus calculation again, written as a SQL expression string.
(
    employeesDF
    .selectExpr(
        'sum((coalesce(cast(bonus AS INT),0) * salary)/100) AS total_bonus'
    )
    .show()
)

+-----------+
|total_bonus|
+-----------+
|      250.0|
+-----------+



In [26]:
order_items = spark.read.json("/public/retail_db_json/order_items")

In [27]:
order_items.show(5)

+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_product_price|order_item_quantity|order_item_subtotal|
+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|            1|                  1|                  957|                  299.98|                  1|             299.98|
|            2|                  2|                 1073|                  199.99|                  1|             199.99|
|            3|                  2|                  502|                    50.0|                  5|              250.0|
|            4|                  2|                  403|                  129.99|                  1|             129.99|
|            5|                  4|                  897|                   24.99|                  2|              49.98|
+-------------+-

In [31]:
# Ask for the order to inspect and convert it to int immediately, so that a
# non-numeric entry fails here rather than inside one of the filter cells
# below. Those cells still call int(order_id), which is a no-op on an int.
order_id = int(input("Enter order_id"))

Enter order_id 2


In [32]:
from pyspark.sql.functions import sum, col

In [33]:
# Revenue of the chosen order: filter with a SQL condition string (order_id is
# forced through int() before being interpolated), then sum the subtotals.
(
    order_items
    .filter(f'order_item_order_id=={int(order_id)}')
    .select(sum("order_item_subtotal").alias('order_revenue'))
    .show()
)

+-------------+
|order_revenue|
+-------------+
|       579.98|
+-------------+



In [34]:
# Show the individual items of the chosen order (SQL-string filter).
(
    order_items
    .filter(f'order_item_order_id=={int(order_id)}')
    .show()
)

+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_product_price|order_item_quantity|order_item_subtotal|
+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|            2|                  2|                 1073|                  199.99|                  1|             199.99|
|            3|                  2|                  502|                    50.0|                  5|              250.0|
|            4|                  2|                  403|                  129.99|                  1|             129.99|
+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+



In [35]:
# Same item listing, with the condition built from Column objects.
(
    order_items
    .filter(col("order_item_order_id") == lit(int(order_id)))
    .show()
)

+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_product_price|order_item_quantity|order_item_subtotal|
+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|            2|                  2|                 1073|                  199.99|                  1|             199.99|
|            3|                  2|                  502|                    50.0|                  5|              250.0|
|            4|                  2|                  403|                  129.99|                  1|             129.99|
+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+



In [36]:
# Revenue of the chosen order, using Column expressions for filter and sum.
(
    order_items
    .filter(col("order_item_order_id") == lit(int(order_id)))
    .select(sum(col('order_item_subtotal')).alias('order_revenue'))
    .show()
)

+-------------+
|order_revenue|
+-------------+
|       579.98|
+-------------+



In [5]:
## 194 Aggregate data using groupBy from Spark Data Frames

In [1]:
from pyspark.sql import SparkSession
import getpass

# The OS user name is used for the warehouse directory and the app name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse/')
    .config('spark.shuffle.io.connectionTimeout', '6000')
    .config('spark.network.timeout', '6000')
    .config('spark.executor.heartbeatInterval', '20s')
    .config("spark.driver.memory", '4g')
    .config('spark.executor.memory', '4g')
    .enableHiveSupport()
    .appName(f'{username} | Python - Basic Transformations')
    .master('yarn')
    .getOrCreate()
)

In [2]:
airtraffic_path = '/public/airlines_all/airlines-part/flightmonth=200801'

In [3]:
from pyspark.sql.functions import col

In [4]:
airtraffic = spark.read.parquet(airtraffic_path)

In [5]:
# List the distinct combinations of the delay and cancellation flags.
(
    airtraffic
    .select('IsDepDelayed', 'IsArrDelayed', 'Cancelled')
    .distinct()
    .show()
)

+------------+------------+---------+
|IsDepDelayed|IsArrDelayed|Cancelled|
+------------+------------+---------+
|          NO|          NO|        0|
|         YES|         YES|        1|
|          NO|         YES|        0|
|         YES|          NO|        0|
|         YES|         YES|        0|
+------------+------------+---------+



In [6]:
from pyspark.sql.functions import concat, lpad

In [7]:
# Flights per day: group on a yyyyMMdd key built from Year plus zero-padded
# Month and DayOfMonth, count the rows, and show the first five dates.
(
    airtraffic
    .groupBy(
        concat(
            'Year',
            lpad('Month', 2, '0'),
            lpad('DayOfMonth', 2, '0'),
        ).alias('FlightDate')
    )
    .count()
    .sort('FlightDate')
    .show(5)
)

+----------+-----+
|FlightDate|count|
+----------+-----+
|  20080101|19175|
|  20080102|20953|
|  20080103|20937|
|  20080104|20929|
|  20080105|18066|
+----------+-----+
only showing top 5 rows



In [8]:
from pyspark.sql.functions import count, concat, lpad, lit, sum, avg

In [9]:
# Flights per day again, using agg() so the count column gets its own name.
# No sort is applied, so the displayed rows are in no particular order.
(
    airtraffic
    .groupBy(
        concat(
            'Year',
            lpad('Month', 2, '0'),
            lpad('DayOfMonth', 2, '0'),
        ).alias('FlightDate')
    )
    .agg(count(lit(1)).alias('FlightCount'))
    .show(5)
)

+----------+-----------+
|FlightDate|FlightCount|
+----------+-----------+
|  20080120|      18653|
|  20080130|      19766|
|  20080115|      19503|
|  20080118|      20347|
|  20080122|      19504|
+----------+-----------+
only showing top 5 rows



In [10]:
# Per-day departure-delay statistics for flights with Cancelled = 0:
# flight count, total delay and average delay.
(
    airtraffic
    .filter('Cancelled=0')
    .groupBy(
        concat(
            'Year',
            lpad('Month', 2, '0'),
            lpad('DayOfMonth', 2, '0'),
        ).alias('FlightDate')
    )
    .agg(
        count(lit(1)).alias('FlightCount'),
        sum('DepDelay').alias('TotalDepDelay'),
        avg('DepDelay').alias('AverageDepDelay'),
    )
    .show()
)

+----------+-----------+-------------+------------------+
|FlightDate|FlightCount|TotalDepDelay|   AverageDepDelay|
+----------+-----------+-------------+------------------+
|  20080120|      18406|     117460.0| 6.381614690861675|
|  20080130|      19072|     129345.0| 6.781931627516778|
|  20080115|      19204|      75096.0|3.9104353259737556|
|  20080118|      20117|     223738.0|11.121837252075359|
|  20080122|      18716|     303796.0| 16.23188715537508|
|  20080104|      20160|     277373.0|13.758581349206349|
|  20080125|      19787|     229850.0|11.616212664880983|
|  20080102|      20442|     452979.0|22.159230995010272|
|  20080105|      17610|     306068.0|17.380352072685973|
|  20080111|      19825|     190918.0|  9.63016393442623|
|  20080109|      19443|      89595.0| 4.608085172041352|
|  20080127|      18265|     365491.0|20.010457158499865|
|  20080101|      18623|     354108.0| 19.01455189819041|
|  20080128|      19493|     220046.0|11.288462525008978|
|  20080119|  

In [11]:
from pyspark.sql.functions import round

In [12]:
# Same per-day statistics, with the average delay rounded to 2 decimals.
(
    airtraffic
    .filter('Cancelled=0')
    .groupBy(
        concat(
            'Year',
            lpad('Month', 2, '0'),
            lpad('DayOfMonth', 2, '0'),
        ).alias('FlightDate')
    )
    .agg(
        count(lit(1)).alias('FlightCount'),
        sum('DepDelay').alias('TotalDepDelay'),
        round(avg('DepDelay'), 2).alias('AverageDepDelay'),
    )
    .show()
)

+----------+-----------+-------------+---------------+
|FlightDate|FlightCount|TotalDepDelay|AverageDepDelay|
+----------+-----------+-------------+---------------+
|  20080120|      18406|     117460.0|           6.38|
|  20080130|      19072|     129345.0|           6.78|
|  20080115|      19204|      75096.0|           3.91|
|  20080118|      20117|     223738.0|          11.12|
|  20080122|      18716|     303796.0|          16.23|
|  20080104|      20160|     277373.0|          13.76|
|  20080125|      19787|     229850.0|          11.62|
|  20080102|      20442|     452979.0|          22.16|
|  20080105|      17610|     306068.0|          17.38|
|  20080111|      19825|     190918.0|           9.63|
|  20080109|      19443|      89595.0|           4.61|
|  20080127|      18265|     365491.0|          20.01|
|  20080101|      18623|     354108.0|          19.01|
|  20080128|      19493|     220046.0|          11.29|
|  20080119|      15373|     155488.0|          10.11|
|  2008010

In [41]:
# Per-day statistics ordered by FlightDate ascending.
(
    airtraffic
    .filter('Cancelled=0')
    .groupBy(
        concat(
            'Year',
            lpad('Month', 2, '0'),
            lpad('DayOfMonth', 2, '0'),
        ).alias('FlightDate')
    )
    .agg(
        count(lit(1)).alias('FlightCount'),
        sum('DepDelay').alias('TotalDepDelay'),
        round(avg('DepDelay'), 2).alias('AverageDepDelay'),
    )
    .sort('FlightDate')
    .show()
)

+----------+-----------+-------------+---------------+
|FlightDate|FlightCount|TotalDepDelay|AverageDepDelay|
+----------+-----------+-------------+---------------+
|  20080101|      18623|     354108.0|          19.01|
|  20080102|      20442|     452979.0|          22.16|
|  20080103|      20462|     329690.0|          16.11|
|  20080104|      20160|     277373.0|          13.76|
|  20080105|      17610|     306068.0|          17.38|
|  20080106|      19210|     323214.0|          16.83|
|  20080107|      19762|     238431.0|          12.07|
|  20080108|      19140|     200670.0|          10.48|
|  20080109|      19443|      89595.0|           4.61|
|  20080110|      19956|     148603.0|           7.45|
|  20080111|      19825|     190918.0|           9.63|
|  20080112|      16346|      24876.0|           1.52|
|  20080113|      18587|     101753.0|           5.47|
|  20080114|      19267|      98261.0|            5.1|
|  20080115|      19204|      75096.0|           3.91|
|  2008011

In [48]:
# Per-day statistics ordered by FlightDate descending (latest date first).
(
    airtraffic
    .filter('Cancelled=0')
    .groupBy(
        concat(
            'Year',
            lpad('Month', 2, '0'),
            lpad('DayOfMonth', 2, '0'),
        ).alias('FlightDate')
    )
    .agg(
        count(lit(1)).alias('FlightCount'),
        sum('DepDelay').alias('TotalDepDelay'),
        round(avg('DepDelay'), 2).alias('AverageDepDelay'),
    )
    .sort(col('FlightDate').desc())
    .show()
)

+----------+-----------+-------------+---------------+
|FlightDate|FlightCount|TotalDepDelay|AverageDepDelay|
+----------+-----------+-------------+---------------+
|  20080131|      19179|     396280.0|          20.66|
|  20080130|      19072|     129345.0|           6.78|
|  20080129|      18596|     184855.0|           9.94|
|  20080128|      19493|     220046.0|          11.29|
|  20080127|      18265|     365491.0|          20.01|
|  20080126|      15860|      92129.0|           5.81|
|  20080125|      19787|     229850.0|          11.62|
|  20080124|      19935|     158134.0|           7.93|
|  20080123|      19239|     190807.0|           9.92|
|  20080122|      18716|     303796.0|          16.23|
|  20080121|      19658|     370196.0|          18.83|
|  20080120|      18406|     117460.0|           6.38|
|  20080119|      15373|     155488.0|          10.11|
|  20080118|      20117|     223738.0|          11.12|
|  20080117|      19401|     341271.0|          17.59|
|  2008011

In [13]:
order_items_path = '/public/retail_db_json/order_items'

In [14]:
order_items = spark.read.json(order_items_path)

In [15]:
    # Print the schema of the order_items DataFrame.
    # NOTE(review): this statement is indented although it is at the top level
    # of the cell. It ran here (the schema output follows), presumably because
    # IPython strips a cell's leading indent - dedent it before moving this
    # code into a plain .py script, where it would be an IndentationError.
    order_items.printSchema()

root
 |-- order_item_id: long (nullable = true)
 |-- order_item_order_id: long (nullable = true)
 |-- order_item_product_id: long (nullable = true)
 |-- order_item_product_price: double (nullable = true)
 |-- order_item_quantity: long (nullable = true)
 |-- order_item_subtotal: double (nullable = true)



In [16]:
order_items.count()

172198

In [17]:
# Sum of subtotals per order via the grouped sum() shortcut; the result
# column gets the generated name sum(order_item_subtotal).
(
    order_items
    .groupBy("order_item_order_id")
    .sum("order_item_subtotal")
    .show()
)

+-------------------+------------------------+
|order_item_order_id|sum(order_item_subtotal)|
+-------------------+------------------------+
|                 29|                 1109.85|
|                474|       774.8199999999999|
|                964|       739.8800000000001|
|               1677|       649.9200000000001|
|               1806|                  789.94|
|               1950|      1015.8700000000001|
|               2214|                  449.96|
|               2250|                  889.94|
|               2453|       999.9300000000001|
|               2509|                  889.94|
|               2529|                   59.99|
|               2927|       999.9100000000001|
|               3091|      469.93000000000006|
|               3764|                   95.98|
|               4590|                  949.83|
|               4894|                  899.94|
|               5385|                  629.86|
|               5409|       699.9200000000001|
|            

In [19]:
# Revenue per order via agg(), so the result column can be named explicitly.
(
    order_items
    .groupBy("order_item_order_id")
    .agg(sum("order_item_subtotal").alias("revenue_per_order"))
    .show()
)

+-------------------+------------------+
|order_item_order_id| revenue_per_order|
+-------------------+------------------+
|                 29|           1109.85|
|                474| 774.8199999999999|
|                964| 739.8800000000001|
|               1677| 649.9200000000001|
|               1806|            789.94|
|               1950|1015.8700000000001|
|               2214|            449.96|
|               2250|            889.94|
|               2453| 999.9300000000001|
|               2509|            889.94|
|               2529|             59.99|
|               2927| 999.9100000000001|
|               3091|469.93000000000006|
|               3764|             95.98|
|               4590|            949.83|
|               4894|            899.94|
|               5385|            629.86|
|               5409| 699.9200000000001|
|               6721|            139.99|
|               7225|            774.86|
+-------------------+------------------+
only showing top

In [20]:
# Revenue per order rounded to 2 decimals, which hides floating-point noise
# such as 774.8199999999999 seen in the unrounded output.
(
    order_items
    .groupBy("order_item_order_id")
    .agg(round(sum("order_item_subtotal"), 2).alias("revenue_per_order"))
    .show()
)

+-------------------+-----------------+
|order_item_order_id|revenue_per_order|
+-------------------+-----------------+
|                 29|          1109.85|
|                474|           774.82|
|                964|           739.88|
|               1677|           649.92|
|               1806|           789.94|
|               1950|          1015.87|
|               2214|           449.96|
|               2250|           889.94|
|               2453|           999.93|
|               2509|           889.94|
|               2529|            59.99|
|               2927|           999.91|
|               3091|           469.93|
|               3764|            95.98|
|               4590|           949.83|
|               4894|           899.94|
|               5385|           629.86|
|               5409|           699.92|
|               6721|           139.99|
|               7225|           774.86|
+-------------------+-----------------+
only showing top 20 rows



In [21]:
from pyspark.sql.functions import min,max

In [25]:
# Per order: rounded total revenue plus the smallest and largest item subtotal.
(
    order_items
    .groupBy('order_item_order_id')
    .agg(
        round(sum('order_item_subtotal'), 2).alias('revenue_per_order'),
        min('order_item_subtotal').alias('order_item_subtotal_min'),
        max('order_item_subtotal').alias('order_item_subtotal_max'),
    )
    .show()
)

+-------------------+-----------------+-----------------------+-----------------------+
|order_item_order_id|revenue_per_order|order_item_subtotal_min|order_item_subtotal_max|
+-------------------+-----------------+-----------------------+-----------------------+
|                 29|          1109.85|                 129.99|                 399.98|
|                474|           774.82|                  24.99|                 199.99|
|                964|           739.88|                 129.99|                 299.98|
|               1677|           649.92|                   50.0|                  250.0|
|               1806|           789.94|                  150.0|                 399.98|
|               1950|          1015.87|                  87.96|                 399.98|
|               2214|           449.96|                   50.0|                 399.96|
|               2250|           889.94|                  59.99|                 399.98|
|               2453|           

In [24]:
## 195 Aggregate data using rollup on Spark

In [1]:
from pyspark.sql import SparkSession
import getpass

# The OS user name is used for the warehouse directory and the app name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession on YARN with 6g of driver and
# executor memory.
spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir', f'/user/{username}/warehouse/'). \
        config('spark.shuffle.io.connectionTimeout','6000'). \
        config("spark.driver.memory", '6g'). \
        config('spark.executor.memory', '6g'). \
        enableHiveSupport(). \
        appName(f'{username} | Python - Basic Transformations'). \
        master('yarn'). \
        getOrCreate()

# Optional builder settings, currently disabled. To use them, move the lines
# back into the builder chain above, before getOrCreate():
#        config('spark.executor.heartbeatInterval','20s'). \
#         config('spark.network.timeout','6000'). \

In [2]:
airtraffic_path = '/public/airlines_all/airlines-part/flightmonth=200801'

In [3]:
from pyspark.sql.functions import col

In [4]:
airtraffic = spark.read.parquet(airtraffic_path)

In [5]:
# List the distinct combinations of the delay and cancellation flags.
(
    airtraffic
    .select('IsDepDelayed', 'IsArrDelayed', 'Cancelled')
    .distinct()
    .show()
)

+------------+------------+---------+
|IsDepDelayed|IsArrDelayed|Cancelled|
+------------+------------+---------+
|          NO|          NO|        0|
|         YES|         YES|        1|
|          NO|         YES|        0|
|         YES|          NO|        0|
|         YES|         YES|        0|
+------------+------------+---------+



In [6]:
order_items_path = '/public/retail_db_json/order_items'

In [7]:
order_items = spark.read.json(order_items_path)

In [8]:
order_items.count()

172198

In [9]:
order_items.printSchema()

root
 |-- order_item_id: long (nullable = true)
 |-- order_item_order_id: long (nullable = true)
 |-- order_item_product_id: long (nullable = true)
 |-- order_item_product_price: double (nullable = true)
 |-- order_item_quantity: long (nullable = true)
 |-- order_item_subtotal: double (nullable = true)



In [10]:
orders = spark.read.json('/public/retail_db_json/orders')

In [11]:
orders.printSchema()

root
 |-- order_customer_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [12]:
orders.count()

68883

In [13]:
# Orders per order_date using the grouped count() shortcut.
(
    orders
    .groupBy('order_date')
    .count()
    .show(5)
)

+--------------------+-----+
|          order_date|count|
+--------------------+-----+
|2013-08-13 00:00:...|   73|
|2013-10-12 00:00:...|  162|
|2013-11-15 00:00:...|  135|
|2014-03-19 00:00:...|  130|
|2014-04-26 00:00:...|  251|
+--------------------+-----+
only showing top 5 rows



In [14]:
from pyspark.sql.functions import count, lit

# Orders per order_date via agg(), naming the count column order_count.
(
    orders
    .groupBy('order_date')
    .agg(count(lit(1)).alias('order_count'))
    .show()
)

+--------------------+-----------+
|          order_date|order_count|
+--------------------+-----------+
|2013-08-13 00:00:...|         73|
|2013-10-12 00:00:...|        162|
|2013-11-15 00:00:...|        135|
|2014-03-19 00:00:...|        130|
|2014-04-26 00:00:...|        251|
|2013-09-16 00:00:...|        121|
|2013-09-20 00:00:...|        139|
|2013-12-31 00:00:...|        266|
|2013-09-06 00:00:...|        276|
|2014-06-15 00:00:...|        128|
|2013-12-24 00:00:...|        170|
|2014-01-07 00:00:...|        163|
|2014-06-07 00:00:...|        191|
|2013-10-14 00:00:...|        139|
|2013-11-11 00:00:...|        246|
|2014-01-27 00:00:...|        163|
|2014-01-29 00:00:...|        158|
|2014-02-14 00:00:...|        174|
|2014-04-15 00:00:...|        180|
|2014-04-22 00:00:...|        144|
+--------------------+-----------+
only showing top 20 rows



In [15]:
# Number of groups, i.e. how many distinct order_date values exist.
(
    orders
    .groupBy('order_date')
    .agg(count(lit(1)).alias('order_count'))
    .count()
)

364

In [16]:
orders.rollup?

Signature: orders.rollup(*cols)
Docstring:
Create a multi-dimensional rollup for the current :class:`DataFrame` using
the specified columns, so we can run aggregation on them.

.. versionadded:: 1.4.0

Examples
--------
>>> df.rollup("name", df.age).count().orderBy("name", "age").show()
+-----+----+-----+
| name| age|count|
+-----+----+-----+
| null|null|    2|
|Alice|null|    1|
|Alice|   2|    1|
|  Bob|null|    1|
|  Bob|   5|    1|
+-----+----+-----+
File:      /opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/sql/dataframe.py
Type:      method


In [17]:
# rollup on order_date: the per-date counts plus one extra row with a null
# order_date that carries the grand total.
(
    orders
    .rollup('order_date')
    .agg(count(lit(1)).alias('order_count'))
    .orderBy('order_date')
    .show()
)

+--------------------+-----------+
|          order_date|order_count|
+--------------------+-----------+
|                null|      68883|
|2013-07-25 00:00:...|        143|
|2013-07-26 00:00:...|        269|
|2013-07-27 00:00:...|        202|
|2013-07-28 00:00:...|        187|
|2013-07-29 00:00:...|        253|
|2013-07-30 00:00:...|        227|
|2013-07-31 00:00:...|        252|
|2013-08-01 00:00:...|        246|
|2013-08-02 00:00:...|        224|
|2013-08-03 00:00:...|        183|
|2013-08-04 00:00:...|        187|
|2013-08-05 00:00:...|        153|
|2013-08-06 00:00:...|        258|
|2013-08-07 00:00:...|        203|
|2013-08-08 00:00:...|        154|
|2013-08-09 00:00:...|        125|
|2013-08-10 00:00:...|        270|
|2013-08-11 00:00:...|        154|
|2013-08-12 00:00:...|        255|
+--------------------+-----------+
only showing top 20 rows



In [18]:
# Row count of the rollup result: one row per order_date plus the grand-total
# row. The orderBy used for display is omitted here because sorting cannot
# change the number of rows.
(
    orders
    .rollup('order_date')
    .agg(count(lit(1)).alias('order_count'))
    .count()
)

365

In [19]:
from pyspark.sql.functions import date_format

In [20]:
# Group by the yyyyMM month key and the full order_date, then count.
(
    orders
    .groupBy(
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .count()
    .show(5)
)

+-----------+--------------------+-----+
|order_month|          order_date|count|
+-----------+--------------------+-----+
|     201308|2013-08-18 00:00:...|  199|
|     201402|2014-02-02 00:00:...|  192|
|     201406|2014-06-14 00:00:...|  255|
|     201308|2013-08-19 00:00:...|   93|
|     201403|2014-03-22 00:00:...|  223|
+-----------+--------------------+-----+
only showing top 5 rows



In [21]:
from pyspark.sql.functions import count

In [22]:
# Same grouping via agg(), naming the count column order_count.
(
    orders
    .groupBy(
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .show(5)
)

+-----------+--------------------+-----------+
|order_month|          order_date|order_count|
+-----------+--------------------+-----------+
|     201308|2013-08-18 00:00:...|        199|
|     201402|2014-02-02 00:00:...|        192|
|     201406|2014-06-14 00:00:...|        255|
|     201308|2013-08-19 00:00:...|         93|
|     201403|2014-03-22 00:00:...|        223|
+-----------+--------------------+-----------+
only showing top 5 rows



In [23]:
# Number of (order_month, order_date) groups.
(
    orders
    .groupBy(
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .count()
)

364

In [24]:
# rollup on (order_month, order_date): per-date rows, a subtotal row per month
# (null order_date) and a grand-total row (both columns null).
(
    orders
    .rollup(
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .orderBy('order_month', 'order_date')
    .show(25)
)

+-----------+--------------------+-----------+
|order_month|          order_date|order_count|
+-----------+--------------------+-----------+
|       null|                null|      68883|
|     201307|                null|       1533|
|     201307|2013-07-25 00:00:...|        143|
|     201307|2013-07-26 00:00:...|        269|
|     201307|2013-07-27 00:00:...|        202|
|     201307|2013-07-28 00:00:...|        187|
|     201307|2013-07-29 00:00:...|        253|
|     201307|2013-07-30 00:00:...|        227|
|     201307|2013-07-31 00:00:...|        252|
|     201308|                null|       5680|
|     201308|2013-08-01 00:00:...|        246|
|     201308|2013-08-02 00:00:...|        224|
|     201308|2013-08-03 00:00:...|        183|
|     201308|2013-08-04 00:00:...|        187|
|     201308|2013-08-05 00:00:...|        153|
|     201308|2013-08-06 00:00:...|        258|
|     201308|2013-08-07 00:00:...|        203|
|     201308|2013-08-08 00:00:...|        154|
|     201308|

In [25]:
# Row count of the (order_month, order_date) rollup: date rows plus monthly
# subtotal rows plus the grand-total row. The orderBy used for display is
# omitted because sorting cannot change the number of rows.
(
    orders
    .rollup(
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .count()
)

378

In [26]:
from pyspark.sql.functions import year

In [27]:
# Plain groupBy on year, yyyyMM month key and order_date, sorted for display.
(
    orders
    .groupBy(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .orderBy('order_year', 'order_month', 'order_date')
    .show()
)

+----------+-----------+--------------------+-----------+
|order_year|order_month|          order_date|order_count|
+----------+-----------+--------------------+-----------+
|      2013|     201307|2013-07-25 00:00:...|        143|
|      2013|     201307|2013-07-26 00:00:...|        269|
|      2013|     201307|2013-07-27 00:00:...|        202|
|      2013|     201307|2013-07-28 00:00:...|        187|
|      2013|     201307|2013-07-29 00:00:...|        253|
|      2013|     201307|2013-07-30 00:00:...|        227|
|      2013|     201307|2013-07-31 00:00:...|        252|
|      2013|     201308|2013-08-01 00:00:...|        246|
|      2013|     201308|2013-08-02 00:00:...|        224|
|      2013|     201308|2013-08-03 00:00:...|        183|
|      2013|     201308|2013-08-04 00:00:...|        187|
|      2013|     201308|2013-08-05 00:00:...|        153|
|      2013|     201308|2013-08-06 00:00:...|        258|
|      2013|     201308|2013-08-07 00:00:...|        203|
|      2013|  

In [28]:
# Number of (order_year, order_month, order_date) groups. The orderBy used for
# display is omitted because sorting cannot change the number of rows.
(
    orders
    .groupBy(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .count()
)

364

In [29]:
# Three-level rollup: per-date rows, monthly subtotals (null order_date),
# yearly subtotals (null order_month and order_date) and the grand total.
(
    orders
    .rollup(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .orderBy('order_year', 'order_month', 'order_date')
    .show()
)

+----------+-----------+--------------------+-----------+
|order_year|order_month|          order_date|order_count|
+----------+-----------+--------------------+-----------+
|      null|       null|                null|      68883|
|      2013|       null|                null|      30662|
|      2013|     201307|                null|       1533|
|      2013|     201307|2013-07-25 00:00:...|        143|
|      2013|     201307|2013-07-26 00:00:...|        269|
|      2013|     201307|2013-07-27 00:00:...|        202|
|      2013|     201307|2013-07-28 00:00:...|        187|
|      2013|     201307|2013-07-29 00:00:...|        253|
|      2013|     201307|2013-07-30 00:00:...|        227|
|      2013|     201307|2013-07-31 00:00:...|        252|
|      2013|     201308|                null|       5680|
|      2013|     201308|2013-08-01 00:00:...|        246|
|      2013|     201308|2013-08-02 00:00:...|        224|
|      2013|     201308|2013-08-03 00:00:...|        183|
|      2013|  

In [30]:
# Row count of the three-level rollup (date rows + monthly, yearly and grand
# totals). The orderBy used for display is omitted because sorting cannot
# change the number of rows.
(
    orders
    .rollup(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .count()
)

380

In [34]:
# Rollup rows for January 2014 only: the monthly subtotal row followed by the
# per-date rows. order_month is produced by date_format and is therefore a
# string, so it is compared against the string literal '201401' instead of
# relying on an implicit string-to-number cast.
(
    orders
    .rollup(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .filter("order_month = '201401'")
    .orderBy('order_year', 'order_month', 'order_date')
    .show(32)
)

+----------+-----------+--------------------+-----------+
|order_year|order_month|          order_date|order_count|
+----------+-----------+--------------------+-----------+
|      2014|     201401|                null|       5908|
|      2014|     201401|2014-01-01 00:00:...|        135|
|      2014|     201401|2014-01-02 00:00:...|        111|
|      2014|     201401|2014-01-03 00:00:...|        250|
|      2014|     201401|2014-01-04 00:00:...|        129|
|      2014|     201401|2014-01-05 00:00:...|        266|
|      2014|     201401|2014-01-06 00:00:...|        155|
|      2014|     201401|2014-01-07 00:00:...|        163|
|      2014|     201401|2014-01-08 00:00:...|        122|
|      2014|     201401|2014-01-09 00:00:...|        207|
|      2014|     201401|2014-01-10 00:00:...|        241|
|      2014|     201401|2014-01-11 00:00:...|        281|
|      2014|     201401|2014-01-12 00:00:...|        215|
|      2014|     201401|2014-01-13 00:00:...|        179|
|      2014|  

In [33]:
# Number of rollup rows for January 2014 (monthly subtotal + per-date rows).
# order_month is a string produced by date_format, so it is compared against
# the string literal '201401' rather than an integer. The orderBy used for
# display is omitted because sorting cannot change the number of rows.
(
    orders
    .rollup(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .filter("order_month = '201401'")
    .count()
)

32

In [35]:
## 196 Aggregate data using cube on Spark DFs

In [36]:
from pyspark.sql import SparkSession
import getpass

# Current OS user; used for the per-user warehouse directory and the app name.
username = getpass.getuser()

# Create the Hive-enabled session on YARN, or fetch the already-running one
# (getOrCreate), using the settings listed below.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse/')
    .config('spark.shuffle.io.connectionTimeout', '6000')
    .config("spark.driver.memory", '6g')
    .config('spark.executor.memory', '6g')
    .enableHiveSupport()
    .appName(f'{username} | Python - Basic Transformations')
    .master('yarn')
    .getOrCreate()
)

#        config('spark.executor.heartbeatInterval','20s'). \
#         config('spark.network.timeout','6000'). \

In [37]:
# HDFS partition directory with the air-traffic data for flightmonth=200801.
airtraffic_path = '/public/airlines_all/airlines-part/flightmonth=200801'

In [38]:
from pyspark.sql.functions import col

In [39]:
# Load the parquet data under airtraffic_path into a DataFrame.
airtraffic = spark.read.parquet(airtraffic_path)

In [40]:
# List the distinct combinations of the delay/cancellation flags present
# in the data.
(
    airtraffic
    .select('IsDepDelayed', 'IsArrDelayed', 'Cancelled')
    .distinct()
    .show()
)

+------------+------------+---------+
|IsDepDelayed|IsArrDelayed|Cancelled|
+------------+------------+---------+
|          NO|          NO|        0|
|         YES|         YES|        1|
|          NO|         YES|        0|
|         YES|          NO|        0|
|         YES|         YES|        0|
+------------+------------+---------+



In [41]:
# Read the retail orders from JSON; no explicit schema is supplied here.
orders = spark.read.json('/public/retail_db_json/orders')

In [44]:
# Baseline record count; the grand-total row of the cube/rollup results
# further down should match this number.
orders.count()

68883

In [45]:
orders.show(5)

+-----------------+--------------------+--------+---------------+
|order_customer_id|          order_date|order_id|   order_status|
+-----------------+--------------------+--------+---------------+
|            11599|2013-07-25 00:00:...|       1|         CLOSED|
|              256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|            12111|2013-07-25 00:00:...|       3|       COMPLETE|
|             8827|2013-07-25 00:00:...|       4|         CLOSED|
|            11318|2013-07-25 00:00:...|       5|       COMPLETE|
+-----------------+--------------------+--------+---------------+
only showing top 5 rows



In [46]:
# Inspect the column types; note that order_date is loaded as a string,
# so the date functions used below operate on a string column.
orders.printSchema()

root
 |-- order_customer_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [None]:
# Aggregate helpers used by the groupBy/cube cells that follow.
# (The previous `import order` referenced a name that does not exist in
# pyspark.sql.functions and raised ImportError.)
from pyspark.sql.functions import count, lit

In [47]:
# Plain groupBy: one row per order_date with its order count (no subtotals).
(
    orders
    .groupBy('order_date')
    .agg(count(lit(1)).alias('order_count'))
    .show()
)

+--------------------+-----------+
|          order_date|order_count|
+--------------------+-----------+
|2013-08-13 00:00:...|         73|
|2013-10-12 00:00:...|        162|
|2013-11-15 00:00:...|        135|
|2014-03-19 00:00:...|        130|
|2014-04-26 00:00:...|        251|
|2013-09-16 00:00:...|        121|
|2013-09-20 00:00:...|        139|
|2013-12-31 00:00:...|        266|
|2013-09-06 00:00:...|        276|
|2014-06-15 00:00:...|        128|
|2013-12-24 00:00:...|        170|
|2014-01-07 00:00:...|        163|
|2014-06-07 00:00:...|        191|
|2013-10-14 00:00:...|        139|
|2013-11-11 00:00:...|        246|
|2014-01-27 00:00:...|        163|
|2014-01-29 00:00:...|        158|
|2014-02-14 00:00:...|        174|
|2014-04-15 00:00:...|        180|
|2014-04-22 00:00:...|        144|
+--------------------+-----------+
only showing top 20 rows



In [48]:
# Number of distinct order dates (rows produced by the plain groupBy).
(
    orders
    .groupBy('order_date')
    .agg(count(lit(1)).alias('order_count'))
    .count()
)

364

In [50]:
orders.cube?

[0;31mSignature:[0m [0morders[0m[0;34m.[0m[0mcube[0m[0;34m([0m[0;34m*[0m[0mcols[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Create a multi-dimensional cube for the current :class:`DataFrame` using
the specified columns, so we can run aggregations on them.

.. versionadded:: 1.4.0

Examples
--------
>>> df.cube("name", df.age).count().orderBy("name", "age").show()
+-----+----+-----+
| name| age|count|
+-----+----+-----+
| null|null|    2|
| null|   2|    1|
| null|   5|    1|
|Alice|null|    1|
|Alice|   2|    1|
|  Bob|null|    1|
|  Bob|   5|    1|
+-----+----+-----+
[0;31mFile:[0m      /opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/sql/dataframe.py
[0;31mType:[0m      method


In [53]:
# cube on a single column: the per-date counts plus one grand-total row
# (order_date = null), which sorts first in ascending order.
(
    orders
    .cube('order_date')
    .agg(count(lit(1)).alias('order_count'))
    .orderBy('order_date')
    .show()
)

+--------------------+-----------+
|          order_date|order_count|
+--------------------+-----------+
|                null|      68883|
|2013-07-25 00:00:...|        143|
|2013-07-26 00:00:...|        269|
|2013-07-27 00:00:...|        202|
|2013-07-28 00:00:...|        187|
|2013-07-29 00:00:...|        253|
|2013-07-30 00:00:...|        227|
|2013-07-31 00:00:...|        252|
|2013-08-01 00:00:...|        246|
|2013-08-02 00:00:...|        224|
|2013-08-03 00:00:...|        183|
|2013-08-04 00:00:...|        187|
|2013-08-05 00:00:...|        153|
|2013-08-06 00:00:...|        258|
|2013-08-07 00:00:...|        203|
|2013-08-08 00:00:...|        154|
|2013-08-09 00:00:...|        125|
|2013-08-10 00:00:...|        270|
|2013-08-11 00:00:...|        154|
|2013-08-12 00:00:...|        255|
+--------------------+-----------+
only showing top 20 rows



In [54]:
# Row count of the single-column cube: one more than the plain groupBy,
# the extra row being the grand total.
(
    orders
    .cube('order_date')
    .agg(count(lit(1)).alias('order_count'))
    .orderBy('order_date')
    .count()
)

365

In [55]:
from pyspark.sql.functions import date_format, count, lit

In [56]:
# groupBy on (order_month, order_date): order_month is derived from
# order_date in yyyyMM form, so this still yields one row per date.
(
    orders
    .groupBy(
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .show()
)

+-----------+--------------------+-----------+
|order_month|          order_date|order_count|
+-----------+--------------------+-----------+
|     201308|2013-08-18 00:00:...|        199|
|     201402|2014-02-02 00:00:...|        192|
|     201406|2014-06-14 00:00:...|        255|
|     201308|2013-08-19 00:00:...|         93|
|     201403|2014-03-22 00:00:...|        223|
|     201405|2014-05-06 00:00:...|        265|
|     201311|2013-11-23 00:00:...|        251|
|     201404|2014-04-19 00:00:...|        116|
|     201401|2014-01-20 00:00:...|        203|
|     201401|2014-01-27 00:00:...|        163|
|     201402|2014-02-18 00:00:...|        219|
|     201406|2014-06-18 00:00:...|        179|
|     201405|2014-05-13 00:00:...|        201|
|     201406|2014-06-17 00:00:...|        142|
|     201308|2013-08-16 00:00:...|        131|
|     201311|2013-11-08 00:00:...|        170|
|     201311|2013-11-19 00:00:...|        188|
|     201311|2013-11-25 00:00:...|        133|
|     201308|

In [57]:
# cube on (order_month, order_date): besides the detail rows it emits rows
# for every combination of nulled-out grouping columns, including per-date
# totals with order_month = null and the overall grand total.
(
    orders
    .cube(
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .orderBy('order_month', 'order_date')
    .show()
)

+-----------+--------------------+-----------+
|order_month|          order_date|order_count|
+-----------+--------------------+-----------+
|       null|                null|      68883|
|       null|2013-07-25 00:00:...|        143|
|       null|2013-07-26 00:00:...|        269|
|       null|2013-07-27 00:00:...|        202|
|       null|2013-07-28 00:00:...|        187|
|       null|2013-07-29 00:00:...|        253|
|       null|2013-07-30 00:00:...|        227|
|       null|2013-07-31 00:00:...|        252|
|       null|2013-08-01 00:00:...|        246|
|       null|2013-08-02 00:00:...|        224|
|       null|2013-08-03 00:00:...|        183|
|       null|2013-08-04 00:00:...|        187|
|       null|2013-08-05 00:00:...|        153|
|       null|2013-08-06 00:00:...|        258|
|       null|2013-08-07 00:00:...|        203|
|       null|2013-08-08 00:00:...|        154|
|       null|2013-08-09 00:00:...|        125|
|       null|2013-08-10 00:00:...|        270|
|       null|

In [58]:
# Total number of rows produced by the (order_month, order_date) cube.
(
    orders
    .cube(
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .orderBy('order_month', 'order_date')
    .count()
)

742

In [64]:
# Drop the cube rows where order_month was nulled out; what remains is the
# per-date rows plus one subtotal row per month (order_date = null).
(
    orders
    .cube(
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .filter('order_month IS NOT NULL')
    .orderBy('order_month', 'order_date')
    .show()
)

+-----------+--------------------+-----------+
|order_month|          order_date|order_count|
+-----------+--------------------+-----------+
|     201307|                null|       1533|
|     201307|2013-07-25 00:00:...|        143|
|     201307|2013-07-26 00:00:...|        269|
|     201307|2013-07-27 00:00:...|        202|
|     201307|2013-07-28 00:00:...|        187|
|     201307|2013-07-29 00:00:...|        253|
|     201307|2013-07-30 00:00:...|        227|
|     201307|2013-07-31 00:00:...|        252|
|     201308|                null|       5680|
|     201308|2013-08-01 00:00:...|        246|
|     201308|2013-08-02 00:00:...|        224|
|     201308|2013-08-03 00:00:...|        183|
|     201308|2013-08-04 00:00:...|        187|
|     201308|2013-08-05 00:00:...|        153|
|     201308|2013-08-06 00:00:...|        258|
|     201308|2013-08-07 00:00:...|        203|
|     201308|2013-08-08 00:00:...|        154|
|     201308|2013-08-09 00:00:...|        125|
|     201308|

In [63]:
# Row count of the (order_month, order_date) cube once rows with a null
# order_month are removed.
(
    orders
    .cube(
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .filter('order_month IS NOT NULL')
    .orderBy('order_month', 'order_date')
    .count()
)

377

In [65]:
from pyspark.sql.functions import date_format, count, lit, year

In [66]:
# groupBy on (order_year, order_month, order_date): year and month are both
# derived from order_date, so the result is still one row per date.
(
    orders
    .groupBy(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .show()
)

+----------+-----------+--------------------+-----------+
|order_year|order_month|          order_date|order_count|
+----------+-----------+--------------------+-----------+
|      2013|     201309|2013-09-13 00:00:...|        103|
|      2014|     201404|2014-04-08 00:00:...|        233|
|      2013|     201312|2013-12-06 00:00:...|        256|
|      2013|     201310|2013-10-13 00:00:...|        277|
|      2013|     201309|2013-09-01 00:00:...|        119|
|      2013|     201310|2013-10-06 00:00:...|        249|
|      2014|     201401|2014-01-14 00:00:...|        209|
|      2014|     201401|2014-01-16 00:00:...|        194|
|      2014|     201403|2014-03-18 00:00:...|        252|
|      2014|     201403|2014-03-31 00:00:...|        263|
|      2014|     201404|2014-04-28 00:00:...|        125|
|      2013|     201309|2013-09-23 00:00:...|        177|
|      2013|     201312|2013-12-23 00:00:...|        154|
|      2014|     201402|2014-02-09 00:00:...|        240|
|      2014|  

In [67]:
# Row count of the three-column groupBy (same as the number of dates).
(
    orders
    .groupBy(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .count()
)

364

In [69]:
# rollup for comparison with cube: subtotals follow the hierarchy only
# (grand total, per year, per year+month, then the per-date detail rows).
(
    orders
    .rollup(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .orderBy('order_year', 'order_month', 'order_date')
    .show()
)

+----------+-----------+--------------------+-----------+
|order_year|order_month|          order_date|order_count|
+----------+-----------+--------------------+-----------+
|      null|       null|                null|      68883|
|      2013|       null|                null|      30662|
|      2013|     201307|                null|       1533|
|      2013|     201307|2013-07-25 00:00:...|        143|
|      2013|     201307|2013-07-26 00:00:...|        269|
|      2013|     201307|2013-07-27 00:00:...|        202|
|      2013|     201307|2013-07-28 00:00:...|        187|
|      2013|     201307|2013-07-29 00:00:...|        253|
|      2013|     201307|2013-07-30 00:00:...|        227|
|      2013|     201307|2013-07-31 00:00:...|        252|
|      2013|     201308|                null|       5680|
|      2013|     201308|2013-08-01 00:00:...|        246|
|      2013|     201308|2013-08-02 00:00:...|        224|
|      2013|     201308|2013-08-03 00:00:...|        183|
|      2013|  

In [70]:
# Total number of rows produced by the year -> month -> date rollup.
(
    orders
    .rollup(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .orderBy('order_year', 'order_month', 'order_date')
    .count()
)

380

In [71]:
# cube on the same three columns: unlike rollup, it also produces rows such
# as (null, null, date), i.e. every combination of nulled-out columns.
(
    orders
    .cube(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .orderBy('order_year', 'order_month', 'order_date')
    .show()
)

+----------+-----------+--------------------+-----------+
|order_year|order_month|          order_date|order_count|
+----------+-----------+--------------------+-----------+
|      null|       null|                null|      68883|
|      null|       null|2013-07-25 00:00:...|        143|
|      null|       null|2013-07-26 00:00:...|        269|
|      null|       null|2013-07-27 00:00:...|        202|
|      null|       null|2013-07-28 00:00:...|        187|
|      null|       null|2013-07-29 00:00:...|        253|
|      null|       null|2013-07-30 00:00:...|        227|
|      null|       null|2013-07-31 00:00:...|        252|
|      null|       null|2013-08-01 00:00:...|        246|
|      null|       null|2013-08-02 00:00:...|        224|
|      null|       null|2013-08-03 00:00:...|        183|
|      null|       null|2013-08-04 00:00:...|        187|
|      null|       null|2013-08-05 00:00:...|        153|
|      null|       null|2013-08-06 00:00:...|        258|
|      null|  

In [73]:
# Total number of rows produced by the three-column cube (considerably more
# than the rollup over the same columns).
(
    orders
    .cube(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .orderBy('order_year', 'order_month', 'order_date')
    .count()
)

1485

In [76]:
# Keep only cube rows that carry a year; rows with order_month = null
# (year-level totals and year+date rows) are still present.
(
    orders
    .cube(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .filter('order_year IS NOT NULL')
    .orderBy('order_year', 'order_month', 'order_date')
    .show()
)

+----------+-----------+--------------------+-----------+
|order_year|order_month|          order_date|order_count|
+----------+-----------+--------------------+-----------+
|      2013|       null|                null|      30662|
|      2013|       null|2013-07-25 00:00:...|        143|
|      2013|       null|2013-07-26 00:00:...|        269|
|      2013|       null|2013-07-27 00:00:...|        202|
|      2013|       null|2013-07-28 00:00:...|        187|
|      2013|       null|2013-07-29 00:00:...|        253|
|      2013|       null|2013-07-30 00:00:...|        227|
|      2013|       null|2013-07-31 00:00:...|        252|
|      2013|       null|2013-08-01 00:00:...|        246|
|      2013|       null|2013-08-02 00:00:...|        224|
|      2013|       null|2013-08-03 00:00:...|        183|
|      2013|       null|2013-08-04 00:00:...|        187|
|      2013|       null|2013-08-05 00:00:...|        153|
|      2013|       null|2013-08-06 00:00:...|        258|
|      2013|  

In [79]:
# Row count of the three-column cube restricted to rows with a year.
(
    orders
    .cube(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .filter('order_year IS NOT NULL')
    .orderBy('order_year', 'order_month', 'order_date')
    .count()
)

743

In [77]:
# Require both year and month: this leaves the per-date rows and the monthly
# subtotals (order_date = null) only.
(
    orders
    .cube(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .filter('order_year IS NOT NULL AND order_month IS NOT NULL')
    .orderBy('order_year', 'order_month', 'order_date')
    .show()
)

+----------+-----------+--------------------+-----------+
|order_year|order_month|          order_date|order_count|
+----------+-----------+--------------------+-----------+
|      2013|     201307|                null|       1533|
|      2013|     201307|2013-07-25 00:00:...|        143|
|      2013|     201307|2013-07-26 00:00:...|        269|
|      2013|     201307|2013-07-27 00:00:...|        202|
|      2013|     201307|2013-07-28 00:00:...|        187|
|      2013|     201307|2013-07-29 00:00:...|        253|
|      2013|     201307|2013-07-30 00:00:...|        227|
|      2013|     201307|2013-07-31 00:00:...|        252|
|      2013|     201308|                null|       5680|
|      2013|     201308|2013-08-01 00:00:...|        246|
|      2013|     201308|2013-08-02 00:00:...|        224|
|      2013|     201308|2013-08-03 00:00:...|        183|
|      2013|     201308|2013-08-04 00:00:...|        187|
|      2013|     201308|2013-08-05 00:00:...|        153|
|      2013|  

In [81]:
# Same query as the preceding cell, executed again: cube rows that have both
# a year and a month (per-date rows plus monthly subtotals).
(
    orders
    .cube(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .filter('order_year IS NOT NULL AND order_month IS NOT NULL')
    .orderBy('order_year', 'order_month', 'order_date')
    .show()
)

+----------+-----------+--------------------+-----------+
|order_year|order_month|          order_date|order_count|
+----------+-----------+--------------------+-----------+
|      2013|     201307|                null|       1533|
|      2013|     201307|2013-07-25 00:00:...|        143|
|      2013|     201307|2013-07-26 00:00:...|        269|
|      2013|     201307|2013-07-27 00:00:...|        202|
|      2013|     201307|2013-07-28 00:00:...|        187|
|      2013|     201307|2013-07-29 00:00:...|        253|
|      2013|     201307|2013-07-30 00:00:...|        227|
|      2013|     201307|2013-07-31 00:00:...|        252|
|      2013|     201308|                null|       5680|
|      2013|     201308|2013-08-01 00:00:...|        246|
|      2013|     201308|2013-08-02 00:00:...|        224|
|      2013|     201308|2013-08-03 00:00:...|        183|
|      2013|     201308|2013-08-04 00:00:...|        187|
|      2013|     201308|2013-08-05 00:00:...|        153|
|      2013|  

In [82]:
# Row count of the cube rows that have both a year and a month.
(
    orders
    .cube(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .filter('order_year IS NOT NULL AND order_month IS NOT NULL')
    .orderBy('order_year', 'order_month', 'order_date')
    .count()
)

377

In [83]:
# Filter the cube on the date string alone. Each January 2014 date appears
# several times because rows with a nulled-out year and/or month also match.
(
    orders
    .cube(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .filter("order_date LIKE '2014-01%'")
    .orderBy('order_year', 'order_month', 'order_date')
    .count()
)

124

In [84]:
# Show the January 2014 cube rows selected by date only; the first rows are
# the ones where both order_year and order_month are null.
(
    orders
    .cube(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .filter("order_date LIKE '2014-01%'")
    .orderBy('order_year', 'order_month', 'order_date')
    .show()
)

+----------+-----------+--------------------+-----------+
|order_year|order_month|          order_date|order_count|
+----------+-----------+--------------------+-----------+
|      null|       null|2014-01-01 00:00:...|        135|
|      null|       null|2014-01-02 00:00:...|        111|
|      null|       null|2014-01-03 00:00:...|        250|
|      null|       null|2014-01-04 00:00:...|        129|
|      null|       null|2014-01-05 00:00:...|        266|
|      null|       null|2014-01-06 00:00:...|        155|
|      null|       null|2014-01-07 00:00:...|        163|
|      null|       null|2014-01-08 00:00:...|        122|
|      null|       null|2014-01-09 00:00:...|        207|
|      null|       null|2014-01-10 00:00:...|        241|
|      null|       null|2014-01-11 00:00:...|        281|
|      null|       null|2014-01-12 00:00:...|        215|
|      null|       null|2014-01-13 00:00:...|        179|
|      null|       null|2014-01-14 00:00:...|        209|
|      null|  

In [85]:
# Combine the null checks with the date filter so that only the fully
# populated detail rows for January 2014 remain (one row per date).
(
    orders
    .cube(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .filter("order_year IS NOT NULL AND order_month IS NOT NULL AND order_date LIKE '2014-01%'")
    .orderBy('order_year', 'order_month', 'order_date')
    .show()
)

+----------+-----------+--------------------+-----------+
|order_year|order_month|          order_date|order_count|
+----------+-----------+--------------------+-----------+
|      2014|     201401|2014-01-01 00:00:...|        135|
|      2014|     201401|2014-01-02 00:00:...|        111|
|      2014|     201401|2014-01-03 00:00:...|        250|
|      2014|     201401|2014-01-04 00:00:...|        129|
|      2014|     201401|2014-01-05 00:00:...|        266|
|      2014|     201401|2014-01-06 00:00:...|        155|
|      2014|     201401|2014-01-07 00:00:...|        163|
|      2014|     201401|2014-01-08 00:00:...|        122|
|      2014|     201401|2014-01-09 00:00:...|        207|
|      2014|     201401|2014-01-10 00:00:...|        241|
|      2014|     201401|2014-01-11 00:00:...|        281|
|      2014|     201401|2014-01-12 00:00:...|        215|
|      2014|     201401|2014-01-13 00:00:...|        179|
|      2014|     201401|2014-01-14 00:00:...|        209|
|      2014|  

In [86]:
# Number of fully populated detail rows for January 2014.
(
    orders
    .cube(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .filter("order_year IS NOT NULL AND order_month IS NOT NULL AND order_date LIKE '2014-01%'")
    .orderBy('order_year', 'order_month', 'order_date')
    .count()
)

31

In [87]:
## 197 Overview of Sorting Spark Data Frames

In [1]:
from pyspark.sql import SparkSession
import getpass

# Current OS user; used for the per-user warehouse directory and the app name.
username = getpass.getuser()

# Create the Hive-enabled session on YARN, or fetch the already-running one
# (getOrCreate), using the settings listed below.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse/')
    .config('spark.shuffle.io.connectionTimeout', '6000')
    .config("spark.driver.memory", '6g')
    .config('spark.executor.memory', '6g')
    .enableHiveSupport()
    .appName(f'{username} | Python - Basic Transformations')
    .master('yarn')
    .getOrCreate()
)

#        config('spark.executor.heartbeatInterval','20s'). \
#         config('spark.network.timeout','6000'). \

In [2]:
# HDFS partition directory with the air-traffic data for flightmonth=200801.
airtraffic_path = '/public/airlines_all/airlines-part/flightmonth=200801'

In [3]:
from pyspark.sql.functions import col

In [4]:
# Load the parquet data under airtraffic_path into a DataFrame.
airtraffic = spark.read.parquet(airtraffic_path)

In [5]:
# List the distinct combinations of the delay/cancellation flags present
# in the data.
(
    airtraffic
    .select('IsDepDelayed', 'IsArrDelayed', 'Cancelled')
    .distinct()
    .show()
)

+------------+------------+---------+
|IsDepDelayed|IsArrDelayed|Cancelled|
+------------+------------+---------+
|          NO|          NO|        0|
|         YES|         YES|        1|
|          NO|         YES|        0|
|         YES|          NO|        0|
|         YES|         YES|        0|
+------------+------------+---------+



In [6]:
airtraffic.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Car

In [9]:
from pyspark.sql.functions import concat, lpad, col,lit, count

In [10]:
# Cancelled flights per day. FlightDate is assembled as yyyyMMdd by
# concatenating Year with Month and DayOfMonth left-padded to two digits.
# No ordering is requested here, so rows come back in arbitrary order.
(
    airtraffic
    .filter('cancelled = 1')
    .groupBy(
        concat(
            col('Year'),
            lpad(col('Month'), 2, '0'),
            lpad(col('DayOfMonth'), 2, '0'),
        ).alias('FlightDate')
    )
    .agg(count(lit(1)).alias('FlightCount'))
    .show()
)

+----------+-----------+
|FlightDate|FlightCount|
+----------+-----------+
|  20080120|        247|
|  20080130|        694|
|  20080115|        299|
|  20080118|        230|
|  20080122|        788|
|  20080104|        769|
|  20080125|        526|
|  20080102|        511|
|  20080105|        456|
|  20080111|        524|
|  20080109|        377|
|  20080127|        638|
|  20080101|        552|
|  20080128|        654|
|  20080119|        876|
|  20080106|        683|
|  20080123|        530|
|  20080117|        872|
|  20080116|        532|
|  20080112|        226|
+----------+-----------+
only showing top 20 rows



In [12]:
# Cancelled flights per day, sorted by FlightCount using orderBy's default
# (ascending) direction; show(31) prints every day of the month.
(
    airtraffic
    .filter('cancelled = 1')
    .groupBy(
        concat(
            col('Year'),
            lpad(col('Month'), 2, '0'),
            lpad(col('DayOfMonth'), 2, '0'),
        ).alias('FlightDate')
    )
    .agg(count(lit(1)).alias('FlightCount'))
    .orderBy('FlightCount')
    .show(31)
)

+----------+-----------+
|FlightDate|FlightCount|
+----------+-----------+
|  20080112|        226|
|  20080118|        230|
|  20080120|        247|
|  20080115|        299|
|  20080124|        322|
|  20080110|        341|
|  20080113|        359|
|  20080109|        377|
|  20080126|        416|
|  20080105|        456|
|  20080108|        463|
|  20080103|        475|
|  20080121|        475|
|  20080102|        511|
|  20080111|        524|
|  20080125|        526|
|  20080123|        530|
|  20080116|        532|
|  20080101|        552|
|  20080107|        579|
|  20080127|        638|
|  20080128|        654|
|  20080106|        683|
|  20080130|        694|
|  20080104|        769|
|  20080122|        788|
|  20080117|        872|
|  20080119|        876|
|  20080129|        889|
|  20080114|        909|
|  20080131|       1081|
+----------+-----------+



In [13]:
# Same aggregation with the ascending direction spelled out via
# col(...).asc(); the result matches the default ordering.
(
    airtraffic
    .filter('cancelled = 1')
    .groupBy(
        concat(
            col('Year'),
            lpad(col('Month'), 2, '0'),
            lpad(col('DayOfMonth'), 2, '0'),
        ).alias('FlightDate')
    )
    .agg(count(lit(1)).alias('FlightCount'))
    .orderBy(col('FlightCount').asc())
    .show(31)
)

+----------+-----------+
|FlightDate|FlightCount|
+----------+-----------+
|  20080112|        226|
|  20080118|        230|
|  20080120|        247|
|  20080115|        299|
|  20080124|        322|
|  20080110|        341|
|  20080113|        359|
|  20080109|        377|
|  20080126|        416|
|  20080105|        456|
|  20080108|        463|
|  20080103|        475|
|  20080121|        475|
|  20080102|        511|
|  20080111|        524|
|  20080125|        526|
|  20080123|        530|
|  20080116|        532|
|  20080101|        552|
|  20080107|        579|
|  20080127|        638|
|  20080128|        654|
|  20080106|        683|
|  20080130|        694|
|  20080104|        769|
|  20080122|        788|
|  20080117|        872|
|  20080119|        876|
|  20080129|        889|
|  20080114|        909|
|  20080131|       1081|
+----------+-----------+



In [14]:
# Same aggregation sorted from the highest to the lowest number of
# cancellations via col(...).desc().
(
    airtraffic
    .filter('cancelled = 1')
    .groupBy(
        concat(
            col('Year'),
            lpad(col('Month'), 2, '0'),
            lpad(col('DayOfMonth'), 2, '0'),
        ).alias('FlightDate')
    )
    .agg(count(lit(1)).alias('FlightCount'))
    .orderBy(col('FlightCount').desc())
    .show(31)
)

+----------+-----------+
|FlightDate|FlightCount|
+----------+-----------+
|  20080131|       1081|
|  20080114|        909|
|  20080129|        889|
|  20080119|        876|
|  20080117|        872|
|  20080122|        788|
|  20080104|        769|
|  20080130|        694|
|  20080106|        683|
|  20080128|        654|
|  20080127|        638|
|  20080107|        579|
|  20080101|        552|
|  20080116|        532|
|  20080123|        530|
|  20080125|        526|
|  20080111|        524|
|  20080102|        511|
|  20080121|        475|
|  20080103|        475|
|  20080108|        463|
|  20080105|        456|
|  20080126|        416|
|  20080109|        377|
|  20080113|        359|
|  20080110|        341|
|  20080124|        322|
|  20080115|        299|
|  20080120|        247|
|  20080118|        230|
|  20080112|        226|
+----------+-----------+



In [15]:
# Project the date parts, scheduled departure time and origin (unsorted).
(
    airtraffic
    .select('Year', 'Month', 'DayOfMonth', 'CRSDepTime', 'Origin')
    .show()
)

+----+-----+----------+----------+------+
|Year|Month|DayOfMonth|CRSDepTime|Origin|
+----+-----+----------+----------+------+
|2008|    1|        16|      1735|   BGR|
|2008|    1|        17|      1701|   SYR|
|2008|    1|        17|      1225|   SAV|
|2008|    1|        17|      1530|   CVG|
|2008|    1|        17|      1205|   STL|
|2008|    1|        18|      1150|   STL|
|2008|    1|        18|      1009|   MCI|
|2008|    1|        19|       835|   TUL|
|2008|    1|        20|      1935|   JFK|
|2008|    1|        20|       830|   RDU|
|2008|    1|        21|      1640|   CVG|
|2008|    1|        21|      1204|   MSY|
|2008|    1|        21|      1935|   JFK|
|2008|    1|        21|      1830|   DCA|
|2008|    1|        21|       700|   HSV|
|2008|    1|        22|      1910|   ORD|
|2008|    1|        22|      1320|   CVG|
|2008|    1|        23|       908|   LGA|
|2008|    1|        23|      1252|   CLT|
|2008|    1|        23|       635|   GSP|
+----+-----+----------+----------+

In [16]:
# Composite sort: all four keys ascending, so the earliest scheduled
# departures of the first day come first.
(
    airtraffic
    .select('Year', 'Month', 'DayOfMonth', 'CRSDepTime', 'Origin')
    .orderBy('Year', 'Month', 'DayOfMonth', 'CRSDepTime')
    .show()
)

+----+-----+----------+----------+------+
|Year|Month|DayOfMonth|CRSDepTime|Origin|
+----+-----+----------+----------+------+
|2008|    1|         1|        10|   LAX|
|2008|    1|         1|        15|   SMF|
|2008|    1|         1|        25|   SMF|
|2008|    1|         1|        25|   PHX|
|2008|    1|         1|        30|   ANC|
|2008|    1|         1|        30|   LAX|
|2008|    1|         1|        30|   LAS|
|2008|    1|         1|        30|   ONT|
|2008|    1|         1|        35|   MCO|
|2008|    1|         1|        35|   SFO|
|2008|    1|         1|        40|   LAX|
|2008|    1|         1|        40|   LAS|
|2008|    1|         1|        40|   LAX|
|2008|    1|         1|        40|   SEA|
|2008|    1|         1|        40|   SFO|
|2008|    1|         1|        40|   SEA|
|2008|    1|         1|        45|   PHX|
|2008|    1|         1|        45|   LAS|
|2008|    1|         1|        50|   ANC|
|2008|    1|         1|        53|   PDX|
+----+-----+----------+----------+

In [17]:
# Mixed-direction sort: date parts ascending, but CRSDepTime descending, so
# within each day the latest scheduled departures are listed first.
(
    airtraffic
    .select('Year', 'Month', 'DayOfMonth', 'CRSDepTime', 'Origin')
    .orderBy('Year', 'Month', 'DayOfMonth', col('CRSDepTime').desc())
    .show()
)

+----+-----+----------+----------+------+
|Year|Month|DayOfMonth|CRSDepTime|Origin|
+----+-----+----------+----------+------+
|2008|    1|         1|      2359|   PHX|
|2008|    1|         1|      2359|   LAS|
|2008|    1|         1|      2359|   LAS|
|2008|    1|         1|      2359|   PHX|
|2008|    1|         1|      2359|   SLC|
|2008|    1|         1|      2359|   LAS|
|2008|    1|         1|      2359|   LAS|
|2008|    1|         1|      2359|   LAS|
|2008|    1|         1|      2359|   LAS|
|2008|    1|         1|      2359|   LAS|
|2008|    1|         1|      2359|   LAS|
|2008|    1|         1|      2359|   SLC|
|2008|    1|         1|      2359|   SEA|
|2008|    1|         1|      2359|   LAS|
|2008|    1|         1|      2359|   LAS|
|2008|    1|         1|      2359|   TUS|
|2008|    1|         1|      2359|   LAS|
|2008|    1|         1|      2358|   LAS|
|2008|    1|         1|      2358|   LAS|
|2008|    1|         1|      2356|   LAS|
+----+-----+----------+----------+

In [18]:
# Sample employee records in the column order
# (employee_id, first_name, last_name, salary, bonus, nationality,
#  phone_number, ssn). The bonus field mixes an int, None and an empty string.
employees = [
    (
        1, "Scott", "Tiger", 1000.0, 10,
        "united states", "+1 123 456 7890", "123 45 6789",
    ),
    (
        2, "Henry", "Ford", 1250.0, None,
        "India", "+91 234 567 8901", "456 78 9123",
    ),
    (
        3, "Nick", "Junior", 750.0, '',
        "united KINGDOM", "+44 111 111 1111", "222 33 4444",
    ),
    (
        4, "Bill", "Gomes", 1500.0, 2,
        "AUSTRALIA", "+61 987 654 3210", "789 12 6118",
    ),
]

In [19]:
# Build a DataFrame from the in-memory employee tuples, declaring the column
# names and types with a DDL-style schema string (bonus is declared STRING).
employeesDF = spark. \
    createDataFrame(employees,
                    schema="""employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, bonus STRING, nationality STRING,
                    phone_number STRING, ssn STRING"""
                   )

In [20]:
employeesDF.show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|    2|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [21]:
from pyspark.sql.functions import col, upper,when

In [22]:
# Show the signature and docstring of pyspark.sql.functions.when.
# help() is plain Python, so this cell also works outside IPython
# (unlike the `name?` introspection syntax).
help(when)

[0;31mSignature:[0m [0mwhen[0m[0;34m([0m[0mcondition[0m[0;34m,[0m [0mvalue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Evaluates a list of conditions and returns one of multiple possible result expressions.
If :func:`pyspark.sql.Column.otherwise` is not invoked, None is returned for unmatched
conditions.

.. versionadded:: 1.4.0

Parameters
----------
condition : :class:`~pyspark.sql.Column`
    a boolean :class:`~pyspark.sql.Column` expression.
value :
    a literal value, or a :class:`~pyspark.sql.Column` expression.

>>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect()
[Row(age=3), Row(age=4)]

>>> df.select(when(df.age == 2, df.age + 1).alias("age")).collect()
[Row(age=3), Row(age=None)]
[0;31mFile:[0m      /opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/sql/functions.py
[0;31mType:[0m      function


In [23]:
# Evaluating the DataFrame as the cell's last expression displays it using
# the notebook's own representation (as opposed to the text table from show()).
employeesDF

employee_id,first_name,last_name,salary,bonus,nationality,phone_number,ssn
1,Scott,Tiger,1000.0,10.0,united states,+1 123 456 7890,123 45 6789
2,Henry,Ford,1250.0,,India,+91 234 567 8901,456 78 9123
3,Nick,Junior,750.0,,united KINGDOM,+44 111 111 1111,222 33 4444
4,Bill,Gomes,1500.0,2.0,AUSTRALIA,+61 987 654 3210,789 12 6118


In [32]:
# Custom ordering: tag rows whose nationality equals 'UNITED STATES'
# (compared after upper-casing) with 0 and all other rows with 1, then sort
# on that tag first and on nationality second.
(
    employeesDF
    .withColumn(
        'sort_column',
        when(upper(col('nationality')) == 'UNITED STATES', 0).otherwise(1),
    )
    .orderBy('sort_column', 'nationality')
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|sort_column|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|          0|
|          4|      Bill|    Gomes|1500.0|    2|     AUSTRALIA|+61 987 654 3210|789 12 6118|          1|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|          1|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|          1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+-----------+



In [27]:
from pyspark.sql.functions import col, upper,when, expr

In [35]:
# Same custom ordering as the when/otherwise variant, but the sort tag is
# written as a SQL CASE WHEN expression evaluated through expr().
(
    employeesDF
    .withColumn(
        'sort_column',
        expr("""CASE WHEN upper(nationality)='UNITED STATES' THEN 0 ELSE 1 END"""),
    )
    .orderBy('sort_column', 'nationality')
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|sort_column|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|          0|
|          4|      Bill|    Gomes|1500.0|    2|     AUSTRALIA|+61 987 654 3210|789 12 6118|          1|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|          1|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|          1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+-----------+



In [36]:
# Print employeesDF again; the withColumn calls above returned new
# DataFrames, so the original has no sort_column.
employeesDF.show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|    2|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [37]:
# Sort on bonus as declared in the schema (STRING): the comparison is
# lexicographic, so '10' is placed before '2'.
employeesDF.orderBy('bonus').show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          4|      Bill|    Gomes|1500.0|    2|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [39]:
# Cast bonus to int for the sort key only, so the values order numerically
# (2 before 10); the displayed bonus column keeps its string values.
employeesDF.orderBy(col('bonus').cast('int')).show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|    2|     AUSTRALIA|+61 987 654 3210|789 12 6118|
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [42]:
# Add bonus_cast to expose the integer value that drives the sort; both the
# null bonus and the empty-string bonus cast to null and sort first in the
# default ascending order.
# The chain is wrapped in parentheses instead of using backslash
# continuations, which raise a SyntaxError if any whitespace follows the
# backslash.
(
    employeesDF
    .withColumn('bonus_cast', col('bonus').cast('int'))
    .orderBy(col('bonus').cast('int'))
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus_cast|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+----------+
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|      null|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|      null|
|          4|      Bill|    Gomes|1500.0|    2|     AUSTRALIA|+61 987 654 3210|789 12 6118|         2|
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|        10|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+----------+



In [43]:
# Descending numeric sort on the casted bonus; rows whose cast is null
# end up after the non-null values.
(
    employeesDF
    .withColumn('bonus_cast', col('bonus').cast('int'))
    .orderBy(col('bonus').cast('int').desc())
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus_cast|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|        10|
|          4|      Bill|    Gomes|1500.0|    2|     AUSTRALIA|+61 987 654 3210|789 12 6118|         2|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|      null|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|      null|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+----------+



In [44]:
# Column object for a placeholder column name 'X'; it is passed to help()
# in the following cells to browse the Column API.
c = col('X')

In [45]:
# Print the documentation of the Column class via the instance created above.
help(c)

Help on Column in module pyspark.sql.column object:

class Column(builtins.object)
 |  A column in a DataFrame.
 |  
 |  :class:`Column` instances can be created by::
 |  
 |      # 1. Select a column out of a DataFrame
 |  
 |      df.colName
 |      df["colName"]
 |  
 |      # 2. Create from an expression
 |      df.colName + 1
 |      1 / df.colName
 |  
 |  .. versionadded:: 1.3.0
 |  
 |  Methods defined here:
 |  
 |  __add__ = _(self, other)
 |      binary operator
 |  
 |  __and__ = _(self, other)
 |      binary operator
 |  
 |  __bool__ = __nonzero__(self)
 |  
 |  __contains__(self, item)
 |      # container operators
 |  
 |  __div__ = _(self, other)
 |      binary operator
 |  
 |  __eq__ = _(self, other)
 |      binary operator
 |  
 |  __ge__ = _(self, other)
 |      binary operator
 |  
 |  __getattr__(self, item)
 |  
 |  __getitem__(self, k)
 |  
 |  __gt__ = _(self, other)
 |      binary operator
 |  
 |  __init__(self, jc)
 |      Initialize self.  See help(type(se

In [46]:
# Print the documentation of Column.asc_nulls_last (ascending sort with
# null values placed after non-null values).
help(c.asc_nulls_last)

Help on method _ in module pyspark.sql.column:

_() method of pyspark.sql.column.Column instance
    Returns a sort expression based on ascending order of the column, and null values
    appear after non-null values.
    
    .. versionadded:: 2.4.0
    
    Examples
    --------
    >>> from pyspark.sql import Row
    >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
    >>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect()
    [Row(name='Alice'), Row(name='Tom'), Row(name=None)]



In [48]:
# Ascending numeric sort on the casted bonus, with asc_nulls_last() moving
# the rows whose cast is null to the end instead of the beginning.
(
    employeesDF
    .withColumn('bonus_cast', col('bonus').cast('int'))
    .orderBy(col('bonus').cast('int').asc_nulls_last())
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus_cast|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+----------+
|          4|      Bill|    Gomes|1500.0|    2|     AUSTRALIA|+61 987 654 3210|789 12 6118|         2|
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|        10|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|      null|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|      null|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+----------+

