In [1]:
from pyspark.sql import SparkSession

import getpass
# The OS user name is used for both the warehouse directory and the application name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session that runs on YARN.
spark = (
    SparkSession
    .builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Basic Transformations')
    .master('yarn')
    .getOrCreate()
)

In [2]:
from pyspark.sql.functions import *

##### Read the data for the month of 2008 January. 

In [3]:
%%sh

# List the files under the flightmonth=200801 partition directory on HDFS.
hdfs dfs -ls /public/airtraffic_all/airtraffic-part/flightmonth=200801

Found 1 items
-rw-r--r--   3 hdfs supergroup   14654075 2021-03-02 19:47 /public/airtraffic_all/airtraffic-part/flightmonth=200801/part-00252-5cde1303-4ebf-4a12-8fad-f5d9f9c9124a.c000.snappy.parquet


In [4]:
# HDFS directory of the flightmonth=200801 partition of the air traffic data.
airtraffic_path = "/public/airtraffic_all/airtraffic-part/flightmonth=200801"

In [5]:
# Load the Parquet data under airtraffic_path into a DataFrame.
airtraffic = spark.read.parquet(airtraffic_path)

In [6]:
# Print column names, data types and nullability of the loaded DataFrame.
airtraffic.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Car

In [7]:
# Total number of records read from the partition.
airtraffic.count()

605659

In [8]:
# Preview the distinct (Year, Month, DayOfMonth) combinations.
(
    airtraffic
    .select('Year', 'Month', 'DayOfMonth')
    .distinct()
    .show()
)

+----+-----+----------+
|Year|Month|DayOfMonth|
+----+-----+----------+
|2008|    1|        28|
|2008|    1|        25|
|2008|    1|        20|
|2008|    1|        11|
|2008|    1|         4|
|2008|    1|         5|
|2008|    1|        15|
|2008|    1|         3|
|2008|    1|        16|
|2008|    1|         9|
|2008|    1|        17|
|2008|    1|        19|
|2008|    1|        12|
|2008|    1|         6|
|2008|    1|        21|
|2008|    1|        18|
|2008|    1|         7|
|2008|    1|         1|
|2008|    1|        26|
|2008|    1|        24|
+----+-----+----------+
only showing top 20 rows



In [9]:
# January has 31 days, so 31 distinct (Year, Month, DayOfMonth) combinations
# means the dataset has records for every day of 2008 Jan.

(
    airtraffic
    .select('Year', 'Month', 'DayOfMonth')
    .distinct()
    .count()
)

31

##### Filtering data

* Filtering can be done using either `filter` or `where`; the two are __synonyms__ of each other.

* When it comes to the condition, we can either pass it in SQL Style or Data Frame Style.

* Example for SQL Style - `airtraffic.filter("IsArrDelayed = 'YES'").show()` or `airtraffic.where("IsArrDelayed = 'YES'").show()`

* Example for Data Frame Style - `airtraffic.filter(airtraffic["IsArrDelayed"] == 'YES').show()` or `airtraffic.filter(airtraffic.IsArrDelayed == 'YES').show()`. We can also use where instead of filter.

* Here are the other operations we can perform to filter the data - `!=, >, <, >=, <=, LIKE, BETWEEN AND, isNull` etc.

##### Get the count of cancelled flights

In [10]:
# List the distinct values present in the Cancelled column.
(
    airtraffic
    .select('Cancelled')
    .distinct()
    .show()
)

+---------+
|Cancelled|
+---------+
|        1|
|        0|
+---------+



In [11]:
# SQL style: the condition is passed as an expression string.
(
    airtraffic
    .filter('Cancelled = 1')
    .count()
)

17293

In [12]:
# Data Frame style: the column is looked up on the DataFrame by name.
(
    airtraffic
    .filter(airtraffic['Cancelled'] == 1)
    .count()
)

17293

In [13]:
# Data Frame style: the column is referenced through col().
(
    airtraffic
    .filter(col('Cancelled') == 1)
    .count()
)

17293

##### Get number of flights scheduled for departure from SFO airport.

In [14]:
# Count the records whose Origin is SFO.
(
    airtraffic
    .filter('Origin = "SFO"')
    .count()
)

11573

##### Get number of flights that have departed from airport without any delay.

In [15]:
# Distinct values of the IsDepDelayed flag (the resulting DataFrame is the cell output).
(
    airtraffic
    .select('IsDepDelayed')
    .distinct()
)

IsDepDelayed
YES
NO


In [16]:
# Check whether any flight that was NOT delayed at departure is marked as cancelled

airtraffic.filter('IsDepDelayed = "NO"').filter('Cancelled = 1').count()

0

In [17]:
# Count the records flagged as not delayed at departure.
(
    airtraffic
    .filter('IsDepDelayed = "NO"')
    .count()
)

340461

##### Get number of flights departed late on Sundays.

The dataset has a numeric `DayOfWeek` column but no column holding the name of the day, so we need to derive the day name first.

In [18]:
# One-row, one-column DataFrame used to try out functions.

l = [('X',)]

df = spark.createDataFrame(data=l, schema="dummy STRING")
df.show()

+-----+
|dummy|
+-----+
|    X|
+-----+



In [19]:
# Evaluate current_date() against the one-row DataFrame.
(
    df
    .select(current_date())
    .show()
)

+--------------+
|current_date()|
+--------------+
|    2021-06-23|
+--------------+



In [35]:
# Show the current date next to its day name as produced by the 'EEEE' pattern.
(
    df
    .select(
        current_date(),
        date_format(current_date(), 'EEEE').alias('day_name')
    )
    .show()
)

+--------------+---------+
|current_date()| day_name|
+--------------+---------+
|    2021-06-23|Wednesday|
+--------------+---------+



In [36]:
# Preview the date-related columns of the first five records.
(
    airtraffic
    .select('Year', 'Month', 'DayofMonth', 'DayOfWeek')
    .show(5)
)

+----+-----+----------+---------+
|Year|Month|DayofMonth|DayOfWeek|
+----+-----+----------+---------+
|2008|    1|        16|        3|
|2008|    1|        17|        4|
|2008|    1|        17|        4|
|2008|    1|        17|        4|
|2008|    1|        17|        4|
+----+-----+----------+---------+
only showing top 5 rows



In [37]:
# NOTE: Month and DayofMonth are not zero-padded here, so the concatenated value
# (e.g. "2008116") does not line up with the 'yyyyMMdd' pattern. In the output shown,
# the day names are one day off from the zero-padded version further down
# (Thursday instead of Wednesday for the 16th).
airtraffic.select(date_format(to_date(concat('Year', 'Month', 'DayofMonth'), 'yyyyMMdd'), 'EEEE').alias('day_name')).show(5)

+--------+
|day_name|
+--------+
|Thursday|
|  Friday|
|  Friday|
|  Friday|
|  Friday|
+--------+
only showing top 5 rows



In [39]:
# Month and DayOfMonth can hold single-digit values, so left-pad each of them
# with "0" to a width of 2 before concatenating them after Year.

(
    airtraffic
    .select(
        concat(
            col("Year"), lpad(col("Month"), 2, "0"), lpad(col("DayOfMonth"), 2, "0")
        ).alias('FlightDate')
    )
    .show(5)
)

+----------+
|FlightDate|
+----------+
|  20080116|
|  20080117|
|  20080117|
|  20080117|
|  20080117|
+----------+
only showing top 5 rows



In [43]:
# First build the zero-padded yyyyMMdd string, then parse it as a date and
# format it with 'EEEE' to get the day name.
(
    airtraffic
    .select(
        concat(
            col("Year"), lpad(col("Month"), 2, "0"), lpad(col("DayOfMonth"), 2, "0")
        ).alias('FlightDate')
    )
    .selectExpr("date_format(to_date(FlightDate, 'yyyyMMdd'), 'EEEE') AS FlightDate")
    .show(5)
)

+----------+
|FlightDate|
+----------+
| Wednesday|
|  Thursday|
|  Thursday|
|  Thursday|
|  Thursday|
+----------+
only showing top 5 rows



In [48]:
# Here "FlightDate" holds the day name derived from the zero-padded yyyyMMdd
# string; two chained SQL-style filters keep delayed departures on Sundays.
# NOTE(review): cancelled flights are flagged IsDepDelayed = 'YES' in this data,
# so they are included in this count unless a Cancelled = 0 condition is added.
(
    airtraffic
    .withColumn(
        "FlightDate",
        date_format(
            to_date(
                concat(col("Year"), lpad(col("Month"), 2, "0"), lpad(col("DayOfMonth"), 2, "0")),
                'yyyyMMdd'
            ),
            'EEEE'
        )
    )
    .filter('FlightDate = "Sunday"')
    .filter('IsDepDelayed = "YES"')
    .count()
)

34708

In [49]:
# API style: FlightDate is the zero-padded yyyyMMdd string; the day name is
# computed inside the filter condition.
# NOTE(review): cancelled flights are flagged IsDepDelayed = 'YES' in this data,
# so they are included in this count unless a Cancelled == 0 condition is added.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(col("Year"), lpad(col("Month"), 2, "0"), lpad(col("DayOfMonth"), 2, "0"))
    )
    .filter(
        (col("IsDepDelayed") == "YES")
        & (date_format(to_date("FlightDate", "yyyyMMdd"), "EEEE") == "Sunday")
    )
    .count()
)

34708

In [50]:
# SQL style: same query, with the whole condition written as one expression string.
# NOTE(review): cancelled flights are flagged IsDepDelayed = 'YES' in this data,
# so they are included in this count unless a Cancelled = 0 condition is added.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(col("Year"), lpad(col("Month"), 2, "0"), lpad(col("DayOfMonth"), 2, "0"))
    )
    .filter("""
           IsDepDelayed = 'YES' AND 
           date_format(to_date(FlightDate, 'yyyyMMdd'), 'EEEE') = 'Sunday'
           """)
    .count()
)

34708

##### Get count of flights which departed late at origin and reached the destination early or on time

In [51]:
airtraffic. \
    select('IsDepDelayed', 'IsArrDelayed', 'Cancelled'). \
    distinct(). \
    show()

# So, whenever Cancelled is 1, both IsDepDelayed and IsArrDelayed are YES.
# The reverse does not hold: YES/YES also occurs with Cancelled = 0.

+------------+------------+---------+
|IsDepDelayed|IsArrDelayed|Cancelled|
+------------+------------+---------+
|          NO|          NO|        0|
|         YES|         YES|        1|
|          NO|         YES|        0|
|         YES|          NO|        0|
|         YES|         YES|        0|
+------------+------------+---------+



In [53]:
# SQL style: the three conditions are joined with AND in one expression string.

(
    airtraffic
    .filter("IsDepDelayed = 'YES' AND IsArrDelayed = 'NO' AND Cancelled = 0")
    .count()
)

54233

In [54]:
# API style: column comparisons combined with &.

(
    airtraffic
    .filter(
        (col("IsDepDelayed") == "YES")
        & (col("IsArrDelayed") == "NO")
        & (col("Cancelled") == 0)
    )
    .count()
)

# Each comparison is wrapped in its own () because Python's & binds more
# tightly than ==.

54233

In [55]:
# API style again, referencing the columns through the DataFrame itself
# (both item access and attribute access are shown).
(
    airtraffic
    .filter(
        (airtraffic["IsDepDelayed"] == "YES")
        & (airtraffic.IsArrDelayed == "NO")
        & (airtraffic.Cancelled == 0)
    )
    .count()
)

54233

##### Get count of flights which departed early or on time but arrived late by at least 15 minutes.

In [56]:
# SQL style: on-time departures, not cancelled, with ArrDelay of at least 15.
(
    airtraffic
    .filter("IsDepDelayed = 'NO' AND ArrDelay >= 15 AND cancelled = 0")
    .count()
)

20705

In [57]:
# API style: on-time departures, not cancelled, with ArrDelay of at least 15.
# The Cancelled == 0 condition is stated explicitly so that this query applies
# the same three predicates as its SQL-style counterpart.
# ArrDelay is a string column; comparing it with 15 relies on Spark's implicit cast.
(
    airtraffic
    .filter(
        (col("IsDepDelayed") == "NO")
        & (col("ArrDelay") >= 15)
        & (col("Cancelled") == 0)
    )
    .count()
)

20705

##### Get number of flights departed late on Sundays as well as on Saturdays. 

In [59]:
# SQL style: delayed, non-cancelled flights whose day name is Saturday OR Sunday.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(col("Year"), lpad(col("Month"), 2, "0"), lpad(col("DayOfMonth"), 2, "0"))
    )
    .filter("""
           IsDepDelayed = 'YES' AND Cancelled = 0 AND
           (date_format(to_date(FlightDate, 'yyyyMMdd'), 'EEEE') = 'Saturday'
               OR date_format(to_date(FlightDate, 'yyyyMMdd'), 'EEEE') = 'Sunday'
           )
           """)
    .count()
)

57873

In [60]:
# API style: the two day-name comparisons are combined with | and the whole
# group is ANDed with the delay and cancellation conditions.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(col("Year"), lpad(col("Month"), 2, "0"), lpad(col("DayOfMonth"), 2, "0"))
    )
    .filter(
        (col("IsDepDelayed") == "YES")
        & (col("Cancelled") == 0)
        & (
            (date_format(to_date("FlightDate", "yyyyMMdd"), "EEEE") == "Saturday")
            | (date_format(to_date("FlightDate", "yyyyMMdd"), "EEEE") == "Sunday")
        )
    )
    .count()
)

57873

___Now the same problem can be solved using `IN` or `isin`___

i.e.,
##### Get number of flights departed late on Sundays as well as on Saturdays using `IN` or `isin`

In [61]:
# isin is called on Column objects, so create a Column object here
# to inspect its type and to run help on its isin method.

c = col('X')
type(c)

pyspark.sql.column.Column

In [62]:
# Show the built-in documentation of Column.isin.
help(c.isin)

Help on method isin in module pyspark.sql.column:

isin(*cols) method of pyspark.sql.column.Column instance
    A boolean expression that is evaluated to true if the value of this
    expression is contained by the evaluated values of the arguments.
    
    >>> df[df.name.isin("Bob", "Mike")].collect()
    [Row(age=5, name='Bob')]
    >>> df[df.age.isin([1, 2, 3])].collect()
    [Row(age=2, name='Alice')]
    
    .. versionadded:: 1.5



In [63]:
# API style: isin replaces the two OR-ed day-name comparisons.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(col("Year"), lpad(col("Month"), 2, "0"), lpad(col("DayOfMonth"), 2, "0"))
    )
    .filter(
        (col("IsDepDelayed") == "YES")
        & (col("Cancelled") == 0)
        & (
            date_format(to_date("FlightDate", "yyyyMMdd"), "EEEE")
            .isin("Saturday", "Sunday")
        )
    )
    .count()
)

57873

In [64]:
# SQL style: IN replaces the two OR-ed day-name comparisons.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(col("Year"), lpad(col("Month"), 2, "0"), lpad(col("DayOfMonth"), 2, "0"))
    )
    .filter("""
           IsDepDelayed = 'YES' AND Cancelled = 0 AND
           date_format(to_date(FlightDate, 'yyyyMMdd'), 'EEEE') IN
               ('Saturday', 'Sunday')
           """)
    .count()
)

57873

##### Get count of flights departed from following major airports - ORD, DFW, ATL, LAX, SFO.

In [65]:
# SQL style: count the records whose Origin is one of the five listed airports.
(
    airtraffic
    .filter("Origin IN ('ORD', 'DFW', 'ATL', 'LAX', 'SFO')")
    .count()
)

118212

In [66]:
# API style: the same membership test expressed with Column.isin.
(
    airtraffic
    .filter(col("Origin").isin("ORD", "DFW", "ATL", "LAX", "SFO"))
    .count()
)

118212

#### LIKE Operator or like Function

In [67]:
# Sample employee records, one tuple per employee. The fifth field (bonus) is
# 10 for two records, None for one and an empty string for one.
employees = [
    (1, "Scott", "Tiger", 1000.0, 10, "united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, None, "India", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, '', "united KINGDOM", "+44 111 111 1111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, 10, "AUSTRALIA", "+61 987 654 3210", "789 12 6118"),
]

In [68]:
# Build the employees DataFrame with an explicit DDL schema; bonus is declared
# as STRING, so it can hold the null and empty-string values.
employeesDF = spark.createDataFrame(
    data=employees,
    schema="""employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, bonus STRING, nationality STRING,
                    phone_number STRING, ssn STRING"""
)

In [69]:
# Display all employee records.
employeesDF.show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



##### Get employees whose first name starts with Sco

In [70]:
# SQL style: LIKE with a trailing % matches first names that start with 'Sco'.
(
    employeesDF
    .filter("first_name LIKE 'Sco%'")
    .show()
)

+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|   phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+



In [71]:
# SQL style: upper-case the name first so the prefix match ignores case.
(
    employeesDF
    .filter("upper(first_name) LIKE 'SCO%'")
    .show()
)

+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|   phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+



In [72]:
# API style: Column.like takes the same pattern as the SQL LIKE operator.

(
    employeesDF
    .filter(col('first_name').like('Sco%'))
    .show()
)

+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|   phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+



In [73]:
# API style: upper-case the column first so the prefix match ignores case.
(
    employeesDF
    .filter(upper(col('first_name')).like('SCO%'))
    .show()
)

+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|   phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+



##### Get employees whose first name contains `ott` irrespective of case.

In [77]:
# Lower-case the name and wrap the pattern in % on both sides to find 'ott'
# anywhere in the first name, whatever its case.
(
    employeesDF
    .filter(lower(col("first_name")).like("%ott%"))
    .show()
)

+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|   phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states|+1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+-----+-------------+---------------+-----------+



##### Get employees whose phone number does not start with +44

In [78]:
# SQL style: NOT LIKE keeps phone numbers that do not start with +44.
(
    employeesDF
    .filter("phone_number NOT LIKE '+44%'")
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| null|        India|+91 234 567 8901|456 78 9123|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



In [79]:
# API style: ~ negates the like condition.
(
    employeesDF
    .filter(~col('phone_number').like('+44%'))
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| null|        India|+91 234 567 8901|456 78 9123|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



##### Get count of flights departed late between 2008 January 1st and January 9th using FlightDate.

In [83]:
# Preview the zero-padded yyyyMMdd FlightDate string for the first five records.
(
    airtraffic
    .select(
        concat(
            col("Year"), lpad(col("Month"), 2, "0"), lpad(col("DayOfMonth"), 2, "0")
        ).alias("FlightDate")
    )
    .show(5)
)

+----------+
|FlightDate|
+----------+
|  20080116|
|  20080117|
|  20080117|
|  20080117|
|  20080117|
+----------+
only showing top 5 rows



In [80]:
# Using BETWEEN AND
#
# FlightDate is a string built by concat, so the bounds are given as string
# literals instead of integers; this avoids an implicit string-to-number cast.
# Because every FlightDate is a zero-padded 8-digit value, string ordering
# matches date ordering.

(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(col("Year"), lpad(col("Month"), 2, "0"), lpad(col("DayOfMonth"), 2, "0"))
    )
    .filter("""
           IsDepDelayed = 'YES' AND
           Cancelled = 0 AND
           FlightDate BETWEEN '20080101' AND '20080109'
           """)
    .count()
)

86180

In [84]:
# Using >= <=
#
# FlightDate is a string built by concat, so the bounds are given as string
# literals instead of integers; this avoids an implicit string-to-number cast.
# Because every FlightDate is a zero-padded 8-digit value, string ordering
# matches date ordering.

(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(col("Year"), lpad(col("Month"), 2, "0"), lpad(col("DayOfMonth"), 2, "0"))
    )
    .filter("""
           IsDepDelayed = 'YES' AND
           Cancelled = 0 AND
           FlightDate >= '20080101' AND
           FlightDate <= '20080109'
           """)
    .count()
)

86180

In [85]:
# API style: Column.between is inclusive on both bounds.
# FlightDate is a string built by concat, so the bounds are passed as strings
# instead of integers; this avoids an implicit string-to-number cast. Because
# every FlightDate is a zero-padded 8-digit value, string ordering matches
# date ordering.
(
    airtraffic
    .withColumn(
        "FlightDate",
        concat(col("Year"), lpad(col("Month"), 2, "0"), lpad(col("DayOfMonth"), 2, "0"))
    )
    .filter(
        (col("IsDepDelayed") == "YES")
        & (col("Cancelled") == 0)
        & (col("FlightDate").between("20080101", "20080109"))
    )
    .count()
)

86180

#### Handling Null

##### Get all the records where bonus is neither null nor empty.

In [87]:
# SQL style: keep records whose bonus is neither null nor an empty string.
(
    employeesDF
    .filter("bonus IS NOT NULL AND bonus <> ''")
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



In [88]:
# SQL style: the same condition written as the negation of "null or empty".
(
    employeesDF
    .filter("!(bonus IS NULL OR bonus = '')")
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



In [89]:
# API style: isNotNull combined with a not-equal-to-empty-string check.
(
    employeesDF
    .filter(
        (col('bonus').isNotNull())
        & (col('bonus') != '')
    )
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



In [90]:
# Using negation ~ on isNull instead of calling isNotNull

(
    employeesDF
    .filter(
        (~col('bonus').isNull())
        & (col('bonus') != '')
    )
    .show()
)

+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|  nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10|united states| +1 123 456 7890|123 45 6789|
|          4|      Bill|    Gomes|1500.0|   10|    AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+-------------+----------------+-----------+



##### Get all the records where bonus is null or empty.

In [91]:
# SQL style: nullif turns the empty string into null, so a single IS NULL
# check covers both the null and the empty case.
(
    employeesDF
    .filter("nullif(bonus, '') IS NULL")
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [92]:
# API style: null check OR-ed with an equals-empty-string check.
(
    employeesDF
    .filter(
        (col('bonus').isNull())
        | (col('bonus') == '')
    )
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [93]:
# Casting the empty string to int does not succeed and gives null, so after
# the cast one isNull check matches both the null and the empty bonus.

(
    employeesDF
    .filter(col('bonus').cast('int').isNull())
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+

