In [1]:
from pyspark.sql import SparkSession

import getpass
# Name of the OS user running the notebook; used for the warehouse path and app name
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session running on YARN
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Processing Column Data')
    .master('yarn')
    .getOrCreate()
)

In [2]:
from pyspark.sql.functions import *

In [3]:
# Load the retail orders CSV files using an explicit schema
orders = (
    spark.read
    .format("csv")
    .schema('order_id INT, order_date STRING, order_customer_id INT, order_status STRING')
    .option("path", "/public/retail_db/orders")
    .load()
)

In [4]:
# Preview the first 20 orders without truncating long values
orders.show(n=20, truncate=False)

+--------+---------------------+-----------------+---------------+
|order_id|order_date           |order_customer_id|order_status   |
+--------+---------------------+-----------------+---------------+
|1       |2013-07-25 00:00:00.0|11599            |CLOSED         |
|2       |2013-07-25 00:00:00.0|256              |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00.0|12111            |COMPLETE       |
|4       |2013-07-25 00:00:00.0|8827             |CLOSED         |
|5       |2013-07-25 00:00:00.0|11318            |COMPLETE       |
|6       |2013-07-25 00:00:00.0|7130             |COMPLETE       |
|7       |2013-07-25 00:00:00.0|4530             |COMPLETE       |
|8       |2013-07-25 00:00:00.0|2911             |PROCESSING     |
|9       |2013-07-25 00:00:00.0|5657             |PENDING_PAYMENT|
|10      |2013-07-25 00:00:00.0|5648             |PENDING_PAYMENT|
|11      |2013-07-25 00:00:00.0|918              |PAYMENT_REVIEW |
|12      |2013-07-25 00:00:00.0|1837             |CLOSED      

In [5]:
# Print the column names, types and nullability of the orders DataFrame
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [6]:
# Keep every existing column and append the abbreviated month name as order_month
(
    orders
    .select("*", date_format("order_date", "MMM").alias("order_month"))
    .show(n=10, truncate=False)
)

+--------+---------------------+-----------------+---------------+-----------+
|order_id|order_date           |order_customer_id|order_status   |order_month|
+--------+---------------------+-----------------+---------------+-----------+
|1       |2013-07-25 00:00:00.0|11599            |CLOSED         |Jul        |
|2       |2013-07-25 00:00:00.0|256              |PENDING_PAYMENT|Jul        |
|3       |2013-07-25 00:00:00.0|12111            |COMPLETE       |Jul        |
|4       |2013-07-25 00:00:00.0|8827             |CLOSED         |Jul        |
|5       |2013-07-25 00:00:00.0|11318            |COMPLETE       |Jul        |
|6       |2013-07-25 00:00:00.0|7130             |COMPLETE       |Jul        |
|7       |2013-07-25 00:00:00.0|4530             |COMPLETE       |Jul        |
|8       |2013-07-25 00:00:00.0|2911             |PROCESSING     |Jul        |
|9       |2013-07-25 00:00:00.0|5657             |PENDING_PAYMENT|Jul        |
|10      |2013-07-25 00:00:00.0|5648             |PE

In [7]:
# Same result as the select("*", ...) form, expressed with withColumn
(
    orders
    .withColumn("order_month", date_format("order_date", "MMM"))
    .show(n=10, truncate=False)
)

+--------+---------------------+-----------------+---------------+-----------+
|order_id|order_date           |order_customer_id|order_status   |order_month|
+--------+---------------------+-----------------+---------------+-----------+
|1       |2013-07-25 00:00:00.0|11599            |CLOSED         |Jul        |
|2       |2013-07-25 00:00:00.0|256              |PENDING_PAYMENT|Jul        |
|3       |2013-07-25 00:00:00.0|12111            |COMPLETE       |Jul        |
|4       |2013-07-25 00:00:00.0|8827             |CLOSED         |Jul        |
|5       |2013-07-25 00:00:00.0|11318            |COMPLETE       |Jul        |
|6       |2013-07-25 00:00:00.0|7130             |COMPLETE       |Jul        |
|7       |2013-07-25 00:00:00.0|4530             |COMPLETE       |Jul        |
|8       |2013-07-25 00:00:00.0|2911             |PROCESSING     |Jul        |
|9       |2013-07-25 00:00:00.0|5657             |PENDING_PAYMENT|Jul        |
|10      |2013-07-25 00:00:00.0|5648             |PE

In [8]:
# Keep only the orders placed in January 2014 (year-month formatted as yyyyMM)
(
    orders
    .filter(date_format("order_date", "yyyyMM") == "201401")
    .show(n=10, truncate=False)
)

+--------+---------------------+-----------------+---------------+
|order_id|order_date           |order_customer_id|order_status   |
+--------+---------------------+-----------------+---------------+
|25876   |2014-01-01 00:00:00.0|3414             |PENDING_PAYMENT|
|25877   |2014-01-01 00:00:00.0|5549             |PENDING_PAYMENT|
|25878   |2014-01-01 00:00:00.0|9084             |PENDING        |
|25879   |2014-01-01 00:00:00.0|5118             |PENDING        |
|25880   |2014-01-01 00:00:00.0|10146            |CANCELED       |
|25881   |2014-01-01 00:00:00.0|3205             |PENDING_PAYMENT|
|25882   |2014-01-01 00:00:00.0|4598             |COMPLETE       |
|25883   |2014-01-01 00:00:00.0|11764            |PENDING        |
|25884   |2014-01-01 00:00:00.0|7904             |PENDING_PAYMENT|
|25885   |2014-01-01 00:00:00.0|7253             |PENDING        |
+--------+---------------------+-----------------+---------------+
only showing top 10 rows



In [9]:
# Count the orders per year-month (yyyyMM)
(
    orders
    .groupBy(date_format('order_date', 'yyyyMM').alias('order_month'))
    .count()
    .show(n=10, truncate=False)
)

+-----------+-----+
|order_month|count|
+-----------+-----+
|201401     |5908 |
|201405     |5467 |
|201312     |5892 |
|201310     |5335 |
|201311     |6381 |
|201307     |1533 |
|201407     |4468 |
|201403     |5778 |
|201404     |5657 |
|201402     |5635 |
+-----------+-----+
only showing top 10 rows



In [10]:
# One-row, one-column DataFrame used as a scratch pad for trying out functions
l = [("X",)]

df = spark.createDataFrame(data=l, schema="dummy STRING")

In [11]:
# Print the schema of the one-column dummy DataFrame
df.printSchema()

root
 |-- dummy: string (nullable = true)



In [12]:
# Display the single dummy row
df.show(n=20, truncate=True)

+-----+
|dummy|
+-----+
|    X|
+-----+



In [13]:
# Evaluate current_date() against the dummy DataFrame to display today's date
(
    df
    .select(current_date().alias("current date"))
    .show()
)

+------------+
|current date|
+------------+
|  2021-06-22|
+------------+



##### collection of employees

In [14]:
# Sample rows: (employee_id, first_name, last_name, salary, nationality, phone_number, ssn)
employees = [
    (1, "Scott", "Tiger", 1000.0, "united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, "India", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, "united KINGDOM", "+44 111 111 1111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, "AUSTRALIA", "+61 987 654 3210", "789 12 6118"),
]

In [15]:
# Turn the employees list into a DataFrame with an explicit DDL schema
employeesDF = spark.createDataFrame(
    data=employees,
    schema="""employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, nationality STRING,
                    phone_number STRING, ssn STRING""",
)

In [16]:
# Display all employees without truncating long values
employeesDF.show(n=20, truncate=False)

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|nationality   |phone_number    |ssn        |
+-----------+----------+---------+------+--------------+----------------+-----------+
|1          |Scott     |Tiger    |1000.0|united states |+1 123 456 7890 |123 45 6789|
|2          |Henry     |Ford     |1250.0|India         |+91 234 567 8901|456 78 9123|
|3          |Nick      |Junior   |750.0 |united KINGDOM|+44 111 111 1111|222 33 4444|
|4          |Bill      |Gomes    |1500.0|AUSTRALIA     |+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [17]:
# Number of employees per nationality (values are grouped exactly as stored)
(
    employeesDF
    .groupBy("nationality")
    .count()
    .show()
)

+--------------+-----+
|   nationality|count|
+--------------+-----+
|         India|    1|
|united KINGDOM|    1|
| united states|    1|
|     AUSTRALIA|    1|
+--------------+-----+



In [18]:
# Sort by employee_id; a plain column-name string is enough for ascending order
(
    employeesDF
    .orderBy("employee_id")
    .show()
)

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



##### Convert string type to column type

Let's say we want to convert first name and last name to upper case

In [19]:
# Demonstration of a failure: on the Spark version used here, upper() accepts only
# Column objects, not plain column-name strings, and raises
#   Py4JError: ... Method upper([class java.lang.String]) does not exist
# The error is caught and printed so that the notebook still runs top to bottom.
try:
    employeesDF.select(upper("first_name"), upper("last_name")).show()
except Exception as err:  # Py4JError here; caught broadly because py4j is not imported
    print(f"{type(err).__name__}: {err}")

Py4JError: An error occurred while calling z:org.apache.spark.sql.functions.upper. Trace:
py4j.Py4JException: Method upper([class java.lang.String]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:339)
	at py4j.Gateway.invoke(Gateway.java:276)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



In [20]:
# col() converts a column-name string into a Column object, which upper() accepts
(
    employeesDF
    .select(upper(col("first_name")), upper(col("last_name")))
    .show()
)

+-----------------+----------------+
|upper(first_name)|upper(last_name)|
+-----------------+----------------+
|            SCOTT|           TIGER|
|            HENRY|            FORD|
|             NICK|          JUNIOR|
|             BILL|           GOMES|
+-----------------+----------------+



In [21]:
# Demonstration of a failure: desc() is a method of Column, not of Python str, so
# calling it on a plain string raises
#   AttributeError: 'str' object has no attribute 'desc'
# The error is caught and printed so that the notebook still runs top to bottom.
try:
    employeesDF.orderBy("employee_id".desc()).show()
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: 'str' object has no attribute 'desc'

In [22]:
# desc() is available once the column name is wrapped into a Column with col()
(
    employeesDF
    .orderBy(col("employee_id").desc())
    .show()
)

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [23]:
# Alternative - a Column can also be obtained by indexing the DataFrame with the column name.
# The sort key is upper(first_name); no alias is applied because naming a sort
# expression does not rename or add any column in the result.
(
    employeesDF
    .orderBy(upper(employeesDF['first_name']))
    .show()
)

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [24]:
# Alternative - a Column can also be obtained through attribute access on the DataFrame.
# The sort key is upper(first_name); no alias is applied because naming a sort
# expression does not rename or add any column in the result.
(
    employeesDF
    .orderBy(upper(employeesDF.first_name))
    .show()
)

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+--------------+----------------+-----------+



##### * Extract last 4 digits from the phone number.

##### * Extract last 4 digits from SSN.

In [25]:
# A negative start position makes substring() count from the end of the string,
# so (-4, 4) yields the last four characters
(
    employeesDF
    .select(
        "employee_id",
        "phone_number",
        "ssn",
        substring(col("phone_number"), -4, 4).alias("phone_num_last4digits"),
        substring(col("ssn"), -4, 4).alias("ssn_last4digits"),
    )
    .show(truncate=False)
)

+-----------+----------------+-----------+---------------------+---------------+
|employee_id|phone_number    |ssn        |phone_num_last4digits|ssn_last4digits|
+-----------+----------------+-----------+---------------------+---------------+
|1          |+1 123 456 7890 |123 45 6789|7890                 |6789           |
|2          |+91 234 567 8901|456 78 9123|8901                 |9123           |
|3          |+44 111 111 1111|222 33 4444|1111                 |4444           |
|4          |+61 987 654 3210|789 12 6118|3210                 |6118           |
+-----------+----------------+-----------+---------------------+---------------+



In [26]:
# df is the dummy DataFrame; split a literal sentence on spaces into an array of words
(
    df
    .select(split(lit("Hello World, how are you"), " "))
    .show(truncate=False)
)

+----------------------------------+
|split(Hello World, how are you,  )|
+----------------------------------+
|[Hello, World,, how, are, you]    |
+----------------------------------+



In [27]:
# Index into the array returned by split(); positions are 0-based, so [2] is the third word
(
    df
    .select(split(lit("Hello World, how are you"), " ")[2])
    .show(truncate=False)
)

+-------------------------------------+
|split(Hello World, how are you,  )[2]|
+-------------------------------------+
|how                                  |
+-------------------------------------+



In [28]:
# explode() emits one output row per element of the split array
(
    df
    .select(explode(split(lit("Hello World, how are you"), " ")).alias('word'))
    .show(truncate=False)
)

+------+
|word  |
+------+
|Hello |
|World,|
|how   |
|are   |
|you   |
+------+



##### Create an employees dataframe with multiple phone numbers for each employee

Now, count the number of phone numbers each employee has

In [29]:
# Same employees as before, but the phone field may hold several comma-separated numbers
employees2 = [
    (1, "Scott", "Tiger", 1000.0, "united states", "+1 123 456 7890,+1 234 567 8901", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, "India", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, "united KINGDOM", "+44 111 111 1111,+44 222 222 2222", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, "AUSTRALIA", "+61 987 654 3210,+61 876 543 2109", "789 12 6118"),
]

In [30]:
# DataFrame over employees2; phone_numbers holds the comma-separated list as one string
employeesDF2 = spark.createDataFrame(
    data=employees2,
    schema="""employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, nationality STRING,
                    phone_numbers STRING, ssn STRING""",
)

In [31]:
# Show only the id and the raw comma-separated phone numbers
(
    employeesDF2
    .select('employee_id', 'phone_numbers')
    .show(truncate=False)
)

+-----------+---------------------------------+
|employee_id|phone_numbers                    |
+-----------+---------------------------------+
|1          |+1 123 456 7890,+1 234 567 8901  |
|2          |+91 234 567 8901                 |
|3          |+44 111 111 1111,+44 222 222 2222|
|4          |+61 987 654 3210,+61 876 543 2109|
+-----------+---------------------------------+



In [32]:
# Split the list on commas, explode to one row per phone number,
# then count the rows per employee
(
    employeesDF2
    .select('employee_id', explode(split(col('phone_numbers'), ",")))
    .groupBy("employee_id")
    .count()
    .withColumnRenamed("count", "number of phone numbers")
    .show(truncate=False)
)

+-----------+-----------------------+
|employee_id|number of phone numbers|
+-----------+-----------------------+
|1          |2                      |
|3          |2                      |
|4          |2                      |
|2          |1                      |
+-----------+-----------------------+



In [33]:
# Single value padded with leading and trailing spaces, used to try the trim functions
l = [
    ("   Hello.    ",),
]

In [34]:
# Build the DataFrame with an inferred schema, then name its only column "dummy"
df2 = (
    spark
    .createDataFrame(l)
    .toDF("dummy")
)

In [35]:
# Display the padded value
df2.show(n=20, truncate=True)

+-------------+
|        dummy|
+-------------+
|   Hello.    |
+-------------+



In [36]:
# ltrim strips leading spaces, rtrim strips trailing spaces, trim strips both
(
    df2
    .select(
        "*",
        ltrim(col("dummy")).alias("ltrim"),
        rtrim(col("dummy")).alias("rtrim"),
        trim(col("dummy")).alias("trim"),
    )
    .show()
)

+-------------+----------+---------+------+
|        dummy|     ltrim|    rtrim|  trim|
+-------------+----------+---------+------+
|   Hello.    |Hello.    |   Hello.|Hello.|
+-------------+----------+---------+------+



In [37]:
# When no trimStr is given, spaces are trimmed by default.
# The rtrim column first strips trailing spaces, then strips the trailing '.'
(
    df2
    .withColumn("ltrim", expr("ltrim(dummy)"))
    .withColumn("rtrim", expr("rtrim('.', rtrim(dummy))"))
    .withColumn("trim", trim(col("dummy")))
    .show()
)

+-------------+----------+--------+------+
|        dummy|     ltrim|   rtrim|  trim|
+-------------+----------+--------+------+
|   Hello.    |Hello.    |   Hello|Hello.|
+-------------+----------+--------+------+



In [38]:
# Same trimming expressed with the SQL TRIM(LEADING | TRAILING | BOTH ... FROM ...) syntax
(
    df2
    .withColumn("ltrim", expr("trim(LEADING ' ' FROM dummy)"))
    .withColumn("rtrim", expr("trim(TRAILING '.' FROM rtrim(dummy))"))
    .withColumn("trim", expr("trim(BOTH ' ' FROM dummy)"))
    .show()
)

+-------------+----------+--------+------+
|        dummy|     ltrim|   rtrim|  trim|
+-------------+----------+--------+------+
|   Hello.    |Hello.    |   Hello|Hello.|
+-------------+----------+--------+------+



#### Dealing with null values

* We can use `coalesce` to return first non null value.

* We also have traditional SQL style functions such as `nvl`. However, they can only be used through `expr` or `selectExpr`.

In [39]:
# Employees with a bonus field: (employee_id, first_name, last_name, salary, bonus,
# nationality, phone_number, ssn).
# bonus is loaded into a STRING column, so it is supplied as text ("10"), as a missing
# value (None) and as an empty string ('') instead of relying on an int being
# converted to a string implicitly.
employees2 = [
    (1, "Scott", "Tiger", 1000.0, "10", "united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, None, "India", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, '', "united KINGDOM", "+44 111 111 1111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, "10", "AUSTRALIA", "+61 987 654 3210", "789 12 6118"),
]

In [40]:
# DataFrame over the employees that carry a bonus; bonus is declared as STRING
employeesDF2 = spark.createDataFrame(
    data=employees2,
    schema="""employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, bonus STRING, nationality STRING,
                    phone_number STRING, ssn STRING""",
)

In [41]:
# Look at the bonus column in the output:
# it contains an empty string '' as well as a null
employeesDF2.show(n=20, truncate=True)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [42]:
# Replace the null with 0 using coalesce - first attempt.
# Demonstration of a failure: coalesce() accepts only column names or Column objects,
# so the bare Python int 0 raises
#   TypeError: Invalid argument, not a string or column: 0 of type <class 'int'>.
#   For column literals, use 'lit', ...
# The error is caught and printed so that the notebook still runs top to bottom.
try:
    employeesDF2.withColumn('bonus', coalesce('bonus', 0)).show()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: Invalid argument, not a string or column: 0 of type <class 'int'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

In [43]:
# Wrapping the fallback in lit() makes it a Column, so coalesce accepts it.
# Only the null is replaced by 0; the empty string '' is left untouched.
(
    employeesDF2
    .withColumn('bonus', coalesce(col('bonus'), lit("0")))
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|    0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [44]:
# Now let's replace the empty string '' by 0 as well, using coalesce.
#
# Remember that when a value cannot be cast, the cast returns null.
# So casting bonus to int turns the empty string '' into null, and coalesce
# then replaces every null with 0.
# The fallback is the integer literal lit(0) so that it matches the int type of the
# cast column; a string literal would make the result a string column again.
(
    employeesDF2
    .withColumn('bonus', coalesce(col('bonus').cast("int"), lit(0)))
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|    0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|    0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



##### Using nvl with expr/selectExpr

In [46]:
# nvl(bonus, 0) substitutes 0 where bonus is null; it is written as a SQL expression via expr()
(
    employeesDF2
    .withColumn('bonus', expr("nvl(bonus, 0)"))
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|    0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [47]:
# Replace the empty string '' by 0 as well, combining nvl with nullif.
# nullif(a, b) returns null when a = b and a otherwise, so '' first becomes null
# and nvl then turns every null into 0.
(
    employeesDF2
    .withColumn('bonus', expr("nvl(nullif(bonus, ''), 0)"))
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|    0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|    0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



##### get updated salary: salary + salary * bonus /100

In [48]:
# updated salary = salary + salary * bonus / 100, where a null or empty bonus counts as 0
(
    employeesDF2
    .withColumn(
        'updated salary',
        col('salary') + (col('salary') * coalesce(col('bonus').cast('int'), lit(0)) / 100),
    )
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+--------------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|updated salary|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+--------------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|        1100.0|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|        1250.0|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|         750.0|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|        1650.0|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+--------------+



##### Update the bonus column using CASE WHEN

In [49]:
# SQL CASE WHEN through expr(): a null or empty bonus becomes 0, any other value is kept
(
    employeesDF2
    .withColumn(
        'bonus',
        expr("""CASE WHEN bonus IS NULL OR bonus = '' THEN 0
                                ELSE bonus
                                END"""),
    )
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|    0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|    0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [50]:
# Column-API equivalent of CASE WHEN: when(...).otherwise(...).
# A null or empty bonus becomes 0, any other value is kept.
(
    employeesDF2
    .withColumn(
        'bonus',
        when(col('bonus').isNull() | (col('bonus') == lit('')), 0).otherwise(col('bonus')),
    )
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|    0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|    0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+

