In [88]:
# Build (or reuse, if one already exists) a SparkSession that runs locally.
# The "local[*]" master keeps everything on this machine rather than a cluster.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("spark_on_docker")
    .getOrCreate()
)

In [89]:
# Read a single day of the retail CSV data. The "header" option takes column
# names from the first row; "inferSchema" lets Spark derive column types from
# the data instead of reading every column as a string.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("work/TheDefinitiveGuide/Spark-The-Definitive-Guide/data/retail-data/by-day/2010-12-01.csv")
)

# Show the resulting column names and types.
df.printSchema()

# Register the DataFrame under the name "dfTable" so that later cells can
# query it through spark.sql(...).
df.createOrReplaceTempView("dfTable")


root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [94]:
# coalesce yields, for each row, the first non-null value among the given
# columns: here Description, falling back to CustomerId.
# `col` is imported together with `coalesce` so that this cell runs on a fresh
# kernel; with only `coalesce` imported the call below raises NameError.
from pyspark.sql.functions import coalesce, col

df.select(
    coalesce(col("Description"), col("CustomerId")),
    "Description",
    "CustomerId",
).show()

+---------------------------------+--------------------+----------+
|coalesce(Description, CustomerId)|         Description|CustomerId|
+---------------------------------+--------------------+----------+
|             WHITE HANGING HEA...|WHITE HANGING HEA...|   17850.0|
|              WHITE METAL LANTERN| WHITE METAL LANTERN|   17850.0|
|             CREAM CUPID HEART...|CREAM CUPID HEART...|   17850.0|
|             KNITTED UNION FLA...|KNITTED UNION FLA...|   17850.0|
|             RED WOOLLY HOTTIE...|RED WOOLLY HOTTIE...|   17850.0|
|             SET 7 BABUSHKA NE...|SET 7 BABUSHKA NE...|   17850.0|
|             GLASS STAR FROSTE...|GLASS STAR FROSTE...|   17850.0|
|             HAND WARMER UNION...|HAND WARMER UNION...|   17850.0|
|             HAND WARMER RED P...|HAND WARMER RED P...|   17850.0|
|             ASSORTED COLOUR B...|ASSORTED COLOUR B...|   13047.0|
|             POPPY'S PLAYHOUSE...|POPPY'S PLAYHOUSE...|   13047.0|
|             POPPY'S PLAYHOUSE...|POPPY'S PLAYH

In [98]:
# Demonstrate the SQL null-handling helpers against the registered view.
# As the output below shows:
#   ifnull(null, x) / nvl(null, x) -> x, because the first argument is null
#   nullif(a, a)                   -> null, because both arguments are equal
#   nvl2(v, x, y)                  -> x, because v is not null
# The adjacent string literals are concatenated by Python into one SQL string;
# each piece is kept exactly as written (including no space before FROM).
spark.sql(
    "SELECT "
    "ifnull(null, 'return_value'),"
    "nullif('value', 'value'),"
    "nvl(null, 'return_value'),"
    "nvl2('not_null', 'return_value', 'else_value')"
    "FROM dfTable LIMIT 1"
).show()


+--------------------------+--------------------+-----------------------+----------------------------------------+
|ifnull(NULL, return_value)|nullif(value, value)|nvl(NULL, return_value)|nvl2(not_null, return_value, else_value)|
+--------------------------+--------------------+-----------------------+----------------------------------------+
|              return_value|                null|           return_value|                            return_value|
+--------------------------+--------------------+-----------------------+----------------------------------------+



The simplest function is `drop`, which removes rows that contain nulls.
The default is to drop any row in which any value is null:

In [105]:
# Drop every row that has a null in at least one column.
# The result is only displayed here, not assigned to a name.
df.na.drop(how="any")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

Specifying "any" as an argument drops a row if any of the values are null.
Using "all" drops the row only if all values are null or NaN for that row:

In [108]:
# Drop a row only when every one of its columns is null.
# The result is only displayed here, not assigned to a name.
df.na.drop(how="all")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

We can also apply this to certain sets of columns by passing in an array of columns:

In [109]:
# Restrict the null check to the listed columns: a row is dropped only when
# both StockCode and InvoiceNo are null.
df.na.drop(how="all", subset=["StockCode", "InvoiceNo"])

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

We can also fill null values using a Python dict (the counterpart of a Scala Map), where the key is the column name and the value is the
value we would like to use to fill null values:

In [110]:
# Per-column fill values: each key is a column name, each value is what nulls
# in that column are replaced with.
# NOTE(review): StockCode is inferred as a string column in the schema printed
# earlier, while the fill value here is the integer 5 — confirm this is the
# intended replacement.
fill_cols_vals = dict(StockCode=5, Description="No Value")

# The filled DataFrame is only displayed here, not assigned to a name.
df.na.fill(fill_cols_vals)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

replace

A common use case is to replace all values in a certain column according to their current value. The only requirement is that this value be the same type as the original value:

In [111]:
# In the Description column, swap empty strings for the placeholder "UNKNOWN".
# The result is only displayed here, not assigned to a name.
df.na.replace(to_replace=[""], value=["UNKNOWN"], subset="Description")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]