References

https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameNaFunctions<br>

In [2]:
from pyspark.sql import *

#### Get a dataframe for notebook tasks

In [4]:
%run ./adb_3_ingest_to_df

#### Clean up nulls and bad values

In [6]:
# Count the rows whose ArrDelay value is null, using a Column expression as the predicate.
(
    df_flights_full
    .where(df_flights_full["ArrDelay"].isNull())
    .count()
)

In [7]:
# Count the rows whose Carrier value is not null.
# Alternate syntax: the predicate is written as a SQL expression string rather than a Column.
(
    df_flights_full
    .filter("Carrier is not NULL")
    .count()
)

In [8]:
# how="all" drops a row only when every one of its columns is null.
df1 = (
    df_flights_full
    .na
    .drop(how="all")
)
df1.count()  # number of rows left after the drop

In [9]:
# how="any" drops a row as soon as at least one of its columns is null.
df2 = (
    df_flights_full
    .na
    .drop(how="any")
)
df2.count()  # number of rows left after the drop

In [10]:
# Replace nulls with 0, restricted to the integer column OriginAirportID.
df3 = df_flights_full.na.fill(value=0, subset=["OriginAirportID"])

In [11]:
# Swap two long place names for short codes, one na.replace() call per value.
# NOTE(review): both calls target a column named "state", while the other cells of
# this notebook work with "OriginState" — confirm that "state" is the intended column.
df4 = (
    df3
    .na.replace("Palau", "PW", subset=["state"])
    .na.replace("Federated States of Micronesia", "FSM", subset=["state"])
)

df4.count()

In [12]:
group_by_column = "OriginState"
count_column = "n"

# Count the rows of df4 per group_by_column value, rename the count column
# (confusing with describe() otherwise), and sort by the grouping column in
# ascending order.
# Rows are first filtered to US states only so the Databricks map visualization
# can be used. The filter is built from group_by_column so it always tests the
# same column the rows are grouped on.
df4_usstates_only = (
    df4
    .filter("{} NOT IN ('DC', 'FSM', 'PR', 'PW', 'TT', 'VI')".format(group_by_column))
    .groupBy(group_by_column)
    .count()
    .withColumnRenamed("count", count_column)
    .sort(group_by_column, ascending=True)
)

In [13]:
# Render the per-state counts computed in the previous cell (intended for the Databricks map visualization).
display(df4_usstates_only)