# Working with Null (NA) Values

In [1]:
from pyspark import SparkContext

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType

In [2]:
# Reuse the already-running SparkContext if there is one; otherwise start a new one.
sc = SparkContext.getOrCreate()

In [4]:
# Get the existing SparkSession, or build one if none exists yet.
spark = SparkSession.builder.getOrCreate()

------

In [5]:
# Hand-build a small demo DataFrame: each Row is one server observation.
# The column names are taken from the Row keyword arguments.
df = sc.parallelize(
    [
        Row(server_name='101 server', cpu_utilization=85, session_count=80),
        Row(server_name='101 server', cpu_utilization=80, session_count=90),
        Row(server_name='102 server', cpu_utilization=85, session_count=80),
        Row(server_name='102 server', cpu_utilization=85, session_count=80),
    ]
).toDF()

In [6]:
# Print the DataFrame as a text table to check the rows that were just created.
df.show()

+-----------+---------------+-------------+
|server_name|cpu_utilization|session_count|
+-----------+---------------+-------------+
| 101 server|             85|           80|
| 101 server|             80|           90|
| 102 server|             85|           80|
| 102 server|             85|           80|
+-----------+---------------+-------------+



# Create a new DataFrame with one additional column

In [7]:
# Derive a new DataFrame from df with one extra column, 'na_column',
# that is null on every row. lit(None) supplies the null value and the
# cast gives the column an explicit string type.
df_na = df.withColumn(
    'na_column',
    lit(None).cast(StringType()),
)

In [8]:
# Confirm that na_column was added and is null on every row.
df_na.show()

+-----------+---------------+-------------+---------+
|server_name|cpu_utilization|session_count|na_column|
+-----------+---------------+-------------+---------+
| 101 server|             85|           80|     null|
| 101 server|             80|           90|     null|
| 102 server|             85|           80|     null|
| 102 server|             85|           80|     null|
+-----------+---------------+-------------+---------+



# Filling NA values

In [9]:
# Replace nulls with the string 'A' and display the result.
# A string fill value is applied to string columns only, so the integer
# columns are left untouched. df_na itself is not modified.
df_na.na.fill('A').show()

+-----------+---------------+-------------+---------+
|server_name|cpu_utilization|session_count|na_column|
+-----------+---------------+-------------+---------+
| 101 server|             85|           80|        A|
| 101 server|             80|           90|        A|
| 102 server|             85|           80|        A|
| 102 server|             85|           80|        A|
+-----------+---------------+-------------+---------+



# Create a new DataFrame with both NA and non-NA values

In [10]:
# Combine the filled copy of df_na with the original df_na so the result
# holds rows where na_column is 'A' alongside rows where it is still null.
# union() matches columns by position; both inputs share df_na's schema.
df2 = (
    df_na.na.fill('A')
    .union(df_na)
)

In [11]:
# Show the combined DataFrame, which holds both the filled and the null rows.
df2.show()

+-----------+---------------+-------------+---------+
|server_name|cpu_utilization|session_count|na_column|
+-----------+---------------+-------------+---------+
| 101 server|             85|           80|        A|
| 101 server|             80|           90|        A|
| 102 server|             85|           80|        A|
| 102 server|             85|           80|        A|
| 101 server|             85|           80|     null|
| 101 server|             80|           90|     null|
| 102 server|             85|           80|     null|
| 102 server|             85|           80|     null|
+-----------+---------------+-------------+---------+



# Drop NA values

In [13]:
# Display df2 without the rows that contain a null.
# With the default how='any', a row is dropped if any of its columns is null.
df2.dropna().show()

+-----------+---------------+-------------+---------+
|server_name|cpu_utilization|session_count|na_column|
+-----------+---------------+-------------+---------+
| 101 server|             85|           80|        A|
| 101 server|             80|           90|        A|
| 102 server|             85|           80|        A|
| 102 server|             85|           80|        A|
+-----------+---------------+-------------+---------+



### Drop NA values using SQL

In [20]:
# Register df2 as a temporary view so it can be queried by name through spark.sql().
df2.createOrReplaceTempView('vw_na_table')

In [21]:
# Sanity check: selecting everything from the view returns the null rows too.
spark.sql('SELECT * FROM vw_na_table').show()

+-----------+---------------+-------------+---------+
|server_name|cpu_utilization|session_count|na_column|
+-----------+---------------+-------------+---------+
| 101 server|             85|           80|        A|
| 101 server|             80|           90|        A|
| 102 server|             85|           80|        A|
| 102 server|             85|           80|        A|
| 101 server|             85|           80|     null|
| 101 server|             80|           90|     null|
| 102 server|             85|           80|     null|
| 102 server|             85|           80|     null|
+-----------+---------------+-------------+---------+



In [23]:
# Keep only the rows whose na_column is not NULL, using SQL on vw_na_table.
# The query is a triple-quoted string rather than backslash continuations
# inside a single-quoted literal: with the latter, the SQL keywords stay
# separated only because the continuation lines start with spaces.
spark.sql("""
    SELECT *
    FROM vw_na_table
    WHERE na_column IS NOT NULL
""").show()

+-----------+---------------+-------------+---------+
|server_name|cpu_utilization|session_count|na_column|
+-----------+---------------+-------------+---------+
| 101 server|             85|           80|        A|
| 101 server|             80|           90|        A|
| 102 server|             85|           80|        A|
| 102 server|             85|           80|        A|
+-----------+---------------+-------------+---------+

