## Spark DataFrames Missing Data - Tutorial 4
- Robert Esteves
- 2018-10-29

In [1]:
import os

from pyspark.sql import SparkSession

In [2]:
# Start the Spark session for this tutorial (an existing session is reused if present).
spark = (
    SparkSession.builder
    .appName('miss')
    .getOrCreate()
)

In [3]:
# Location of the sample CSV. Set the CONTAINS_NULL_CSV environment variable to
# point at your own copy; otherwise the original author's local path is used.
source_file = os.environ.get(
    'CONTAINS_NULL_CSV',
    '/home/robert/Downloads/Python-and-Spark-for-Big-Data-master/Spark_DataFrames/ContainsNull.csv',
)

In [4]:
# Load the CSV: the first line supplies the column names, and column types
# are inferred from the data instead of defaulting to strings.
df = spark.read.csv(
    source_file,
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the loaded rows; note the nulls in the Name and Sales columns.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [6]:
# Inspect the column types that were inferred while reading the CSV.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



## Option 1: Drop records missing data

In [7]:
# With no arguments, every row containing at least one null is dropped;
# only emp4 has a value in every column.
df.na.drop().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



## Specify a threshold for the drop operation. For example, with `thresh=2` a record must have at least two non-null values to be retained in the dataset

In [8]:
df.na.drop(thresh=2).show()  # emp2 is dropped: its only non-null value is Id

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Drop any record that has <i><u>any</u></i> null values

In [9]:
# how='any': a row is dropped as soon as one of its columns is null.
# This gives the same result as the plain drop() call shown earlier.
df.na.drop(how='any').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



## Drop any record where all columns are null

In [10]:
# how='all': a row is dropped only when every column is null.
# Every row here has an Id, so nothing is removed.
df.na.drop(how='all').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Subset parameter: only check the listed columns for nulls

In [12]:
df.na.drop(subset=['Sales']).show()  # only Sales is checked: rows with a null Sales (emp1, emp2) are dropped; a null Name is ignored

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Option 2: Fill in missing values

In [14]:
from pyspark.sql.functions import mean

In [15]:
# Aggregate the Sales column and bring the result back to the driver.
# collect() returns a list of Row objects (here a single Row). Null Sales
# values are left out of the average: (345.0 + 456.0) / 2 = 400.5.
mean_val = df.select(mean(df['Sales'])).collect()

In [16]:
# Show the raw collect() result: a one-element list holding a Row.
mean_val

[Row(avg(Sales)=400.5)]

## Using the indexing method

In [17]:
# First [0] picks the only Row in the list; second [0] picks its first field.
mean_val[0][0]

400.5

## Using the dictionary method

In [20]:
# Keep the single Row so its fields can be looked up by name below.
results = mean_val[0]

In [22]:
# Convert the Row to a plain dict keyed by column name.
results.asDict()

{'avg(Sales)': 400.5}

In [23]:
# The aggregate column is named 'avg(Sales)', so that is the key to look up.
results.asDict()['avg(Sales)']

400.5

In [24]:
# Store the mean as a plain Python value so it can be passed to na.fill().
mean_sales = results.asDict()['avg(Sales)']

In [25]:
# Confirm the stored value.
mean_sales

400.5

In [28]:
# Check the Python type of the value; it is a float, matching the double-typed Sales column.
type(mean_sales)

float

In [27]:
# Replace null Sales values with the column mean; the subset restricts the
# fill to Sales, so null Name values are left as they are.
df.na.fill(value=mean_sales, subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

