In [47]:
import os

import findspark

# findspark.init() puts the chosen Spark installation's Python libraries on
# sys.path, so it has to run before the pyspark imports below.
# Prefer the SPARK_HOME environment variable so the notebook is not tied to a
# single machine; fall back to the original local installation when it is unset
# or empty.
findspark.init(
    os.environ.get("SPARK_HOME") or "/home/shashank/spark-2.3.2-bin-hadoop2.7"
)
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean

In [48]:
# Reuse the active session if there is one; otherwise start one named "Miss".
spark = (
    SparkSession.builder
    .appName("Miss")
    .getOrCreate()
)

In [49]:
# Load the sample CSV: the first row holds the column names and the column
# types are inferred from the data.
df = spark.read.csv(
    "Spark_DataFrames/ContainsNull.csv",
    header=True,
    inferSchema=True,
)

In [50]:
# Register the DataFrame as a temporary view named 'null_sql' so that it can be
# queried with spark.sql() further down.
df.createOrReplaceTempView('null_sql')

In [51]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [52]:
# Display the rows; note the nulls in the Name and Sales columns.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [53]:
# Drop missing data: with how="any" (the default) a row is removed as soon as
# a single one of its values is null.
df.na.drop(how="any").show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [54]:
# Specify a threshold: rows with fewer non-null values than `thresh` are dropped.
df.na.drop(thresh=2).show()  # keep rows with at least 2 non-null values

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [55]:
# Drop only the rows in which every value is null.
df.na.drop(how="all").show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [56]:
# Restrict the null check to specific columns: a row is dropped only when
# both Name and Sales are null.
df.na.drop(how="all", subset=["Name", "Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



# SQL

In [57]:
# SQL version of a null filter: keep only the rows whose Name is present.
spark.sql(
    "FROM null_sql SELECT * WHERE Name IS NOT NULL"
).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp4|Cindy|456.0|
+----+-----+-----+



In [58]:
# Keep rows where at least one of Name / Sales is present, i.e. drop only the
# rows where both are null.
spark.sql(
    "FROM null_sql SELECT * WHERE Name IS NOT NULL OR Sales IS NOT NULL"
).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



**Fill in missing values**

In [59]:
# Replace null names with a placeholder; only the Name column is touched.
df1 = df.na.fill("No Name", subset=["Name"])

In [60]:
# Ask Spark SQL for the mean of the Sales column (null values are ignored by
# the aggregate); collect() brings the single-row result back to the driver
# as a list of Rows.
mean_values = spark.sql("FROM null_sql SELECT mean(Sales)").collect()

In [61]:
# collect() returned a list of Rows; the first column of the first row is the mean.
mean_sales = mean_values[0][0]

In [65]:
# Fill the remaining nulls in Sales with the column mean computed above.
df2 = df1.na.fill(mean_sales, subset=["Sales"])

In [66]:
# Show the result: no nulls remain in Name or Sales.
df2.show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John|400.5|
|emp2|No Name|400.5|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [67]:
# Combine both fills in one expression: Name gets a placeholder and Sales gets
# the column mean, computed inline with a SQL query.
(
    df.na.fill("No Name", subset=["Name"])
    .na.fill(
        spark.sql("FROM null_sql SELECT mean(Sales)").collect()[0][0],
        subset=["Sales"],
    )
    .show()
)

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John|400.5|
|emp2|No Name|400.5|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [72]:
# Same result using only the DataFrame API: the mean comes from
# select(mean(...)) instead of a SQL query.
(
    df.na.fill("No Name", subset=["Name"])
    .na.fill(
        df.select(mean(df["Sales"])).collect()[0][0],
        subset=["Sales"],
    )
    .show()
)

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John|400.5|
|emp2|No Name|400.5|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+

