In [1]:
from pyspark.sql import SparkSession
# Build the Spark session used by every cell below (getOrCreate returns an
# already-running session if there is one).
spark = (
    SparkSession.builder
    .appName('HandlingMissingData')
    .getOrCreate()
)

In [2]:
# Evaluate the session as the cell's last expression so the notebook renders it.
spark

In [3]:
# Read the sample CSV: the first line supplies the column names and the
# column types are inferred from the values.
df = spark.read.csv(
    "../data/datawithnull.csv",
    header=True,
    inferSchema=True,
)
df.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| David|  50|        20| 20000|
|Daniel|  25|         2|  3000|
|  John|  30|        13|  5000|
| Smith|  43|        15| 60000|
|   Joe|  40|      null| 50000|
|  Kyle|null|        60|  null|
|  null|  50|        30|  null|
|  null|null|        10|  null|
|  null|null|      null|  null|
|  null|null|      null|  null|
|  null|null|      null|  null|
|  null|null|      null|  null|
+------+----+----------+------+



In [4]:
### DROPPING A COLUMN
# drop() returns a new DataFrame without the named column; df is not modified.
without_name = df.drop('Name')
without_name.show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  50|        20| 20000|
|  25|         2|  3000|
|  30|        13|  5000|
|  43|        15| 60000|
|  40|      null| 50000|
|null|        60|  null|
|  50|        30|  null|
|null|        10|  null|
|null|      null|  null|
|null|      null|  null|
|null|      null|  null|
|null|      null|  null|
+----+----------+------+



In [5]:
# Show df again: it still has the Name column, because the previous drop()
# produced a new DataFrame instead of changing df.
df.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| David|  50|        20| 20000|
|Daniel|  25|         2|  3000|
|  John|  30|        13|  5000|
| Smith|  43|        15| 60000|
|   Joe|  40|      null| 50000|
|  Kyle|null|        60|  null|
|  null|  50|        30|  null|
|  null|null|        10|  null|
|  null|null|      null|  null|
|  null|null|      null|  null|
|  null|null|      null|  null|
|  null|null|      null|  null|
+------+----+----------+------+



In [6]:
# With no arguments, na.drop() removes every row that contains at least one null.
complete_rows = df.na.drop()
complete_rows.show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| David| 50|        20| 20000|
|Daniel| 25|         2|  3000|
|  John| 30|        13|  5000|
| Smith| 43|        15| 60000|
+------+---+----------+------+



In [7]:
### na.drop() HAS 3 OPTIONS
### how='any', thresh=None, subset=None


In [8]:
### how='any' IS THE DEFAULT
# A row is dropped as soon as any one of its values is null, so the result
# matches the argument-less na.drop() call.
rows_without_any_null = df.na.drop(how='any')
rows_without_any_null.show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| David| 50|        20| 20000|
|Daniel| 25|         2|  3000|
|  John| 30|        13|  5000|
| Smith| 43|        15| 60000|
+------+---+----------+------+



In [9]:
# how='all' drops a row only when every value in it is null; rows that are
# partially filled are kept.
rows_not_all_null = df.na.drop(how='all')
rows_not_all_null.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| David|  50|        20| 20000|
|Daniel|  25|         2|  3000|
|  John|  30|        13|  5000|
| Smith|  43|        15| 60000|
|   Joe|  40|      null| 50000|
|  Kyle|null|        60|  null|
|  null|  50|        30|  null|
|  null|null|        10|  null|
+------+----+----------+------+



In [10]:
### THRESHOLD
# thresh=3 keeps only the rows that have at least 3 non-null values.
# When thresh is given it overrides `how`, so `how` is deliberately not
# passed here: supplying it would wrongly suggest that it affects the result.
df.na.drop(thresh=3).show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| David| 50|        20| 20000|
|Daniel| 25|         2|  3000|
|  John| 30|        13|  5000|
| Smith| 43|        15| 60000|
|   Joe| 40|      null| 50000|
+------+---+----------+------+



In [11]:
### SUBSET
# Only the columns listed in subset are checked for nulls: rows with a null
# Name are dropped, while nulls in the other columns are ignored.
rows_with_name = df.na.drop(how='any', subset=['Name'])
rows_with_name.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| David|  50|        20| 20000|
|Daniel|  25|         2|  3000|
|  John|  30|        13|  5000|
| Smith|  43|        15| 60000|
|   Joe|  40|      null| 50000|
|  Kyle|null|        60|  null|
+------+----+----------+------+



In [12]:
### FILLING THE MISSING VALUES
# A string replacement is applied to string columns only: Name is filled,
# while the numeric columns (Age, Experience, Salary) keep their nulls.
names_filled = df.na.fill('MISSING_VALUE')
names_filled.show()


+-------------+----+----------+------+
|         Name| Age|Experience|Salary|
+-------------+----+----------+------+
|        David|  50|        20| 20000|
|       Daniel|  25|         2|  3000|
|         John|  30|        13|  5000|
|        Smith|  43|        15| 60000|
|          Joe|  40|      null| 50000|
|         Kyle|null|        60|  null|
|MISSING_VALUE|  50|        30|  null|
|MISSING_VALUE|null|        10|  null|
|MISSING_VALUE|null|      null|  null|
|MISSING_VALUE|null|      null|  null|
|MISSING_VALUE|null|      null|  null|
|MISSING_VALUE|null|      null|  null|
+-------------+----+----------+------+



In [21]:
# Same fill as before with a shorter placeholder; again only the string
# column Name receives the replacement value.
names_marked = df.na.fill('MV')
names_marked.show()


+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| David|  50|        20| 20000|
|Daniel|  25|         2|  3000|
|  John|  30|        13|  5000|
| Smith|  43|        15| 60000|
|   Joe|  40|      null| 50000|
|  Kyle|null|        60|  null|
|    MV|  50|        30|  null|
|    MV|null|        10|  null|
|    MV|null|      null|  null|
|    MV|null|      null|  null|
|    MV|null|      null|  null|
|    MV|null|      null|  null|
+------+----+----------+------+



In [23]:
from pyspark.ml.feature import Imputer

# Numeric columns to impute; each result is written to a "<column>_imputed"
# column so the original values stay visible next to the imputed ones.
numeric_cols = ['Age', 'Experience', 'Salary']
imputer = Imputer(
    strategy="mean",
    inputCols=numeric_cols,
    outputCols=["{}_imputed".format(name) for name in numeric_cols],
)

In [24]:
# fit() computes the replacement value for each input column from df;
# transform() then appends the *_imputed columns with the nulls filled in.
imputer_model = imputer.fit(df)
imputer_model.transform(df).show()

+------+----+----------+------+-----------+------------------+--------------+
|  Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+------+----+----------+------+-----------+------------------+--------------+
| David|  50|        20| 20000|         50|                20|         20000|
|Daniel|  25|         2|  3000|         25|                 2|          3000|
|  John|  30|        13|  5000|         30|                13|          5000|
| Smith|  43|        15| 60000|         43|                15|         60000|
|   Joe|  40|      null| 50000|         40|                21|         50000|
|  Kyle|null|        60|  null|         39|                60|         27600|
|  null|  50|        30|  null|         50|                30|         27600|
|  null|null|        10|  null|         39|                10|         27600|
|  null|null|      null|  null|         39|                21|         27600|
|  null|null|      null|  null|         39|                21|  

In [25]:
# Rebuild the imputer with the median strategy: nulls are now replaced with
# each column's median instead of its mean.
median_cols = ['Age', 'Experience', 'Salary']
imputer = Imputer(
    strategy="median",
    inputCols=median_cols,
    outputCols=["{}_imputed".format(name) for name in median_cols],
)
imputer.fit(df).transform(df).show()

+------+----+----------+------+-----------+------------------+--------------+
|  Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+------+----+----------+------+-----------+------------------+--------------+
| David|  50|        20| 20000|         50|                20|         20000|
|Daniel|  25|         2|  3000|         25|                 2|          3000|
|  John|  30|        13|  5000|         30|                13|          5000|
| Smith|  43|        15| 60000|         43|                15|         60000|
|   Joe|  40|      null| 50000|         40|                15|         50000|
|  Kyle|null|        60|  null|         40|                60|         20000|
|  null|  50|        30|  null|         50|                30|         20000|
|  null|null|        10|  null|         40|                10|         20000|
|  null|null|      null|  null|         40|                15|         20000|
|  null|null|      null|  null|         40|                15|  