In [2]:
### PySpark: Handling Missing Values


In [3]:
from pyspark.sql import SparkSession

In [4]:
# Start the Spark session used by every later cell, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [7]:
# Load the sample CSV: the first line supplies the column names, and the
# column types are inferred from the data rather than all read as strings.
df_pyspark = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)

In [8]:
# Print the loaded rows as a text table so the NULL cells are visible.
df_pyspark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  25|         8| 25000|
|  Sandesh|  28|         4| 20000|
|    Sunny|  24|         3| 20000|
|     Paul|  21|         1| 15000|
|    Harsh|  23|         2| 18000|
|   Shuvam|NULL|      NULL| 40000|
|   Mahesh|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [18]:
### Drop a column (example left commented out)
# df_pyspark.drop('Name').show()

In [20]:
### Drop rows containing nulls (with no arguments, na.drop() defaults to how='any')
# df_pyspark.na.drop().show()

In [31]:
### how == 'all'
# A row is removed only when every one of its columns is null.
# No row in this data is entirely null, so all nine rows are kept.
(
    df_pyspark.na
    .drop(how='all')
    .show()
)


+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  25|         8| 25000|
|  Sandesh|  28|         4| 20000|
|    Sunny|  24|         3| 20000|
|     Paul|  21|         1| 15000|
|    Harsh|  23|         2| 18000|
|   Shuvam|NULL|      NULL| 40000|
|   Mahesh|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [33]:
## how == 'any'
# A row is removed as soon as at least one of its columns is null,
# so only the fully populated rows remain.
(
    df_pyspark.na
    .drop(how='any')
    .show()
)

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 25|         8| 25000|
|  Sandesh| 28|         4| 20000|
|    Sunny| 24|         3| 20000|
|     Paul| 21|         1| 15000|
|    Harsh| 23|         2| 18000|
|   Mahesh| 34|        10| 38000|
+---------+---+----------+------+



In [34]:
## threshold
# thresh=2 keeps only the rows that have at least two non-null values:
# the row whose only non-null value is Age is dropped, while the row with
# just Name and Salary filled in is kept.
# Note: when `thresh` is given it overrides `how`, so `how` is deliberately
# not passed here -- it would have no effect.
df_pyspark.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  25|         8| 25000|
|  Sandesh|  28|         4| 20000|
|    Sunny|  24|         3| 20000|
|     Paul|  21|         1| 15000|
|    Harsh|  23|         2| 18000|
|   Shuvam|NULL|      NULL| 40000|
|   Mahesh|  34|        10| 38000|
+---------+----+----------+------+



In [35]:
## Subset
# Only the Experience column is checked for nulls; a null in any other
# column does not cause the row to be dropped.
(
    df_pyspark.na
    .drop(how='any', subset=['Experience'])
    .show()
)

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 25|         8| 25000|
|  Sandesh| 28|         4| 20000|
|    Sunny| 24|         3| 20000|
|     Paul| 21|         1| 15000|
|    Harsh| 23|         2| 18000|
|   Mahesh| 34|        10| 38000|
+---------+---+----------+------+



In [42]:
## Filling the missing values
# Replace nulls in the Name column only with a placeholder string.
# Variant without a column restriction:
# df_pyspark.na.fill('Missing Values').show()
(
    df_pyspark.na
    .fill('Missing Values', subset='Name')
    .show()
)

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|         Krish|  31|        10| 30000|
|     Sudhanshu|  25|         8| 25000|
|       Sandesh|  28|         4| 20000|
|         Sunny|  24|         3| 20000|
|          Paul|  21|         1| 15000|
|         Harsh|  23|         2| 18000|
|        Shuvam|NULL|      NULL| 40000|
|        Mahesh|  34|        10| 38000|
|Missing Values|  36|      NULL|  NULL|
+--------------+----+----------+------+



In [44]:
## Filling specific columns
# Write 22 wherever Experience or Age is null; nulls in the other
# columns (Name, Salary) are left untouched.
(
    df_pyspark.na
    .fill(22, subset=['Experience', 'Age'])
    .show()
)

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 25|         8| 25000|
|  Sandesh| 28|         4| 20000|
|    Sunny| 24|         3| 20000|
|     Paul| 21|         1| 15000|
|    Harsh| 23|         2| 18000|
|   Shuvam| 22|        22| 40000|
|   Mahesh| 34|        10| 38000|
|     NULL| 36|        22|  NULL|
+---------+---+----------+------+



In [51]:
from pyspark.ml.feature import Imputer

# Columns to impute. Declared once so the input and output column lists
# cannot drift out of sync.
impute_cols = ['Age', 'Experience', 'Salary']

# Each input column gets a companion "<name>_imputed" output column in which
# nulls are replaced using the "mean" strategy; the original columns are kept.
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=["{}_imputed".format(col_name) for col_name in impute_cols],
).setStrategy("mean")

In [52]:
# Fit the imputer on the data, then append the *_imputed columns to the
# DataFrame and display the result.
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  25|         8| 25000|         25|                 8|         25000|
|  Sandesh|  28|         4| 20000|         28|                 4|         20000|
|    Sunny|  24|         3| 20000|         24|                 3|         20000|
|     Paul|  21|         1| 15000|         21|                 1|         15000|
|    Harsh|  23|         2| 18000|         23|                 2|         18000|
|   Shuvam|NULL|      NULL| 40000|         27|                 5|         40000|
|   Mahesh|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 5|         25750|
+---------+----+----------+-