In [1]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the session that is already running, if any, and
# only builds a new one (registered under the app name "Practice") otherwise.
spark = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)

In [2]:
# Leave the session as the cell's last expression so the notebook renders its repr.
spark

In [3]:
# Read the practice CSV: the first line provides the column names, and Spark is
# asked to infer each column's type instead of treating every field as a string.
df_spark = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("test1.csv")
)
df_spark.show()

+-------+----+----------+-------+
|   Name| Age|Experience| Salary|
+-------+----+----------+-------+
| Faisal|  23|         2|1200000|
|  Imran|  28|         5|   null|
| Zishan|null|        10|1000000|
|Aayesha|  21|         1| 600000|
| Habiba|  20|         0| 500000|
|  Hajra|  24|      null| 450000|
| Nashra|   2|      null|   null|
| Zainab|null|      null|   null|
|   null|  36|        10|3500000|
|   null|  38|      null|   null|
+-------+----+----------+-------+



Dropping NULL values
1. By default, drop() removes every row that contains at least one NULL value.
2. drop() takes three parameters: how, thresh, and subset.
3. how defaults to 'any' -> a row is removed if even a single one of its values is NULL.
4. With how='all' -> a row is removed only when every value in that row is NULL.

drop(how, thresh, subset)

In [4]:
# how="any" is the default, written out here for clarity: a row is discarded as
# soon as one of its columns is null, so only fully populated rows remain.
df_spark.na.drop(how="any").show()

+-------+---+----------+-------+
|   Name|Age|Experience| Salary|
+-------+---+----------+-------+
| Faisal| 23|         2|1200000|
|Aayesha| 21|         1| 600000|
| Habiba| 20|         0| 500000|
+-------+---+----------+-------+



In [5]:
# how='all' removes a row only when every one of its columns is null. The
# output still lists all ten rows, i.e. no row in this file is entirely null.
df_spark.na.drop(how='all').show()

+-------+----+----------+-------+
|   Name| Age|Experience| Salary|
+-------+----+----------+-------+
| Faisal|  23|         2|1200000|
|  Imran|  28|         5|   null|
| Zishan|null|        10|1000000|
|Aayesha|  21|         1| 600000|
| Habiba|  20|         0| 500000|
|  Hajra|  24|      null| 450000|
| Nashra|   2|      null|   null|
| Zainab|null|      null|   null|
|   null|  36|        10|3500000|
|   null|  38|      null|   null|
+-------+----+----------+-------+



Threshold (thresh) -> the minimum number of NON-NULL values a row must contain to be kept; rows with fewer non-null values are dropped. When thresh is given, it overrides how.

In [6]:
# thresh=2 keeps rows that have at least two non-null values; the two rows with
# only a single non-null value are the ones missing from the output.
df_spark.na.drop(thresh=2).show()

+-------+----+----------+-------+
|   Name| Age|Experience| Salary|
+-------+----+----------+-------+
| Faisal|  23|         2|1200000|
|  Imran|  28|         5|   null|
| Zishan|null|        10|1000000|
|Aayesha|  21|         1| 600000|
| Habiba|  20|         0| 500000|
|  Hajra|  24|      null| 450000|
| Nashra|   2|      null|   null|
|   null|  36|        10|3500000|
+-------+----+----------+-------+



Subset -> limits the NULL check to the listed columns: a row is dropped based only on NULLs in those columns, and NULLs in any other column are ignored.

In [7]:
# Show the source frame again for comparison: the earlier na.drop() calls
# returned new DataFrames, so df_spark itself still holds all ten rows.
df_spark.show()

+-------+----+----------+-------+
|   Name| Age|Experience| Salary|
+-------+----+----------+-------+
| Faisal|  23|         2|1200000|
|  Imran|  28|         5|   null|
| Zishan|null|        10|1000000|
|Aayesha|  21|         1| 600000|
| Habiba|  20|         0| 500000|
|  Hajra|  24|      null| 450000|
| Nashra|   2|      null|   null|
| Zainab|null|      null|   null|
|   null|  36|        10|3500000|
|   null|  38|      null|   null|
+-------+----+----------+-------+



In [8]:
# Only the Experience column is inspected: rows whose Experience is null are
# dropped, while nulls elsewhere (e.g. a null Salary or Name) are left in place.
df_spark.na.drop(subset=['Experience']).show()

+-------+----+----------+-------+
|   Name| Age|Experience| Salary|
+-------+----+----------+-------+
| Faisal|  23|         2|1200000|
|  Imran|  28|         5|   null|
| Zishan|null|        10|1000000|
|Aayesha|  21|         1| 600000|
| Habiba|  20|         0| 500000|
|   null|  36|        10|3500000|
+-------+----+----------+-------+



Replacing the missing values

In [11]:
# A string fill value is applied to string columns only, so just the null Name
# entries are replaced; nulls in Age, Experience and Salary remain as they are.
df_spark.na.fill('Missing Values').show()

+--------------+----+----------+-------+
|          Name| Age|Experience| Salary|
+--------------+----+----------+-------+
|        Faisal|  23|         2|1200000|
|         Imran|  28|         5|   null|
|        Zishan|null|        10|1000000|
|       Aayesha|  21|         1| 600000|
|        Habiba|  20|         0| 500000|
|         Hajra|  24|      null| 450000|
|        Nashra|   2|      null|   null|
|        Zainab|null|      null|   null|
|Missing Values|  36|        10|3500000|
|Missing Values|  38|      null|   null|
+--------------+----+----------+-------+



Replacing the Null values with the Mean

In [16]:
from pyspark.ml.feature import Imputer

# Mean imputation over the numeric columns. The output column names are derived
# from the input columns, so the column list is written only once.
imputer = Imputer(strategy="mean", inputCols=["Age", "Experience", "Salary"])
imputer.setOutputCols(["{}_imputed".format(name) for name in imputer.getInputCols()])

# fit() learns the per-column means from df_spark; transform() appends the
# *_imputed columns next to the originals, which keep their nulls.
imputer.fit(df_spark).transform(df_spark).show()

+-------+----+----------+-------+-----------+------------------+--------------+
|   Name| Age|Experience| Salary|Age_imputed|Experience_imputed|Salary_imputed|
+-------+----+----------+-------+-----------+------------------+--------------+
| Faisal|  23|         2|1200000|         23|                 2|       1200000|
|  Imran|  28|         5|   null|         28|                 5|       1208333|
| Zishan|null|        10|1000000|         24|                10|       1000000|
|Aayesha|  21|         1| 600000|         21|                 1|        600000|
| Habiba|  20|         0| 500000|         20|                 0|        500000|
|  Hajra|  24|      null| 450000|         24|                 4|        450000|
| Nashra|   2|      null|   null|          2|                 4|       1208333|
| Zainab|null|      null|   null|         24|                 4|       1208333|
|   null|  36|        10|3500000|         36|                10|       3500000|
|   null|  38|      null|   null|       