In [35]:
from pyspark.sql import SparkSession

In [37]:
# Entry point for the DataFrame API: getOrCreate() returns the already
# running session if one exists, otherwise it starts a new one with this name.
spark = (
    SparkSession.builder
    .appName("practice")
    .getOrCreate()
)

In [69]:
# Load the CSV, treating the first line as column names and letting Spark
# infer column types from the data (otherwise every column is read as string).
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data.csv")
)

In [70]:
# Preview the first three rows of the freshly loaded data.
df.show(n=3)

+----+----+----------+-----------+
|city|year|weekofyear|total_cases|
+----+----+----------+-----------+
|  sj|1990|        18|          4|
|  sj|1990|        19|          5|
|  sj|1990|        20|       NULL|
+----+----+----------+-----------+
only showing top 3 rows



Drop rows that contain null values

In [13]:
df = df.na.drop()

In [14]:
df.show(3)

+----+----+----------+-----------+
|city|year|weekofyear|total_cases|
+----+----+----------+-----------+
|  sj|1990|        18|          4|
|  sj|1990|        19|          5|
|  sj|1990|        21|          3|
+----+----+----------+-----------+
only showing top 3 rows



In [15]:
# Drop those rows where all values are null
# df = df.na.drop(how="all")

# Drop those rows where any value is null
# df = df.na.drop(how="any")

# Keep only rows that have at least 2 non-null values (rows with fewer are dropped);
# when `thresh` is given it overrides `how`
# df = df.na.drop(how="any", thresh=2)

Handle missing values

In [61]:
df = df.na.fill(404)

In [66]:
df.show(5)

+----+----+----------+-----------+
|city|year|weekofyear|total_cases|
+----+----+----------+-----------+
|  sj|1990|        18|          4|
|  sj|1990|        19|          5|
|  sj|1990|        20|        404|
|  sj|1990|        21|          3|
|  sj|1990|        22|          6|
+----+----+----------+-----------+
only showing top 5 rows



In [65]:
# Fill missing values only in specific columns: `subset` restricts the fill to
# the listed columns. The result is bound to a new name so that `df` is not
# overwritten; the mean-imputation cells below use `df` and would have nothing
# to impute if its nulls had already been replaced with 404.
df_cases_filled = df.na.fill(404, subset=['total_cases'])

In [71]:
# Replace missing values with the mean of their column instead of a fixed
# sentinel. Each input column is written to a separate "*_imputed" output
# column, so the original columns are kept as they are.
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=['total_cases', 'weekofyear'],
    outputCols=['total_cases_imputed', 'weekofyear_imputed'],
    strategy='mean',
)

In [72]:
# fit() computes the per-column means from `df`; transform() then adds the
# imputed columns to the same frame. Only the first five rows are displayed.
(
    imputer
    .fit(df)
    .transform(df)
    .show(5)
)

+----+----+----------+-----------+-------------------+------------------+
|city|year|weekofyear|total_cases|total_cases_imputed|weekofyear_imputed|
+----+----+----------+-----------+-------------------+------------------+
|  sj|1990|        18|          4|                  4|                18|
|  sj|1990|        19|          5|                  5|                19|
|  sj|1990|        20|       NULL|                 24|                20|
|  sj|1990|        21|          3|                  3|                21|
|  sj|1990|        22|          6|                  6|                22|
+----+----+----------+-----------+-------------------+------------------+
only showing top 5 rows

