In [14]:
# Spark entry point plus the type and function helpers used by the cells below.
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# NOTE(review): this wildcard import also rebinds Python built-ins such as
# sum/min/max/abs/round to Spark column functions; consider
# `from pyspark.sql import functions as F` when this cell is next reworked.
from pyspark.sql.functions import *  # supplies isnull, coalesce, mean and lit used in later cells
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Already covered by the wildcard import above; kept to make these names explicit.
from pyspark.sql.functions import regexp_replace, col
# Only importable inside Google Colab; needed solely by the commented-out
# drive.mount(...) call further down in this cell.
from google.colab import drive

# Mount Google Drive with a longer timeout
# drive.mount('/content/drive', force_remount=True, timeout_ms=300000)

# df_employee_data = "/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv"
# employeeSechema = StructType([
#     StructField("ID",IntegerType() ,True),
#     StructField("Name",StringType() ,True),
#     StructField("Age",IntegerType() ,True),
#     StructField("Salary",FloatType() ,True),
#     StructField("Joining_Date",DateType() ,True),
#     StructField("Department",StringType() ,True),
#     StructField("Performance_Rating",IntegerType() ,True),
#     StructField("Email",StringType() ,True),
#     StructField("Address",StringType() ,True),
#     StructField("Phone",StringType() ,True)

# ])
# # Load the DataFrame with the defined schema
# #df = spark.read.csv(path=df_employee_data, header=True, schema=employeeSechema)
# df = spark.read.load(path="/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv", format="csv", header = True, schema=employeeSechema)
# df.printSchema()
# df.show(50)

## Null Handling in DataFrames

In [15]:
# Sample sales records; None marks a missing value in that position.
data = [
    ("John", "North", 100, None),
    ("Doe", "East", None, 50),
    (None, "West", 150, 30),
    ("Alice", None, 200, 40),
    ("Bob", "South", None, None),
    (None, None, None, None),
]
columns = ["Name", "Region", "UnitsSold", "Revenue"]

# Only column names are supplied, so Spark infers each column's type from the data.
df = spark.createDataFrame(data, schema=columns)
df.show()



+-----+------+---------+-------+
| Name|Region|UnitsSold|Revenue|
+-----+------+---------+-------+
| John| North|      100|   NULL|
|  Doe|  East|     NULL|     50|
| NULL|  West|      150|     30|
|Alice|  NULL|      200|     40|
|  Bob| South|     NULL|   NULL|
| NULL|  NULL|     NULL|   NULL|
+-----+------+---------+-------+



### 1. Detecting Null Values:
* The isnull() function (or, equivalently, the Column.isNull() method) identifies rows where a specified column has null values. The output shows a boolean flag for each row that indicates whether the value in that column is null.

In [16]:
# Flag, row by row, whether "Region" is null (true) or populated (false).
(
    df.select(
        "Name",
        "Region",
        col("Region").isNull().alias("is_Region_Null"),
    ).show()
)

+-----+------+--------------+
| Name|Region|is_Region_Null|
+-----+------+--------------+
| John| North|         false|
|  Doe|  East|         false|
| NULL|  West|         false|
|Alice|  NULL|          true|
|  Bob| South|         false|
| NULL|  NULL|          true|
+-----+------+--------------+



### 2. Dropping Rows with Null Values:
* dropna() (also available as df.na.drop()) removes every row that contains a null in any column when the default mode, "any", is used.
* Specifying "all" removes a row only if all of its columns are null.
* The subset parameter restricts the null check to the listed columns; the "any"/"all" mode is then evaluated over those columns only.

In [18]:
# Remove every row that has a null in at least one column ("any" is the default mode).
# Each sample row contains at least one null, so the result is empty.
df2 = df.na.drop(how="any")
df2.show()

+----+------+---------+-------+
|Name|Region|UnitsSold|Revenue|
+----+------+---------+-------+
+----+------+---------+-------+



In [17]:
# Remove only the rows in which every column is null; partially filled rows are kept.
df3 = df.dropna(how="all")
df3.show()

+-----+------+---------+-------+
| Name|Region|UnitsSold|Revenue|
+-----+------+---------+-------+
| John| North|      100|   NULL|
|  Doe|  East|     NULL|     50|
| NULL|  West|      150|     30|
|Alice|  NULL|      200|     40|
|  Bob| South|     NULL|   NULL|
+-----+------+---------+-------+



In [19]:
# Drop a row only when BOTH "Name" and "Region" are null: how="all" is
# evaluated over the subset columns only, so rows missing just one of the two
# are kept. (Use how="any" to drop rows where either column is null.)
df4 = df.na.drop(how="all", subset=["Name", "Region"])
df4.show()

+-----+------+---------+-------+
| Name|Region|UnitsSold|Revenue|
+-----+------+---------+-------+
| John| North|      100|   NULL|
|  Doe|  East|     NULL|     50|
| NULL|  West|      150|     30|
|Alice|  NULL|      200|     40|
|  Bob| South|     NULL|   NULL|
+-----+------+---------+-------+



### 3. Filling Null Values:
  * fillna() allows replacing null values with specified replacements, either for all columns or selectively.
  * In the example, nulls in Region are replaced with "Unknown", while UnitsSold and Revenue nulls are filled with 0.

In [20]:
# Replace nulls column by column: a label for Region, zero for the numeric columns.
# "Name" is not listed, so its nulls are left untouched.
df5 = df.na.fill(
    {
        "Region": "Unknown",
        "UnitsSold": 0,
        "Revenue": 0,
    }
)
df5.show()

+-----+-------+---------+-------+
| Name| Region|UnitsSold|Revenue|
+-----+-------+---------+-------+
| John|  North|      100|      0|
|  Doe|   East|        0|     50|
| NULL|   West|      150|     30|
|Alice|Unknown|      200|     40|
|  Bob|  South|        0|      0|
| NULL|Unknown|        0|      0|
+-----+-------+---------+-------+



In [21]:
# Replace nulls in the "Name" and "Region" columns with the placeholder "N/A";
# the other columns are outside the subset and keep their nulls.
df6 = df.fillna("N/A", subset=["Name", "Region"])
df6.show()


+-----+------+---------+-------+
| Name|Region|UnitsSold|Revenue|
+-----+------+---------+-------+
| John| North|      100|   NULL|
|  Doe|  East|     NULL|     50|
|  N/A|  West|      150|     30|
|Alice|   N/A|      200|     40|
|  Bob| South|     NULL|   NULL|
|  N/A|   N/A|     NULL|   NULL|
+-----+------+---------+-------+



### 4. Coalesce Function:
  * The coalesce() function returns the first non-null value in a list of columns. It’s useful when you need to handle missing data by providing alternative values from other columns.

In [22]:
# Take UnitsSold when it is present, otherwise fall back to Revenue;
# the new column stays null when both inputs are null.
df7 = df.withColumn(
    "Adjust_UnitsSold",
    coalesce(col("UnitsSold"), col("Revenue")),
)
df7.show()

+-----+------+---------+-------+----------------+
| Name|Region|UnitsSold|Revenue|Adjust_UnitsSold|
+-----+------+---------+-------+----------------+
| John| North|      100|   NULL|             100|
|  Doe|  East|     NULL|     50|              50|
| NULL|  West|      150|     30|             150|
|Alice|  NULL|      200|     40|             200|
|  Bob| South|     NULL|   NULL|            NULL|
| NULL|  NULL|     NULL|   NULL|            NULL|
+-----+------+---------+-------+----------------+



### 5. Handling Nulls in Aggregations:
* Aggregate functions like mean() skip null inputs, so a group whose values are all null produces a null result. Wrapping the aggregate in coalesce() replaces that null result with a default (e.g., 0.0), so downstream results are not skewed by unexpected nulls.

In [23]:
# Average UnitsSold per Region; a group with no non-null UnitsSold would
# average to null, so coalesce() substitutes 0 for it.
df8 = (
    df.groupBy("Region")
    .agg(
        coalesce(mean(col("UnitsSold")), lit(0)).alias("Avg_UnitsSold")
    )
)
df8.show()

+------+-------------+
|Region|Avg_UnitsSold|
+------+-------------+
|  East|          0.0|
|  West|        150.0|
| North|        100.0|
|  NULL|        200.0|
| South|          0.0|
+------+-------------+

