# Ex-2110 Nulls


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, when, avg, coalesce

# Start (or reuse) the Spark session used by every cell in this notebook
spark = (
    SparkSession.builder
    .appName("ParcelProcessing")
    .getOrCreate()
)

# Parcel schema: all four columns are nullable so the sample rows may hold NULLs
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Parcel_ID", IntegerType()),
        ("Destination_Address", StringType()),
        ("Weight_kg", IntegerType()),
        ("Volume_l", IntegerType()),
    )
])

# Sample parcels covering the different NULL combinations
data = [
    (101, "123 Main St, New York, USA", 5, None),      # volume missing
    (102, None, None, 50),                             # address and weight missing
    (103, "456 Elm St, London, UK", 8, 30),            # complete row
    (104, "789 Oak St, Berlin, Germany", None, None),  # weight and volume missing
    (105, "321 Pine St, Toronto, Canada", 12, 60),     # complete row
    (106, None, 13, None),                             # address and volume missing
]

# Build the DataFrame with the explicit schema and display it
df = spark.createDataFrame(data, schema)

df.show()

+---------+--------------------+---------+--------+
|Parcel_ID| Destination_Address|Weight_kg|Volume_l|
+---------+--------------------+---------+--------+
|      101|123 Main St, New ...|        5|    NULL|
|      102|                NULL|     NULL|      50|
|      103|456 Elm St, Londo...|        8|      30|
|      104|789 Oak St, Berli...|     NULL|    NULL|
|      105|321 Pine St, Toro...|       12|      60|
|      106|                NULL|       13|    NULL|
+---------+--------------------+---------+--------+



In [2]:
# Drop the rows where `Destination_Address` is NULL; NULLs in other columns are kept
df_filtered = df.na.drop(subset=["Destination_Address"])
df_filtered.show()


+---------+--------------------+---------+--------+
|Parcel_ID| Destination_Address|Weight_kg|Volume_l|
+---------+--------------------+---------+--------+
|      101|123 Main St, New ...|        5|    NULL|
|      103|456 Elm St, Londo...|        8|      30|
|      104|789 Oak St, Berli...|     NULL|    NULL|
|      105|321 Pine St, Toro...|       12|      60|
+---------+--------------------+---------+--------+



In [3]:
# Drop every row that has a NULL in at least one column
df_no_nulls = df.na.drop(how="any")
df_no_nulls.show()

+---------+--------------------+---------+--------+
|Parcel_ID| Destination_Address|Weight_kg|Volume_l|
+---------+--------------------+---------+--------+
|      103|456 Elm St, Londo...|        8|      30|
|      105|321 Pine St, Toro...|       12|      60|
+---------+--------------------+---------+--------+



In [4]:
# Per-column placeholders that replace NULLs: -1 for weight, 0 for volume,
# and the text "UNKNOWN" for a missing address
null_replacements = {
    "Destination_Address": "UNKNOWN",
    "Weight_kg": -1,
    "Volume_l": 0,
}
df_filled = df.na.fill(null_replacements)
df_filled.show()

+---------+--------------------+---------+--------+
|Parcel_ID| Destination_Address|Weight_kg|Volume_l|
+---------+--------------------+---------+--------+
|      101|123 Main St, New ...|        5|       0|
|      102|             UNKNOWN|       -1|      50|
|      103|456 Elm St, Londo...|        8|      30|
|      104|789 Oak St, Berli...|       -1|       0|
|      105|321 Pine St, Toro...|       12|      60|
|      106|             UNKNOWN|       13|       0|
+---------+--------------------+---------+--------+



In [5]:
# Average of the non-NULL weights, pulled back to the driver as a Python float
mean_weight = (
    df.agg(avg("Weight_kg").alias("avg_weight"))
    .first()["avg_weight"]
)

# Weight_kg is an integer column, so the mean is truncated with int()
# (9.5 -> 9 for the sample data) before it is used as the fill value.
df_avg_weight = df.fillna(int(mean_weight), subset=["Weight_kg"])
df_avg_weight.show()

+---------+--------------------+---------+--------+
|Parcel_ID| Destination_Address|Weight_kg|Volume_l|
+---------+--------------------+---------+--------+
|      101|123 Main St, New ...|        5|    NULL|
|      102|                NULL|        9|      50|
|      103|456 Elm St, Londo...|        8|      30|
|      104|789 Oak St, Berli...|        9|    NULL|
|      105|321 Pine St, Toro...|       12|      60|
|      106|                NULL|       13|    NULL|
+---------+--------------------+---------+--------+



In [6]:
# Average of the non-NULL volumes, pulled back to the driver as a Python float
mean_volume = (
    df.agg(avg("Volume_l").alias("avg_volume"))
    .first()["avg_volume"]
)

# Volume_l is an integer column, so the mean is truncated with int()
# (about 46.67 -> 46 for the sample data) before it is used as the fill value.
df_avg_volume = df.fillna(int(mean_volume), subset=["Volume_l"])
df_avg_volume.show()

+---------+--------------------+---------+--------+
|Parcel_ID| Destination_Address|Weight_kg|Volume_l|
+---------+--------------------+---------+--------+
|      101|123 Main St, New ...|        5|      46|
|      102|                NULL|     NULL|      50|
|      103|456 Elm St, Londo...|        8|      30|
|      104|789 Oak St, Berli...|     NULL|      46|
|      105|321 Pine St, Toro...|       12|      60|
|      106|                NULL|       13|      46|
+---------+--------------------+---------+--------+



In [7]:
from pyspark.sql.functions import when, lit

# Label each parcel by the first measurement that is present:
# weight takes precedence, volume is the fallback, and a parcel
# with neither value is marked "error".
has_weight = col("Weight_kg").isNotNull()
has_volume = col("Volume_l").isNotNull()
pay_for_rule = (
    when(has_weight, "weight")
    .when(has_volume, "volume")
    .otherwise("error")
)

df_pay_for = df.withColumn("pay_for", pay_for_rule)

df_pay_for.show()


+---------+--------------------+---------+--------+-------+
|Parcel_ID| Destination_Address|Weight_kg|Volume_l|pay_for|
+---------+--------------------+---------+--------+-------+
|      101|123 Main St, New ...|        5|    NULL| weight|
|      102|                NULL|     NULL|      50| volume|
|      103|456 Elm St, Londo...|        8|      30| weight|
|      104|789 Oak St, Berli...|     NULL|    NULL|  error|
|      105|321 Pine St, Toro...|       12|      60| weight|
|      106|                NULL|       13|    NULL| weight|
+---------+--------------------+---------+--------+-------+

