# Ex-2120 Union

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import lit

# Start (or reuse) the Spark session used by every cell in this notebook
spark = SparkSession.builder.appName("FleetMerge").getOrCreate()

# --- Taxi fleet (`cars_df`) ---------------------------------------------
# Schema is built from (column name, column type) pairs; all fields nullable.
cars_schema = StructType([
    StructField(col_name, col_type, True)
    for col_name, col_type in [
        ("VehicleID", StringType()),       # unified ID name (noted as formerly CarId)
        ("Make", StringType()),
        ("ProductionYear", IntegerType()),
        ("Mileage_km", IntegerType()),     # taxi-only attribute
    ]
])

# Sample rows, in the same column order as `cars_schema`
cars_data = [
    ("C001", "Toyota", 2018, 150000),
    ("C002", "Ford", 2016, 210000),
    ("C003", "Honda", 2019, 110000),
]

cars_df = spark.createDataFrame(cars_data, schema=cars_schema)

# --- Truck fleet (`truck_df`) -------------------------------------------
# Same first three columns as the taxi schema; the last one differs.
truck_schema = StructType([
    StructField(col_name, col_type, True)
    for col_name, col_type in [
        ("VehicleID", StringType()),       # unified ID name (noted as formerly TruckId)
        ("Make", StringType()),
        ("ProductionYear", IntegerType()),
        ("Tonnage", IntegerType()),        # truck-only attribute
    ]
])

# Sample rows, in the same column order as `truck_schema`
truck_data = [
    ("T001", "Volvo", 2015, 12),
    ("T002", "Mercedes", 2017, 10),
    ("T003", "MAN", 2020, 15),
]

truck_df = spark.createDataFrame(truck_data, schema=truck_schema)

# Print both source DataFrames before any merging (cars first, then trucks)
cars_df.show()
truck_df.show()

+---------+------+--------------+----------+
|VehicleID|  Make|ProductionYear|Mileage_km|
+---------+------+--------------+----------+
|     C001|Toyota|          2018|    150000|
|     C002|  Ford|          2016|    210000|
|     C003| Honda|          2019|    110000|
+---------+------+--------------+----------+

+---------+--------+--------------+-------+
|VehicleID|    Make|ProductionYear|Tonnage|
+---------+--------+--------------+-------+
|     T001|   Volvo|          2015|     12|
|     T002|Mercedes|          2017|     10|
|     T003|     MAN|          2020|     15|
+---------+--------+--------------+-------+



In [7]:
# Give each DataFrame the column it lacks, filled with NULLs, so both sides
# expose the same set of column names before the union.
#
# The NULL literal is cast to "int" explicitly: a bare lit(None) yields an
# untyped (NullType / "void") column, whereas the real Mileage_km and Tonnage
# columns are declared as IntegerType in the schemas. Casting keeps the
# intermediate DataFrames fully typed and makes both sides of the union
# type-identical instead of relying on implicit coercion.
cars_df = cars_df.withColumn("Tonnage", lit(None).cast("int"))  # Cars don't have tonnage
truck_df = truck_df.withColumn("Mileage_km", lit(None).cast("int"))  # Trucks don't have mileage

# Merge both fleets into one DataFrame.
# unionByName matches columns by name rather than position, which matters
# here because the two DataFrames now hold Mileage_km/Tonnage in different
# positions.
# NOTE(review): on Spark 3.1+ `unionByName(..., allowMissingColumns=True)`
# can fill the missing columns itself — confirm the target Spark version
# before switching to it.
fleet_df = cars_df.unionByName(truck_df)

# Show the merged fleet DataFrame
fleet_df.show()

+---------+--------+--------------+----------+-------+
|VehicleID|    Make|ProductionYear|Mileage_km|Tonnage|
+---------+--------+--------------+----------+-------+
|     C001|  Toyota|          2018|    150000|   NULL|
|     C002|    Ford|          2016|    210000|   NULL|
|     C003|   Honda|          2019|    110000|   NULL|
|     T001|   Volvo|          2015|      NULL|     12|
|     T002|Mercedes|          2017|      NULL|     10|
|     T003|     MAN|          2020|      NULL|     15|
+---------+--------+--------------+----------+-------+

