- **Name:** 08_dataframe_union
- **Author:** Shamas Imran
- **Description:** Combining DataFrames using union operations
- **Date:** 19-Aug-2025
<!--
REVISION HISTORY
Version          Date        Author           Description
01           19-Aug-2025   Shamas Imran       Demonstrated union of two DataFrames  
                                              Showed schema alignment requirements  
                                              Highlighted union vs unionByName  
-->

In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook under the application name
# "DatapurProgram"; getOrCreate() hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("DatapurProgram")
    .getOrCreate()
)

In [0]:
from pyspark.sql.types import *

# Schema for the A-Level students: ID is a non-nullable integer,
# Name (string) and Age (integer) are nullable.
student_ALevel_schema = (
    StructType()
    .add('ID', IntegerType(), False)
    .add('Name', StringType(), True)
    .add('Age', IntegerType(), True)
)

# Sample rows, one tuple per student in (ID, Name, Age) order.
student_ALevel_data = [(1, "Muhammad", 17), (2, "Abdul Muqeet", 16), (3, "Musa", 15)]

# Build the DataFrame from the rows and the explicit schema, then print it.
df_student_ALevel = spark.createDataFrame(
    data=student_ALevel_data,
    schema=student_ALevel_schema,
)
df_student_ALevel.show()


In [0]:
# Schema for the O-Level students: the same three fields, types and
# nullability as the A-Level schema (ID, Name, Age).
student_OLevel_schema = (
    StructType()
    .add('ID', IntegerType(), False)
    .add('Name', StringType(), True)
    .add('Age', IntegerType(), True)
)

# Sample rows in (ID, Name, Age) order. The first row, (1, "Muhammad", 17),
# also appears in the A-Level sample data.
student_OLevel_data = [(1, "Muhammad", 17), (2, "Irfan", 18), (3, "Shamas", 40), (4, "Imran", 38)]

# Build the DataFrame from the rows and the explicit schema, then print it.
df_student_OLevel = spark.createDataFrame(
    data=student_OLevel_data,
    schema=student_OLevel_schema,
)
df_student_OLevel.show()

In [0]:
# Append the O-Level rows to the A-Level rows. Both frames were created with the
# same (ID, Name, Age) schema, which matters because union() pairs columns by
# position rather than by name. union() does not de-duplicate, so the row
# (1, "Muhammad", 17) that is present in both inputs shows up twice in the output.
df_union = df_student_ALevel.union(df_student_OLevel)
df_union.show()

In [0]:
# Two single-row frames with the same two columns, declared in opposite order:
# df1 is (id, name) while df2 is (name, id).
df1 = spark.createDataFrame(data=[(1, "Imran")], schema=["id", "name"])
df2 = spark.createDataFrame(data=[("Shamas", 2)], schema=["name", "id"])

# unionByName pairs columns by their names, so the reversed column order
# in df2 does not matter here.
df_union = df1.unionByName(df2)
df_union.show()


In [0]:
# df2 carries an extra "age" column that df1 does not have.
df1 = spark.createDataFrame(data=[(1, "Shamas")], schema=["id", "name"])
df2 = spark.createDataFrame(data=[(2, "Imran", 25)], schema=["id", "name", "age"])

# A plain df1.unionByName(df2) raises an error here because the two frames do
# not have the same number of columns. Passing allowMissingColumns=True permits
# the union; the column missing from df1 is filled with nulls for its rows.
df_union = df1.unionByName(df2, allowMissingColumns=True)
df_union.show()


In [0]:
# Same column names in both frames, but the id values differ in type:
# a Python int in df1 and a Python string in df2.
df1 = spark.createDataFrame(data=[(1, "Shamas")], schema=["id", "name"])
df2 = spark.createDataFrame(data=[("2", "Imran")], schema=["id", "name"])

# Union the frames by name despite the differing id types.
# NOTE(review): how Spark reconciles the two id types (or whether it rejects
# them) depends on its type-coercion settings; confirm by inspecting the output.
df_union = df1.unionByName(df2)
df_union.show()

In [0]:
from pyspark.sql.types import *
from datetime import datetime

# Build df1 with an explicit schema in which DOB is a DateType column.
# The DOB literal is parsed from an ISO "YYYY-MM-DD" string into a Python date
# object so that it matches the declared DateType.
df1 = spark.createDataFrame(
    [
        (
            1,
            "Shamas",
            datetime.strptime("1986-11-10", "%Y-%m-%d").date()
        )
    ],
    schema=StructType([
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("DOB", DateType(), True)
    ])
)

# df2 is built from column names only (no explicit types); its DOB value is a
# plain Python string, so the DOB column here is a string column, not a date.
df2 = spark.createDataFrame(
    [(2, "Imran", "19980721")], # Compact yyyymmdd string; also try "1998-07-21" with the cast below commented out
    ["id", "name", "DOB"]
)

# Cast df1.DOB from DateType to StringType so that both frames hold DOB as a
# string before they are combined.
df1 = df1.withColumn("DOB",df1["DOB"].cast(StringType()))

# Combine the two frames by column name and render the result.
# NOTE(review): display() is not defined in this file; presumably it is the
# notebook environment's rendering helper (the other cells use .show()) - confirm.
df_union = df1.unionByName(df2)
display(df_union)