In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, lower

# Step 1: Obtain the Spark session used by every step below
# (getOrCreate() hands back an already-running session when there is one).
spark = (
    SparkSession.builder
    .appName("Union_Intersection_Join_WordCount")
    .getOrCreate()
)

# Step 2: Create sample DataFrames

# DataFrame 1: each employee paired with a department.
# The rows are assembled column-wise; zip() yields the same (name, department)
# tuples as writing every row out by hand.
columns1 = ["name", "department"]
data1 = list(
    zip(
        ["John", "Alice", "Bob", "Diana"],
        ["HR", "Finance", "IT", "Marketing"],
    )
)
df1 = spark.createDataFrame(data=data1, schema=columns1)

# DataFrame 2: each employee paired with a project.
# John and Alice also appear in df1; Eve and Frank do not.
columns2 = ["name", "project"]
data2 = list(
    zip(
        ["John", "Alice", "Eve", "Frank"],
        ["Project A", "Project B", "Project C", "Project D"],
    )
)
df2 = spark.createDataFrame(data=data2, schema=columns2)

# DataFrame 3: free-text sentences that feed the word count.
columns3 = ["sentence"]
# Every row is a one-element tuple because the DataFrame has a single column.
data3 = [
    (text,)
    for text in (
        "Spark is amazing",
        "I love working with Spark",
        "Spark makes big data easy",
    )
]
df3 = spark.createDataFrame(data=data3, schema=columns3)

# Print each input DataFrame under its heading, in creation order
for label, frame in (
    ("DataFrame 1:", df1),
    ("DataFrame 2:", df2),
    ("DataFrame 3:", df3),
):
    print(label)
    frame.show()

# Step 3: Perform Union (requires same schema)
# Rename the second column of each DataFrame to a shared name so that both
# sides expose the columns (name, info).
df1_renamed = df1.withColumnRenamed("department", "info")
df2_renamed = df2.withColumnRenamed("project", "info")

# unionByName() pairs columns by their names, whereas union() pairs them purely
# by position and would silently mix columns if their order ever differed.
# Like union(), it keeps duplicate rows (UNION ALL semantics).
union_df = df1_renamed.unionByName(df2_renamed)
print("Union of df1 and df2 (with same schema):")
union_df.show()

# Step 4: Perform Intersection (common rows only)
# intersect() keeps only the rows found in both DataFrames, comparing every
# column of the row. With the sample data no (name, info) pair occurs on both
# sides (e.g. John/"HR" vs. John/"Project A"), so the result printed here is
# an empty table even though the names John and Alice appear in both inputs.
intersection_df = df1_renamed.intersect(df2_renamed)
print("Intersection of df1 and df2:")
intersection_df.show()

# Step 5: Inner join on "name"
# Only employees present in both df1 and df2 survive; joining on a column name
# (rather than an expression) keeps a single "name" column in the result.
join_df = df1.join(
    other=df2,
    on=["name"],
    how="inner",
)
print("Join of df1 and df2 on 'name':")
join_df.show()

# Step 6: Word Count from sentences
# Lower-case each sentence, split it on runs of whitespace (the split pattern
# is a regular expression) and emit one row per token. Empty tokens, which
# appear when a sentence has leading or trailing whitespace, are dropped so
# they are not counted as words.
words_df = (
    df3.select(explode(split(lower(col("sentence")), r"\s+")).alias("word"))
    .filter(col("word") != "")
)
# Most frequent words first; ties are broken alphabetically so the printed
# order is the same on every run.
word_count = (
    words_df.groupBy("word")
    .count()
    .orderBy(col("count").desc(), col("word").asc())
)
print("Word Count from sentences:")
word_count.show()

# Stop the Spark session now that every result has been displayed.
# NOTE(review): this is not reached if an earlier step raises — consider a
# try/finally around the steps above if that matters.
spark.stop()


DataFrame 1:
+-----+----------+
| name|department|
+-----+----------+
| John|        HR|
|Alice|   Finance|
|  Bob|        IT|
|Diana| Marketing|
+-----+----------+

DataFrame 2:
+-----+---------+
| name|  project|
+-----+---------+
| John|Project A|
|Alice|Project B|
|  Eve|Project C|
|Frank|Project D|
+-----+---------+

DataFrame 3:
+--------------------+
|            sentence|
+--------------------+
|    Spark is amazing|
|I love working wi...|
|Spark makes big d...|
+--------------------+

Union of df1 and df2 (with same schema):
+-----+---------+
| name|     info|
+-----+---------+
| John|       HR|
|Alice|  Finance|
|  Bob|       IT|
|Diana|Marketing|
| John|Project A|
|Alice|Project B|
|  Eve|Project C|
|Frank|Project D|
+-----+---------+

Intersection of df1 and df2:
+----+----+
|name|info|
+----+----+
+----+----+

Join of df1 and df2 on 'name':
+-----+----------+---------+
| name|department|  project|
+-----+----------+---------+
|Alice|   Finance|Project B|
| John|        HR|