In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Reuse the active Spark session if there is one; otherwise start a new one
# registered under the application name "file_read".
spark = (
    SparkSession.builder
    .appName("file_read")
    .getOrCreate()
)

# Sample employee records: one row per (employee, skill) pair, so an employee
# with several skills appears on several rows.
employee_fields = ("emp_id", "name", "department", "project", "skill")
employee_rows = [
    (1, "Arun", "IT", "Billing", "Python"),
    (1, "Arun", "IT", "Billing", "SQL"),
    (2, "Meena", "HR", "Recruitment", "Excel"),
    (3, "Ravi", "IT", "Migration", "Java"),
    (3, "Ravi", "IT", "Migration", "Python"),
    (4, "Kiran", "Finance", "Audit", "Tally"),
    (5, "Sita", "HR", "Training", "Presentation"),
    (6, "Ajay", "IT", "Support", "Linux"),
    (6, "Ajay", "IT", "Support", "Shell"),
]

# One dict per row, keyed by field name (keys in the order listed above).
data = [dict(zip(employee_fields, row)) for row in employee_rows]

# No explicit schema is passed, so Spark infers column names and types from
# the dicts themselves.
df = spark.createDataFrame(data)


#### collect_list -> collect all values per group (duplicates kept)

In [3]:
# Gather every `skill` value of a department into a single array column.
# Duplicates are kept (a skill shared by two employees appears twice).
# NOTE: element order follows the order rows arrive in after the shuffle,
# so it should not be relied on.
skills_per_department = collect_list("skill").alias("Skill_set")
(
    df.groupBy("department")
    .agg(skills_per_department)
    .show(truncate=False)
)

+----------+-----------------------------------------+
|department|Skill_set                                |
+----------+-----------------------------------------+
|IT        |[Python, SQL, Java, Python, Linux, Shell]|
|HR        |[Excel, Presentation]                    |
|Finance   |[Tally]                                  |
+----------+-----------------------------------------+



#### collect_set -> collect unique values per group

In [4]:
# Gather the distinct `project` values of a department into a single array
# column; repeated projects are collapsed to one entry.
# NOTE: the resulting array is not in input order, so do not depend on it.
projects_per_department = collect_set("project").alias("pro")
(
    df.groupBy("department")
    .agg(projects_per_department)
    .show(truncate=False)
)

+----------+-----------------------------+
|department|pro                          |
+----------+-----------------------------+
|IT        |[Billing, Support, Migration]|
|HR        |[Training, Recruitment]      |
|Finance   |[Audit]                      |
+----------+-----------------------------+



#### Create nested struct + collect_list

In [5]:
# Pack name, skill and project into one struct per row, then gather those
# structs per department so each department carries an array of employee
# records.
emp_info = struct("name", "skill", "project").alias("emp_info")
(
    df.select("department", emp_info)
    .groupBy("department")
    .agg(collect_list("emp_info").alias("employees"))
    .show(truncate=False)
)


+----------+---------------------------------------------------------------------------------------------------------------------------------------------------+
|department|employees                                                                                                                                          |
+----------+---------------------------------------------------------------------------------------------------------------------------------------------------+
|IT        |[{Arun, Python, Billing}, {Arun, SQL, Billing}, {Ravi, Java, Migration}, {Ravi, Python, Migration}, {Ajay, Linux, Support}, {Ajay, Shell, Support}]|
|HR        |[{Meena, Excel, Recruitment}, {Sita, Presentation, Training}]                                                                                      |
|Finance   |[{Kiran, Tally, Audit}]                                                                                                                            |
+----------+----------------------