##### For each person, find the total number of visits, the most frequently visited floor, and all the resources used.

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [0]:
# Sample visit log: one row per visit, as
# (name, address, email, floor, resources).
data = [
    ("A", "Bangalore", "A@gmail.com", 1, "CPU"),
    ("A", "Bangalore", "A1@gmail.com", 1, "CPU"),
    ("A", "Bangalore", "A2@gmail.com", 2, "DESKTOP"),
    ("B", "Bangalore", "B@gmail.com", 2, "DESKTOP"),
    ("B", "Bangalore", "B1@gmail.com", 2, "DESKTOP"),
    ("B", "Bangalore", "B2@gmail.com", 1, "MONITOR"),
]

# DDL-style schema string; column order matches the tuples above.
schema = (
    "name string, address string, email string, "
    "floor int, resources string"
)

df = spark.createDataFrame(data, schema=schema)

In [0]:
# Per-person summary: number of visits and the distinct resources used.
df1 = (
    df.groupBy("name")
    .agg(
        # count("floor") counts rows whose floor is non-null.
        count("floor").alias("total_visits"),
        # collect_set de-duplicates the resources; sort_array gives the
        # comma-joined string a stable, ascending order.
        concat_ws(",", sort_array(collect_set("resources"))).alias("resources"),
    )
)

In [0]:
# Number of visits per (person, floor) pair; used to find each person's
# most frequently visited floor.
df_temp = df.groupBy(["name", "floor"]).agg(
    count(col("floor")).alias("floor_count")
)

In [0]:
# Rank each person's floors by visit count, most-visited first.
# The secondary sort on "floor" breaks ties deterministically (the lowest
# floor wins); ordering by floor_count alone lets row_number() pick an
# arbitrary floor when two floors have the same count, so results could
# differ between runs.
# NOTE(review): the name `window` shadows the `window` function pulled in by
# the star import from pyspark.sql.functions; it is kept unchanged here in
# case other cells refer to it.
window = Window.partitionBy("name").orderBy(
    col("floor_count").desc(),
    col("floor").asc(),
)

# Keep only the top-ranked floor per person.
df2 = (
    df_temp.withColumn("rn", row_number().over(window))
    .filter(col("rn") == 1)
    .select("name", col("floor").alias("frequent_visited_floor"))
)

In [0]:
# Combine the per-person summary with the most frequently visited floor.
# Inner join on "name": only people present in both inputs are kept.
df_final = df1.join(df2, on="name", how="inner")

df_final.display()

name,total_visits,resources,frequent_visited_floor
A,3,"CPU,DESKTOP",1
B,3,"DESKTOP,MONITOR",2
