In [None]:
"""
Processing ETL logs from multiple clients. Each client uploads files daily, and for each file, the log tracks:
client_id
file_name
check_sum (represents content)
timestamp

Your goal is to identify clients who are regular, defined as:
They uploaded at least two unique (check_sum) entries on both days.
The set of (check_sum) from Day 1 and Day 2 should have at least 2 unique entries that differ across the two days.

Scenario:
File name same, content same
File name same, content different
File name different, content same
File name different, content different
"""

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.appName("ETLClientContent").getOrCreate()

25/07/14 09:35:13 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
day1_data = [
    ("C1", "file1.csv", "cs1", "2025-07-13"),
    ("C1", "file2.csv", "cs2", "2025-07-13"),
    ("C1", "file3.csv", "cs3", "2025-07-13"),
    ("C2", "report.csv", "cs4", "2025-07-13"),
    ("C2", "summary.csv", "cs5", "2025-07-13"),
    ("C3", "upload1.csv", "cs6", "2025-07-13"),
    ("C3", "upload2.csv", "cs7", "2025-07-13"),
    ("C4", "a.csv", "cs8", "2025-07-13"),
    ("C4", "b.csv", "cs9", "2025-07-13"),
    ("C5", "test1.csv", "cs10", "2025-07-13"),
    ("C5", "test2.csv", "cs11", "2025-07-13")
]

day2_data = [
    ("C1", "file1.csv", "cs1", "2025-07-14"),
    ("C1", "file4.csv", "cs12", "2025-07-14"),
    ("C2", "report.csv", "cs13", "2025-07-14"),
    ("C2", "summary.csv", "cs14", "2025-07-14"),
    ("C3", "new1.csv", "cs6", "2025-07-14"),
    ("C3", "new2.csv", "cs7", "2025-07-14"),
    ("C4", "c.csv", "cs15", "2025-07-14"),
    ("C4", "d.csv", "cs16", "2025-07-14"),
    ("C5", "test3.csv", "cs17", "2025-07-14"),
    ("C6", "z1.csv", "cs18", "2025-07-14"),
    ("C6", "z2.csv", "cs19", "2025-07-14")
]



In [26]:
columns = ["client_id", "file_name", "check_sum", "timestamp"]

# Create day1 and day2 DataFrames
df_day1 = spark.createDataFrame(day1_data, columns).withColumn("day", lit("day1"))
df_day2 = spark.createDataFrame(day2_data, columns).withColumn("day", lit("day2"))

combined_df = df_day1.unionByName(df_day2)

combined_df.show(truncate=False)

+---------+-----------+---------+----------+----+
|client_id|file_name  |check_sum|timestamp |day |
+---------+-----------+---------+----------+----+
|C1       |file1.csv  |cs1      |2025-07-13|day1|
|C1       |file2.csv  |cs2      |2025-07-13|day1|
|C1       |file3.csv  |cs3      |2025-07-13|day1|
|C2       |report.csv |cs4      |2025-07-13|day1|
|C2       |summary.csv|cs5      |2025-07-13|day1|
|C3       |upload1.csv|cs6      |2025-07-13|day1|
|C3       |upload2.csv|cs7      |2025-07-13|day1|
|C4       |a.csv      |cs8      |2025-07-13|day1|
|C4       |b.csv      |cs9      |2025-07-13|day1|
|C5       |test1.csv  |cs10     |2025-07-13|day1|
|C5       |test2.csv  |cs11     |2025-07-13|day1|
|C1       |file1.csv  |cs1      |2025-07-14|day2|
|C1       |file4.csv  |cs12     |2025-07-14|day2|
|C2       |report.csv |cs13     |2025-07-14|day2|
|C2       |summary.csv|cs14     |2025-07-14|day2|
|C3       |new1.csv   |cs6      |2025-07-14|day2|
|C3       |new2.csv   |cs7      |2025-07-14|day2|


In [28]:
# 2. Pivot by day and collect check_sums as arrays
grouped_df = (
    combined_df
    .groupBy("client_id")
    .pivot("day", ["day1", "day2"])
    .agg(collect_list("check_sum").alias("check_sums"))
)

grouped_df.show(truncate=False)

+---------+---------------+------------+
|client_id|day1           |day2        |
+---------+---------------+------------+
|C1       |[cs1, cs2, cs3]|[cs1, cs12] |
|C2       |[cs4, cs5]     |[cs13, cs14]|
|C3       |[cs6, cs7]     |[cs6, cs7]  |
|C4       |[cs8, cs9]     |[cs15, cs16]|
|C5       |[cs10, cs11]   |[cs17]      |
|C6       |[]             |[cs18, cs19]|
+---------+---------------+------------+



In [29]:
# 3. Filter for clients who have at least 2 check_sums on both days
regular_candidates = (
    grouped_df
    .filter(
        (size(col("day1")) >= 2) &
        (size(col("day2")) >= 2)
    )
)

regular_candidates.show(truncate=False)

+---------+---------------+------------+
|client_id|day1           |day2        |
+---------+---------------+------------+
|C1       |[cs1, cs2, cs3]|[cs1, cs12] |
|C2       |[cs4, cs5]     |[cs13, cs14]|
|C3       |[cs6, cs7]     |[cs6, cs7]  |
|C4       |[cs8, cs9]     |[cs15, cs16]|
+---------+---------------+------------+



In [34]:
# 4. Identify differences and regular clients
regular_clients = (
    regular_candidates
    .withColumn("diff", array_except(col("day2"), col("day1")))
    .withColumn("is_regular", when(size(col("diff")) >= 2, True).otherwise(False))
)

regular_clients.show(truncate=False)

+---------+---------------+------------+------------+----------+
|client_id|day1           |day2        |diff        |is_regular|
+---------+---------------+------------+------------+----------+
|C1       |[cs1, cs2, cs3]|[cs1, cs12] |[cs12]      |false     |
|C2       |[cs4, cs5]     |[cs13, cs14]|[cs13, cs14]|true      |
|C3       |[cs6, cs7]     |[cs6, cs7]  |[]          |false     |
|C4       |[cs8, cs9]     |[cs15, cs16]|[cs15, cs16]|true      |
+---------+---------------+------------+------------+----------+



In [32]:
# 5. final result
regular_clients.select("client_id", "day1", "day2", "diff", "is_regular").show(truncate=False)

+---------+---------------+------------+------------+----------+
|client_id|day1           |day2        |diff        |is_regular|
+---------+---------------+------------+------------+----------+
|C1       |[cs1, cs2, cs3]|[cs1, cs12] |[cs12]      |false     |
|C2       |[cs4, cs5]     |[cs13, cs14]|[cs13, cs14]|true      |
|C3       |[cs6, cs7]     |[cs6, cs7]  |[]          |false     |
|C4       |[cs8, cs9]     |[cs15, cs16]|[cs15, cs16]|true      |
+---------+---------------+------------+------------+----------+

