In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, count, sum, avg, from_unixtime, to_timestamp, get_json_object, trim, explode, first, from_json
from pyspark.sql.types import StructType, StringType, MapType, StringType
from pyspark.sql.window import Window

spark = SparkSession.builder \
    .appName("CafeRewardsDataPipeline") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .getOrCreate()


customers_file = "/Users/rafaelcamara/dev/ballastlane/archive/customers.csv"
offers_file = "/Users/rafaelcamara/dev/ballastlane/archive/offers.csv"
events_file = "/Users/rafaelcamara/dev/ballastlane/archive/events.csv"

print("Ingesting raw data...")
customers_df = spark.read.csv(customers_file, header=True, inferSchema=True)
offers_df = spark.read.csv(offers_file, header=True, inferSchema=True)
raw_events_df = spark.read.csv(events_file, header=True, inferSchema=True)


# Identify if the offer was completed
events_df = raw_events_df.withColumn(
    "offer_completed",
    when(trim(col("event")) == "offer completed", lit(1)).otherwise(lit(0))
)

# flatten the JSON structure in the "value" column
# Explode the map to key-value pairs

# Parse the JSON string into a Map
value_map_df = events_df.withColumn("value_map", from_json(col("value"), MapType(StringType(), StringType())))

exploded_df = value_map_df.select(
    "customer_id", 
    "event", 
    explode("value_map").alias("key", "val"),
    "time",
    "offer_completed"
)

pivoted_df = exploded_df.groupBy("customer_id", "event", "time","offer_completed") \
    .pivot("key", ["offer_id", "reward", "amount"]) \
    .agg(first("val"))


events_df = pivoted_df.withColumn("event_timestamp_readable", 
                                             from_unixtime(col("time")))
events_df = events_df.withColumn("event_datetime", 
                                             to_timestamp(col("event_timestamp_readable")))

# Join dataframes
# Join events with offers to get offer details for each event
events_with_offers = offers_df.join(events_df, "offer_id", "left")


customer_with_events = customers_df.join(events_df, "customer_id", "left")

# Join with customers to get customer demographics
full_df = customers_df.join(events_with_offers, "customer_id", "left")

# Analytical Questions
print("\n--- Analytical Results ---")

# 1. Total number of customers
total_customers = customers_df.count()
print(f"Total number of customers: {total_customers}")

# 2. Total number of offers
total_offers = offers_df.count()
print(f"Total number of offers: {total_offers}")

# 3. Total number of events
total_events = events_df.count()
print(f"Total number of events: {total_events}")

# 4. Number of unique customers who made a purchase
unique_purchasing_customers = full_df.filter(col("event") == "transaction") \
                                     .select("customer_id").distinct().count()
print(f"Number of unique customers who made a purchase: {unique_purchasing_customers}")

# 5. Top 5 most popular offers (by number of views or purchases)
# Assuming popularity is based on purchases within the offer period
top_5_offers = full_df.filter(col("offer_completed") == 1) \
                      .groupBy("offer_id", "offer_type") \
                      .agg(count("offer_id").alias("purchase_count")) \
                      .orderBy(col("purchase_count").desc()) \
                      .limit(5)
print("\nTop 5 most popular offers (by number of completed offers):")
top_5_offers.show(truncate=False)

# 6. Average transaction amount
# Filter for purchase events only, extract "amount" from JSON in "value"
average_transaction_amount = events_df.filter(col("event") == "transaction") \
    .agg(avg(col("amount").cast("double"))).collect()[0][0]
    #.agg(avg(get_json_object(col("value"), "$.amount").cast("double"))).collect()[0][0]
print(f"Average transaction amount: {average_transaction_amount:.2f}")

# 7. Number of offers completed vs. not completed
completion_window = Window.partitionBy("offer_completed")

offers_completion_status= events_df.select(
    "offer_completed",
    count("*").over(completion_window).alias("completion_count")
).dropDuplicates()
"""
offers_completion_status = events_df.groupBy("offer_completed") \
                                   .agg(count("offer_id").alias("count"))
"""                                   
print("\nNumber of offers completed vs. not completed:")
offers_completion_status.show()

# 8. Distribution of customers by age group
# Assuming age is available in customers_df and is numerical
# For simplicity, let\'s define some age groups
customers_df = customers_df.withColumn("age_group", 
                                     when(col("age") < 25, "<25") \
                                     .when((col("age") >= 25) & (col("age") < 35), "25-34") \
                                     .when((col("age") >= 35) & (col("age") < 45), "35-44") \
                                     .when((col("age") >= 45) & (col("age") < 55), "45-54") \
                                     .when((col("age") >= 55) & (col("age") < 65), "55-64") \
                                     .otherwise("65+"))

age_distribution = customers_df.groupBy("age_group").agg(count("customer_id").alias("customer_count")) \
                               .orderBy("age_group")
print("\nCustomer distribution by age group:")
age_distribution.show()

# 9. Offers with highest completion rate
# Calculate total offers viewed/received and completed offers per offer_id
offer_summary = full_df.groupBy("offer_id", "offer_type") \
                         .agg(count("offer_id").alias("total_events"), 
                              sum(col("offer_completed")).alias("completed_offers"))

offer_completion_rate = offer_summary.withColumn(
    "completion_rate",
    when(col("total_events") != 0, (col("completed_offers") / col("total_events")) * 100).otherwise(None)
)

highest_completion_rate_offers = offer_completion_rate.orderBy(col("completion_rate").desc()) \
                                                       .limit(5)
print("\nOffers with highest completion rate:")
highest_completion_rate_offers.show(truncate=False)

# 10. Customer lifetime value (CLV) - simplified: total transaction amount per customer
customer_clv = customer_with_events.filter(col("event") == "transaction") \
                       .groupBy("customer_id") \
                       .agg(sum(col("amount").cast("double")).alias("total_spent")) \
                       .orderBy(col("total_spent").desc())

print("\nTop 10 Customers by Lifetime Value (Total Spent):")
customer_clv.limit(10).show()

#spark.stop()

Ingesting raw data...

--- Analytical Results ---
Total number of customers: 17000
Total number of offers: 10


                                                                                

Total number of events: 303572


                                                                                

Number of unique customers who made a purchase: 0

Top 5 most popular offers (by number of completed offers):


                                                                                

+--------------------------------+----------+--------------+
|offer_id                        |offer_type|purchase_count|
+--------------------------------+----------+--------------+
|fafdcd668e3743c1bb461111dcafc2a4|discount  |4927          |
|2298d6c36e964ae4a3e7e9706d1fb8c2|discount  |4698          |
|9b98b8c7a33c4b65b9aebfe6a799e6d9|bogo      |3926          |
|f19421c1d4aa40978ebb69ca19b0e20d|bogo      |3877          |
|2906b810c7d4411798c6938adc9daaa5|discount  |3642          |
+--------------------------------+----------+--------------+



                                                                                

Average transaction amount: 12.78

Number of offers completed vs. not completed:


                                                                                

+---------------+----------------+
|offer_completed|completion_count|
+---------------+----------------+
|              0|          272955|
|              1|           30617|
+---------------+----------------+


Customer distribution by age group:
+---------+--------------+
|age_group|customer_count|
+---------+--------------+
|    25-34|          1380|
|    35-44|          1869|
|    45-54|          3013|
|    55-64|          3421|
|      65+|          6441|
|      <25|           876|
+---------+--------------+


Offers with highest completion rate:


                                                                                

+--------------------------------+----------+------------+----------------+---------------+
|offer_id                        |offer_type|total_events|completed_offers|completion_rate|
+--------------------------------+----------+------------+----------------+---------------+
|4d5c57ea9a6940dd891ad53e9dbe8da0|bogo      |2986        |2986            |100.0          |
|9b98b8c7a33c4b65b9aebfe6a799e6d9|bogo      |3926        |3926            |100.0          |
|2298d6c36e964ae4a3e7e9706d1fb8c2|discount  |4698        |4698            |100.0          |
|2906b810c7d4411798c6938adc9daaa5|discount  |3642        |3642            |100.0          |
|fafdcd668e3743c1bb461111dcafc2a4|discount  |4927        |4927            |100.0          |
+--------------------------------+----------+------------+----------------+---------------+


Top 10 Customers by Lifetime Value (Total Spent):


[Stage 64:>                                                         (0 + 6) / 6]

+--------------------+------------------+
|         customer_id|       total_spent|
+--------------------+------------------+
|3c8d541112a74af99...|           1608.69|
|f1d65ae63f174b8f8...|           1365.66|
|ae6f43089b674728a...|           1327.74|
|626df8678e2a4953b...|           1321.42|
|73afdeca19e349b98...|           1319.97|
|52959f19113e4241a...|1292.8600000000001|
|ad1f0a409ae642bc9...|           1258.19|
|d240308de0ee4cf8b...|           1251.99|
|946fc0d3ecc4492aa...|            1232.4|
|6406abad8e2c4b858...|           1211.76|
+--------------------+------------------+



                                                                                

1. Which marketing channel is the most effective in terms of offer completion rate?

In [None]:
from pyspark.sql.functions import from_json, explode, regexp_replace

# Parse channels JSON string to array
channels_schema = ArrayType(StringType())
offers_df = offers_df.withColumn("channels_clean", regexp_replace(col("channels"), "'", '"'))
offers_df = offers_df.withColumn("channels_array", from_json(col("channels_clean"), channels_schema))

# Explode channels into individual rows
offers_exploded = offers_df.withColumn("channel", explode("channels_array"))

# Join exploded channels with events that include offer_id
full_df_with_channels = full_df.join(
    offers_exploded.select("offer_id", "channel"),
    on="offer_id",
    how="left"
)

# Calculate completion rate per channel
channel_completion = full_df_with_channels.groupBy("channel") \
    .agg(
        count("offer_id").alias("total_events"),
        sum("offer_completed").alias("completed_offers")
    ) \
    .withColumn("completion_rate", (col("completed_offers") / col("total_events")) * 100)

print("Completion rate by marketing channel:")
channel_completion.orderBy("completion_rate", ascending=False).show(truncate=False)

🔹 Completion rate by marketing channel:


                                                                                

+-------+------------+----------------+---------------+
|channel|total_events|completed_offers|completion_rate|
+-------+------------+----------------+---------------+
|mobile |27378       |27378           |100.0          |
|email  |30617       |30617           |100.0          |
|social |19810       |19810           |100.0          |
|web    |27295       |27295           |100.0          |
|NULL   |0           |NULL            |NULL           |
+-------+------------+----------------+---------------+



2. How is the age distribution of customers who completed offers vs. those who did not?  

In [None]:
customer_with_events = customer_with_events.withColumn("age_group", 
    when(col("age") < 25, "<25")
    .when((col("age") >= 25) & (col("age") < 35), "25-34")
    .when((col("age") >= 35) & (col("age") < 45), "35-44")
    .when((col("age") >= 45) & (col("age") < 55), "45-54")
    .when((col("age") >= 55) & (col("age") < 65), "55-64")
    .otherwise("65+")
)

age_dist = customer_with_events.groupBy("age_group", "offer_completed") \
    .agg(count("*").alias("count")) \
    .orderBy("age_group", "offer_completed")

print("Age distribution by offer completion status:")
age_dist.show()

🔹 Age distribution by offer completion status:


[Stage 85:>                                                         (0 + 8) / 8]

+---------+---------------+-----+
|age_group|offer_completed|count|
+---------+---------------+-----+
|    25-34|              0|24785|
|    25-34|              1| 2302|
|    35-44|              0|32213|
|    35-44|              1| 3590|
|    45-54|              0|47646|
|    45-54|              1| 6121|
|    55-64|              0|53487|
|    55-64|              1| 7312|
|      65+|              0|99236|
|      65+|              1| 9946|
|      <25|              0|15588|
|      <25|              1| 1346|
+---------+---------------+-----+



                                                                                

In [None]:
3. What is the average time taken by customers to complete an offer after receiving it?

In [None]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType
from pyspark.sql.functions import from_unixtime, to_timestamp
from pyspark.sql.functions import avg


value_schema = StructType().add("offer_id", StringType())

events_df = events_df.withColumn("event_timestamp_readable", from_unixtime(col("time")))
events_df = events_df.withColumn("event_datetime", to_timestamp(col("event_timestamp_readable")))

received_df = events_df.filter(col("event") == "offer received") \
    .select("customer_id", "offer_id", col("event_datetime").alias("received_time"))

completed_df = events_df.filter(col("event") == "offer completed") \
    .select("customer_id", "offer_id", col("event_datetime").alias("completed_time"))

offer_timings = received_df.join(completed_df, ["customer_id", "offer_id"], "inner") \
    .withColumn("completion_duration_hours",
                (col("completed_time").cast("long") - col("received_time").cast("long")) / 3600)

result = offer_timings.agg(avg("completion_duration_hours")).collect()[0][0]

if result is not None:
    print(f" Average time taken to complete an offer: {result:.2f} hours")
else:
    print(" No matches found between offer received and completed events.")

[Stage 129:>                                                        (0 + 7) / 7]

 No matches found between offer received and completed events.


                                                                                