In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, count, sum, avg, from_unixtime, to_timestamp, datediff, get_json_object, from_json,trim
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import StructType, StringType

spark = SparkSession.builder \
    .appName("CafeRewardsDataPipeline") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .getOrCreate()


customers_file = "/Users/rafaelcamara/dev/ballastlane/archive/customers.csv"
offers_file = "/Users/rafaelcamara/dev/ballastlane/archive/offers.csv"
events_file = "/Users/rafaelcamara/dev/ballastlane/archive/events.csv"

print("Ingesting raw data...")
customers_df = spark.read.csv(customers_file, header=True, inferSchema=True)
offers_df = spark.read.csv(offers_file, header=True, inferSchema=True)
raw_events_df = spark.read.csv(events_file, header=True, inferSchema=True)


# Identify if the offer was completed
events_df = raw_events_df.withColumn(
    "offer_completed",
    when(trim(col("event")) == "offer completed", lit(1)).otherwise(lit(0))
)

events_df = events_df.withColumn("json_details", regexp_replace("value", "'", '"'))

# Extract offer_id
events_df = events_df.withColumn("json_clean", regexp_replace("json_details", "'", '"'))
schema = StructType().add("offer id", StringType())
events_df = events_df.withColumn("json_parsed", from_json("json_clean", schema))
events_df = events_df.withColumn("offer_id", col("json_parsed.`offer id`"))
staging_events_df = events_df

#events_df = events_df.na.drop(subset=["customer_id", "offer_id"])

# removing unnecessary columns
#events_df = events_df.drop("value", "json_details", "json_clean", "json_parsed")
events_df = events_df.drop("json_details", "json_clean", "json_parsed")

events_df = events_df.withColumn("event_timestamp_readable", 
                                             from_unixtime(col("time")))
events_df = events_df.withColumn("event_datetime", 
                                             to_timestamp(col("event_timestamp_readable")))

# Convert offer_start_date and offer_end_date to date type
#offers_df = offers_df.withColumn("offer_start_date_dt", to_timestamp(col("offer_start_date"), "yyyy-MM-dd"))
#offers_df = offers_df.withColumn("offer_end_date_dt", to_timestamp(col("offer_end_date"), "yyyy-MM-dd"))

# Join dataframes
# Join events with offers to get offer details for each event
events_with_offers = events_df.join(offers_df, "offer_id", "left")

# Join with customers to get customer demographics
full_df = events_with_offers.join(customers_df, "customer_id", "left")

# Feature Engineering
# Calculate offer duration
#full_df = full_df.withColumn("offer_duration_days", datediff(col("offer_end_date_dt"), col("offer_start_date_dt")))

# Identify completed offers (event within offer period and event_type is \'purchase\')
# Calculate offer end datetime by adding duration (in days) to event_datetime for each offer received

# First, join the duration from offers_df to full_df if not already present
# (Already present as 'duration' in full_df)
#events_df = events_df.withColumn("offer_id", get_json_object(col("value"), "$.\'offer id\'"))



# Analytical Questions
print("\n--- Analytical Results ---")

# 1. Total number of customers
total_customers = customers_df.count()
print(f"Total number of customers: {total_customers}")

# 2. Total number of offers
total_offers = offers_df.count()
print(f"Total number of offers: {total_offers}")

# 3. Total number of events
total_events = events_df.count()
print(f"Total number of events: {total_events}")

# 4. Number of unique customers who made a purchase
unique_purchasing_customers = full_df.filter(col("event") == "transaction") \
                                     .select("customer_id").distinct().count()
print(f"Number of unique customers who made a purchase: {unique_purchasing_customers}")

# 5. Top 5 most popular offers (by number of views or purchases)
# Assuming popularity is based on purchases within the offer period
top_5_offers = full_df.filter(col("offer_completed") == 1) \
                      .groupBy("offer_id", "offer_type") \
                      .agg(count("offer_id").alias("purchase_count")) \
                      .orderBy(col("purchase_count").desc()) \
                      .limit(5)
print("\nTop 5 most popular offers (by number of completed offers):")
top_5_offers.show(truncate=False)

# 6. Average transaction amount
# Filter for purchase events only, extract "amount" from JSON in "value"
average_transaction_amount = events_df.filter(col("event") == "transaction") \
    .agg(avg(get_json_object(col("value"), "$.amount").cast("double"))).collect()[0][0]
print(f"Average transaction amount: {average_transaction_amount:.2f}")

# 7. Number of offers completed vs. not completed
offers_completion_status = events_df.groupBy("offer_completed") \
                                   .agg(count("offer_id").alias("count"))
print("\nNumber of offers completed vs. not completed:")
offers_completion_status.show()

# 8. Distribution of customers by age group
# Assuming age is available in customers_df and is numerical
# For simplicity, let\'s define some age groups
customers_df = customers_df.withColumn("age_group", 
                                     when(col("age") < 25, "<25") \
                                     .when((col("age") >= 25) & (col("age") < 35), "25-34") \
                                     .when((col("age") >= 35) & (col("age") < 45), "35-44") \
                                     .when((col("age") >= 45) & (col("age") < 55), "45-54") \
                                     .when((col("age") >= 55) & (col("age") < 65), "55-64") \
                                     .otherwise("65+"))

age_distribution = customers_df.groupBy("age_group").agg(count("customer_id").alias("customer_count")) \
                               .orderBy("age_group")
print("\nCustomer distribution by age group:")
age_distribution.show()

# 9. Offers with highest completion rate
# Calculate total offers viewed/received and completed offers per offer_id
offer_summary = full_df.groupBy("offer_id", "offer_type") \
                         .agg(count("offer_id").alias("total_events"), 
                              sum(col("offer_completed")).alias("completed_offers"))

from pyspark.sql.functions import when

offer_completion_rate = offer_summary.withColumn(
    "completion_rate",
    when(col("total_events") != 0, (col("completed_offers") / col("total_events")) * 100).otherwise(None)
)

highest_completion_rate_offers = offer_completion_rate.orderBy(col("completion_rate").desc()) \
                                                       .limit(5)
print("\nOffers with highest completion rate:")
highest_completion_rate_offers.show(truncate=False)

# 10. Customer lifetime value (CLV) - simplified: total transaction amount per customer
customer_clv = full_df.filter(col("event") == "transaction") \
                       .groupBy("customer_id") \
                       .agg(sum(get_json_object(col("value"), "$.amount")).alias("total_spent")) \
                       .orderBy(col("total_spent").desc())

events_df = events_df.drop("value")

print("\nTop 10 Customers by Lifetime Value (Total Spent):")
customer_clv.limit(10).show()

spark.stop()

Ingesting raw data...

--- Analytical Results ---
Total number of customers: 17000
Total number of offers: 10
Total number of events: 306534
Number of unique customers who made a purchase: 16578

Top 5 most popular offers (by number of completed offers):
+--------+----------+--------------+
|offer_id|offer_type|purchase_count|
+--------+----------+--------------+
|NULL    |NULL      |0             |
+--------+----------+--------------+

Average transaction amount: 12.78

Number of offers completed vs. not completed:
+---------------+------+
|offer_completed| count|
+---------------+------+
|              1|     0|
|              0|134002|
+---------------+------+


Customer distribution by age group:
+---------+--------------+
|age_group|customer_count|
+---------+--------------+
|    25-34|          1380|
|    35-44|          1869|
|    45-54|          3013|
|    55-64|          3421|
|      65+|          6441|
|      <25|           876|
+---------+--------------+


Offers with highes

                                                                                

+--------------------+------------------+
|         customer_id|       total_spent|
+--------------------+------------------+
|3c8d541112a74af99...|           1608.69|
|f1d65ae63f174b8f8...|1365.6600000000003|
|ae6f43089b674728a...|           1327.74|
|626df8678e2a4953b...|           1321.42|
|73afdeca19e349b98...|           1319.97|
|52959f19113e4241a...|           1292.86|
|ad1f0a409ae642bc9...|1258.1899999999998|
|d240308de0ee4cf8b...|           1251.99|
|946fc0d3ecc4492aa...|            1232.4|
|6406abad8e2c4b858...|           1211.76|
+--------------------+------------------+

