In [0]:
import pandas as pd

base_url = "https://raw.githubusercontent.com/sriram1105-m/Customer-Intelligence-Platform/main/data/final/"

# Load customer segments table
file_name = "customer_scores.csv"
pdf = pd.read_csv(base_url + file_name)
customer_scores_df = spark.createDataFrame(pdf)

# Quick check
print("Customer Scores Table Loaded:")
print(f"Rows: {customer_scores_df.count()} | Columns: {len(customer_scores_df.columns)}")

Customer Scores Table Loaded:
Rows: 15994 | Columns: 21


In [0]:
import pandas as pd

base_url = "https://raw.githubusercontent.com/sriram1105-m/Customer-Intelligence-Platform/main/data/cleaned/"

# Load Transactions Cleaned Data and Products Cleaned Data
# Transaction data
transactions_file = "transactions_clean.csv"
pdf_transactions = pd.read_csv(base_url + transactions_file)
transactions_clean_df = spark.createDataFrame(pdf_transactions)

# Products data
products_file = "products_clean.csv"
pdf_products = pd.read_csv(base_url + products_file)
products_clean_df = spark.createDataFrame(pdf_products)

# Quick Check - Row Counts
print(f"Transactions Rows: {transactions_clean_df.count()}")
print(f"Products Rows: {products_clean_df.count()}")

# Optional: Preview top rows
display(transactions_clean_df.limit(5))
display(products_clean_df.limit(5))

Transactions Rows: 50000
Products Rows: 500


transaction_id,customer_id,product_id,quantity,price,amount,payment_method,transaction_date
00010409-9665-4cc8-8fbc-d6d86970941d,b2e44849-5c5a-4cd9-9084-4ed0625e53c8,P0444,4,367.87,1471.48,Wallet,2025-07-16
0005183c-f099-4410-9500-43f7ea50341a,dcbee60d-1164-4eda-b866-f9483302c5b0,P0071,3,341.04,1023.12,Cash,2023-09-22
00060056-d7c3-4ffd-b513-d7f58f8b3f00,4cac1f02-b9bf-48bf-9ca0-932f12a0955f,P0236,5,85.96,429.8,Wallet,2024-05-18
00061d12-f058-484e-9e12-7b8497008299,a0db6103-f693-4596-ba53-6227102cedfc,P0341,3,208.19,624.57,UPI,2024-09-22
00084816-476f-4e1a-941d-d2cec8bc2347,550a708c-c999-4801-8cdd-cea1a4f58f57,P0116,4,63.2,252.8,Wallet,2025-01-12


product_id,product_name,category,brand,cost_price,selling_price
P0001,Majority,Sports,Weaver LLC,106.89,181.71
P0002,Provide,Home,Garcia-Alvarado,121.83,270.46
P0003,Whatever,Home,Cisneros Group,158.81,198.51
P0004,Scene,Home,"Williams, Santos and Bailey",65.93,153.62
P0005,Beyond,Grocery,Boyd Inc,147.06,167.65


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# 1. Total Revenue & Average Order Value
kpi_totals_df = (customer_scores_df
                 .agg(
                     F.sum("monetary_value").alias("total_revenue"),
                     F.round(F.avg("monetary_value"), 2).alias("avg_order_value")
                     )
                )

# 2. Customer Activity KPIs
total_customers = customer_scores_df.count()
active_customers = customer_scores_df.filter(F.col("segment") != "Churn Risk").count()
loyal_customers = customer_scores_df.filter(F.col("segment").isin("Loyal", "High Value")).count()
churn_customers = customer_scores_df.filter(F.col("segment") == "Churn Risk").count()

kpi_activity_df = spark.createDataFrame([{
    "total_customers" : total_customers,
    "active_customers" : active_customers,
    "active_customer_pct" : round((active_customers / total_customers) * 100, 2),
    "loyal_customers" : loyal_customers,
    "loyal_customer_pct" : round((loyal_customers / total_customers) * 100, 2),
    "churn_customers" : churn_customers,
    "churn_rate_pct" : round((churn_customers / total_customers) * 100, 2)
}])

# 3. Revenue by Segment
kpi_segment_df = (customer_scores_df
                  .groupBy("segment")
                  .agg(
                      F.sum("monetary_value").alias("revenue_by_segment"),
                      F.round(F.avg("monetary_value"), 2).alias("avg_order_value_segment")
                      )
                  .orderBy(F.col("revenue_by_segment").desc())
                  )

# 4. Monthly Revenue Trend
monthly_revenue_df = (transactions_clean_df
                      .withColumn("month", F.date_format("transaction_date", "yyyy-MM"))
                      .groupBy("month")
                      .agg(F.sum("amount").alias("monthly_revenue"))
                      .orderBy("month")
                      )

# 5. Top Customers (Top 10 by spend)
top_customers_df = (customer_scores_df
                    .select("customer_id", "monetary_value")
                    .orderBy(F.col("monetary_value").desc())
                    .limit(10)
                    )

# 6. Top Products (Top 10 by Sales)
top_products_df = (transactions_clean_df
                   .groupBy("product_id")
                   .agg(F.sum("amount").alias("total_sales"))
                   .join(products_clean_df, "product_id", "left")
                   .orderBy(F.col("total_sales").desc())
                   .limit(10)
                   )


In [0]:
# 7. Customer Lifetime Value (CLV)
# Step A: First purchase date per customer
first_purchase_df = (transactions_clean_df
    .groupBy("customer_id")
    .agg(F.min("transaction_date").alias("first_purchase_date"))
)

# Step B: Add months_active for each customer
transactions_with_first = (transactions_clean_df
    .join(first_purchase_df, "customer_id", "left")
    .withColumn("months_active", 
        F.round(F.months_between(F.current_date(), F.col("first_purchase_date")), 1)
    )
)

# Step C: Average Order Value per customer
avg_order_df = (transactions_with_first
    .groupBy("customer_id")
    .agg(F.round(F.avg("amount"), 2).alias("avg_order_value"),
         F.max("months_active").alias("months_active"))
)

# Step D: Purchase frequency per month
purchase_freq_df = (transactions_with_first
    .groupBy("customer_id")
    .agg(F.round(F.count("transaction_id") / F.max("months_active"), 2).alias("purchase_freq_month"))
)

# Step E: CLV = AOV * Frequency * Months Active
clv_df = (avg_order_df
    .join(purchase_freq_df, "customer_id", "left")
    .withColumn("CLV", 
        F.round(F.col("avg_order_value") * F.col("purchase_freq_month") * F.col("months_active"), 2))
    .fillna(0)
)

In [0]:
# 8. Customer Retention Cohort Analysis
# Step A: Extract Cohort Month
transactions_with_cohort = (
    transactions_clean_df
    .withColumn("transaction_month", F.date_format("transaction_date", "yyyy-MM"))
    .withColumn("cohort_month",
        F.date_format(F.min("transaction_date").over(Window.partitionBy("customer_id")), "yyyy-MM")
    )
)

# Step B: Count customers per cohort & month
cohort_counts = (transactions_with_cohort
                 .groupBy("cohort_month", "transaction_month")
                 .agg(F.countDistinct("customer_id").alias("customer_count"))
                 .orderBy("cohort_month", "transaction_month")
                 )

# Step C: Pivot for retention matrix
retention_df = (cohort_counts
                .groupBy("cohort_month")
                .pivot("transaction_month")
                .sum("customer_count")
                .fillna(0)
                )

In [0]:
# 9. Display all KPIs
print(" Total KPIs: ")
display(kpi_totals_df)

print("Activity KPIs: ")
display(kpi_activity_df)

print("Revenueby Segment: ")
display(kpi_segment_df)

print("Monthly Revenue Trend: ")
display(monthly_revenue_df)

print("Top Customers: ")
display(top_customers_df)

print("Top Products: ")
display(top_products_df)

print("Customer Lifetime Value (CLV): ")
display(clv_df)

print("Customer Retention Cohort Matrix: ")
display(retention_df)

 Total KPIs: 


total_revenue,avg_order_value
42923847.289999954,2683.75


Activity KPIs: 


active_customer_pct,active_customers,churn_customers,churn_rate_pct,loyal_customer_pct,loyal_customers,total_customers
55.52,8880,7114,44.48,22.08,3532,15994


Revenueby Segment: 


segment,revenue_by_segment,avg_order_value_segment
Churn Risk,15963913.779999992,2244.01
Regular,13633713.380000006,2549.31
Loyal,12567660.68999999,3679.06
High Value,758559.4399999996,6539.31


Monthly Revenue Trend: 


month,monthly_revenue
2023-09,697626.0599999999
2023-10,1122466.9199999997
2023-11,1117672.9699999995
2023-12,1213990.8499999996
2024-01,1120444.0399999998
2024-02,1126086.5299999998
2024-03,1103249.2599999995
2024-04,1103712.6299999997
2024-05,1069540.22
2024-06,1092769.5899999994


Top Customers: 


customer_id,monetary_value
d551c651-ed73-437b-a0fa-0431eec32330,10909.65
4b0ca07d-5d92-4ebc-9cdd-a8d5e7f4de7d,10537.27
ef4e70f5-7dce-4781-a520-d07f58ed36d5,10223.07
3dc1ac29-f1a2-4eba-94d3-b75bf01c47e5,10063.8
68ab34bd-80d7-46e3-a92e-5abca973d0c8,9888.61
21375e94-505f-4957-b1ce-fe0cd9879ba0,9831.39
ee660523-efb6-4261-b1b9-aa1fabc06d2a,9701.92
567c1b55-b9d0-4030-963e-72688fd8b569,9593.329999999998
5f59d385-aa45-41b9-8f95-285a5f4f1e1a,9274.27
5f59d385-aa45-41b9-8f95-285a5f4f1e1a,9274.27


Top Products: 


product_id,total_sales,product_name,category,brand,cost_price,selling_price
P0083,156613.6,Rate,Grocery,Shaw Inc,171.37,423.28
P0397,147094.88,Enjoy,Home,"Allen, Baker and Weaver",186.86,448.46
P0348,142252.6,Yard,Sports,"Ingram, Harris and Sanchez",197.74,472.6
P0142,139183.19999999998,Gun,Sports,"Hines, Norris and Thompson",177.35,386.62
P0491,137929.02000000002,Real,Sports,Hendrix-Ruiz,164.4,389.63
P0107,137200.88,Town,Home,Patel-Arnold,189.6,434.18
P0287,136616.48,Bit,Home,Cortez Inc,179.58,443.56
P0368,135696.33000000002,Might,Grocery,Washington and Sons,190.37,456.89
P0191,134150.84,Chance,Clothing,"Marquez, Johnson and Wallace",195.56,353.96
P0036,132634.68,Father,Home,Herrera-Johnston,180.92,419.73


Customer Lifetime Value (CLV): 


customer_id,avg_order_value,months_active,purchase_freq_month,CLV
d8a580b8-4c19-46ba-afaa-e07ac2666d3f,725.64,16.0,0.13,1509.33
59162ae4-44e3-49e2-9c52-3e897c8292b8,547.01,20.2,0.54,5966.79
c866be64-36d4-45b1-864a-2b1165ae6198,557.29,21.4,0.28,3339.28
6acb9ae8-6732-49a0-ad77-f4720e6a9efa,332.7,12.6,0.4,1676.81
557c1ec2-d512-4500-820f-55e28b1dc300,404.52,21.6,0.28,2446.54
96fd27b9-3f67-4748-a90c-281e7eaefc1f,249.87,1.4,0.71,248.37
5b452ab5-3f2a-4ac5-bd2e-23812c01b811,705.44,13.0,0.31,2842.92
2130fc55-edc4-4272-b77d-e4c3c35cccb7,439.12,14.9,0.27,1766.58
899ba871-9497-4f48-beae-6ff5fd1155fa,761.63,21.9,0.18,3002.35
bc52aab0-71e3-4315-ad57-3c6a4c41c211,693.4,20.1,0.3,4181.2


Customer Retention Cohort Matrix: 


cohort_month,2023-09,2023-10,2023-11,2023-12,2024-01,2024-02,2024-03,2024-04,2024-05,2024-06,2024-07,2024-08,2024-09,2024-10,2024-11,2024-12,2025-01,2025-02,2025-03,2025-04,2025-05,2025-06,2025-07,2025-08,2025-09
2023-09,1201,211,199,233,236,238,203,214,226,240,242,239,213,238,218,193,231,234,230,226,210,227,225,227,84
2023-10,0,1691,325,350,300,309,319,321,298,294,328,311,320,341,315,324,324,296,300,304,334,329,319,311,107
2023-11,0,0,1315,230,242,224,229,250,240,238,250,285,240,283,228,265,239,232,235,252,249,226,261,243,97
2023-12,0,0,0,1152,213,219,203,243,209,216,231,223,214,215,213,229,233,221,202,223,253,217,193,209,73
2024-01,0,0,0,0,900,174,172,186,147,152,168,165,168,180,150,181,173,165,173,172,165,158,176,155,70
2024-02,0,0,0,0,0,692,147,123,120,142,144,137,144,117,128,124,138,132,120,126,136,133,146,144,45
2024-03,0,0,0,0,0,0,598,118,126,110,98,114,118,114,110,104,123,98,120,121,114,95,102,117,31
2024-04,0,0,0,0,0,0,0,411,75,73,69,81,72,79,83,99,77,92,73,73,72,79,86,75,34
2024-05,0,0,0,0,0,0,0,0,383,66,62,74,54,66,77,61,70,71,68,64,72,70,80,76,48
2024-06,0,0,0,0,0,0,0,0,0,325,61,74,56,68,68,57,59,54,69,67,46,57,55,63,27


In [0]:
# Save all KPI tables separately
kpi_totals_df.toPandas().to_csv("kpi_totals.csv", index=False)
kpi_activity_df.toPandas().to_csv("kpi_activity.csv", index=False)
kpi_segment_df.toPandas().to_csv("kpi_revenue_segment.csv", index=False)
monthly_revenue_df.toPandas().to_csv("kpi_monthly_revenue.csv", index=False)
top_customers_df.toPandas().to_csv("kpi_top_customers.csv", index=False)
top_products_df.toPandas().to_csv("kpi_top_products.csv", index=False)
clv_df.toPandas().to_csv("kpi_clv.csv", index=False)
retention_df.toPandas().to_csv("kpi_retention.csv", index=False)

print("All KPI tables saved separately!")


All KPI tables saved separately!


In [0]:
from pyspark.sql import functions as F

# Get all columns except cohort_month safely
numeric_cols = [c for c in retention_df.columns if c != "cohort_month"]

# If cohort_month is not present, numeric_cols will just have month columns like 2023-09, etc.
if "cohort_month" not in retention_df.columns:
    numeric_cols = retention_df.columns  # use all month columns

# Calculate retention rate = (sum of all cells) / (number of cohorts * first month customers)
total_customers = retention_df.select(F.sum(F.col(numeric_cols[0]))).collect()[0][0]
total_values = retention_df.select([F.sum(F.col(c)) for c in numeric_cols]).collect()
sum_all = sum([r[0] for r in total_values])
num_periods = len(numeric_cols)

# Average retention across periods
retention_rate = round(sum_all / (total_customers * num_periods), 2)
print("Retention Rate:", retention_rate)


Retention Rate: 0.04


In [0]:
# Combining all the KPIs into a single Master KPI Dataset
from pyspark.sql import functions as F

final_kpis_df = spark.createDataFrame([{
    "total_customers": kpi_activity_df.agg(F.sum("total_customers")).collect()[0][0],
    "active_customers": kpi_activity_df.agg(F.sum("active_customers")).collect()[0][0],
    "active_customer_pct": round(kpi_activity_df.agg(F.avg("active_customer_pct")).collect()[0][0], 2),
    "loyal_customers": kpi_activity_df.agg(F.sum("loyal_customers")).collect()[0][0],
    "loyal_customer_pct": round(kpi_activity_df.agg(F.avg("loyal_customer_pct")).collect()[0][0], 2),
    "churn_customers": kpi_activity_df.agg(F.sum("churn_customers")).collect()[0][0],
    "churn_rate_pct": round(kpi_activity_df.agg(F.avg("churn_rate_pct")).collect()[0][0], 2),
    "total_revenue": kpi_segment_df.agg(F.sum("revenue_by_segment")).collect()[0][0],
    "avg_order_value": kpi_segment_df.agg(F.avg("avg_order_value_segment")).collect()[0][0],
    "avg_clv": round(clv_df.agg(F.avg("CLV")).collect()[0][0], 2),
    "retention_rate": retention_rate
}])


In [0]:
# Save Final KPIs Table
final_kpis_df.toPandas().to_csv("customer_kpis.csv", index=False)

print("Final KPIs Table saved as customer_kpis.csv")
display(final_kpis_df)

Final KPIs Table saved as customer_kpis.csv


active_customer_pct,active_customers,avg_clv,avg_order_value,churn_customers,churn_rate_pct,loyal_customer_pct,loyal_customers,retention_rate,total_customers,total_revenue
55.52,8880,2728.84,3752.9225,7114,44.48,22.08,3532,0.04,15994,42923847.289999984


In [0]:
# Count rows and columns for Spark DataFrame
rows = final_kpis_df.count()
cols = len(final_kpis_df.columns)

print(f"Final KPIs Table → {rows} rows × {cols} columns")

Final KPIs Table → 1 rows × 11 columns
