In [0]:
import pandas as pd

base_url = "https://raw.githubusercontent.com/sriram1105-m/Customer-Intelligence-Platform/main/data/final/"

# Load customer segments table
file_name = "customer_segments.csv"
pdf = pd.read_csv(base_url + file_name)
customer_segments_df = spark.createDataFrame(pdf)

# Quick check
print(f"Rows: {customer_segments_df.count()} | Columns: {len(customer_segments_df.columns)}")
display(customer_segments_df.limit(5))

Rows: 15994 | Columns: 17


customer_id,recency_days,frequency,monetary_value,avg_order_value,payment_method,total_returns,return_rate,total_sessions,active_days,engagement_score,total_tickets,avg_resolution_days,open_tickets,top_category,top_brand,segment
000f8432-1236-4c8e-af82-80b824ebacd9,212,4,1739.75,434.94,Cash,0,0.0,2,2,0,0,0.0,0,Home,Coleman-Wang,Churn Risk
0061a414-3fea-40dd-9bf7-e35e8eac931f,29,2,508.26,254.13,UPI,0,0.0,2,2,0,0,0.0,0,Clothing,"Gibson, Robinson and Moore",Regular
0061a414-3fea-40dd-9bf7-e35e8eac931f,29,2,508.26,254.13,Wallet,0,0.0,2,2,0,0,0.0,0,Clothing,"Gibson, Robinson and Moore",Regular
006cf236-8150-457a-815b-0017d356d868,243,3,2854.94,951.65,UPI,0,0.0,3,3,0,0,0.0,0,Clothing,Fischer Group,Churn Risk
00c00081-3d80-40f3-a2ba-22491aca1957,203,5,3219.4400000000005,643.89,Cash,0,0.0,7,7,0,1,0.0,1,Home,Becker-Jones,Churn Risk


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# 1. Assigning the number of scoring buckets
NUM_BUCKETS = 5

# 2. RFM Scoring using quantiles
# Recency: (Lower recency_days = better score)
recency_window = Window.orderBy(F.col("recency_days"))

customer_scores_df = customer_segments_df.withColumn(
    "R_score",
    F.ntile(NUM_BUCKETS).over(recency_window)
).withColumn("R_score", (NUM_BUCKETS + 1 - F.col("R_score")))  # Reverse so recent gets higher score

# Frequency: (Higher Frequency = better score)
freq_window = Window.orderBy(F.col("frequency").desc())

customer_scores_df = customer_scores_df.withColumn(
    "F_score",
    F.ntile(NUM_BUCKETS).over(freq_window)
)

# Monetary: (Higher spend = better score)
monetary_window = Window.orderBy(F.col("monetary_value").desc())

customer_scores_df = customer_scores_df.withColumn(
    "M_score",
    F.ntile(NUM_BUCKETS).over(monetary_window)
)

# 3. Weighted Overall Score
customer_scores_df = customer_scores_df.withColumn(
    "overall_score",
    F.round(
        0.4 * F.col("M_score") +
        0.3 * F.col("R_score") +
        0.3 * F.col("F_score"), 2
    )
)

# 4. Quick Check
display(customer_scores_df.select("customer_id", "R_score", "F_score", "M_score", "overall_score", "segment").limit(10))




customer_id,R_score,F_score,M_score,overall_score,segment
d551c651-ed73-437b-a0fa-0431eec32330,4,1,1,1.9,Loyal
4b0ca07d-5d92-4ebc-9cdd-a8d5e7f4de7d,5,1,1,2.2,High Value
ef4e70f5-7dce-4781-a520-d07f58ed36d5,5,1,1,2.2,Loyal
3dc1ac29-f1a2-4eba-94d3-b75bf01c47e5,5,1,1,2.2,High Value
68ab34bd-80d7-46e3-a92e-5abca973d0c8,5,1,1,2.2,High Value
21375e94-505f-4957-b1ce-fe0cd9879ba0,5,1,1,2.2,High Value
ee660523-efb6-4261-b1b9-aa1fabc06d2a,4,1,1,1.9,Loyal
567c1b55-b9d0-4030-963e-72688fd8b569,4,1,1,1.9,Loyal
5f59d385-aa45-41b9-8f95-285a5f4f1e1a,3,1,1,1.6,Regular
5f59d385-aa45-41b9-8f95-285a5f4f1e1a,3,1,1,1.6,Regular


In [0]:
# Saving the Customer Scoring Table

customer_scores_df.toPandas().to_csv("customer_scores.csv", index=False)

