## Calculate RFM Metrics

In [2]:
import pandas as pd
from datetime import datetime
import pandas as pd
from datetime import datetime
# Load raw transaction data.
df = pd.read_csv("/content/data.csv")

# Parse timestamps before doing any date arithmetic. Values that cannot be
# parsed become NaT (errors="coerce") and those rows are dropped, so every
# remaining row has a usable transaction time.
df["TransactionStartTime"] = pd.to_datetime(df["TransactionStartTime"], errors="coerce")
df = df.dropna(subset=["TransactionStartTime"])

# Snapshot date = one day after the latest valid transaction. It is computed
# once, from the parsed column, so the max is chronological rather than a
# lexicographic max over raw strings.
snapshot_date = df["TransactionStartTime"].max() + pd.Timedelta(days=1)

# RFM per customer:
#   Recency   - whole days between the snapshot date and the last transaction
#   Frequency - number of transactions
#   Monetary  - sum of Amount (can be negative, as the sample output shows)
# Named aggregation avoids aggregating the grouping key itself and replaces
# the per-group Python lambda with a vectorised subtraction.
rfm = (
    df.groupby("CustomerId")
    .agg(
        LastTransaction=("TransactionStartTime", "max"),
        Frequency=("TransactionStartTime", "count"),
        Monetary=("Amount", "sum"),
    )
    .assign(
        Recency=lambda per_customer: (
            snapshot_date - per_customer["LastTransaction"]
        ).dt.days
    )
    .loc[:, ["Recency", "Frequency", "Monetary"]]
    .reset_index()
)

rfm.head()


Unnamed: 0,CustomerId,Recency,Frequency,Monetary
0,CustomerId_1,84,1,-10000.0
1,CustomerId_10,84,1,-10000.0
2,CustomerId_1001,90,5,20000.0
3,CustomerId_1002,26,11,4225.0
4,CustomerId_1003,12,6,20000.0


### Scale RFM Features

In [3]:
from sklearn.preprocessing import StandardScaler

# Standardise the three RFM columns so that no single metric dominates the
# distance computation purely because of its scale.
rfm_features = rfm[["Recency", "Frequency", "Monetary"]]

scaler = StandardScaler()
scaler.fit(rfm_features)
rfm_scaled = scaler.transform(rfm_features)


### K-Means Clustering

In [4]:
from sklearn.cluster import KMeans

# Segment customers into three groups on the scaled RFM features.
# random_state is fixed so the cluster assignment is repeatable.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases - consider pinning it if labels must be stable
# across environments.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(rfm_scaled)

# One label per row of rfm_scaled, in the same order as the rows of rfm.
rfm["Cluster"] = kmeans.labels_

rfm.head()


Unnamed: 0,CustomerId,Recency,Frequency,Monetary,Cluster
0,CustomerId_1,84,1,-10000.0,0
1,CustomerId_10,84,1,-10000.0,0
2,CustomerId_1001,90,5,20000.0,0
3,CustomerId_1002,26,11,4225.0,2
4,CustomerId_1003,12,6,20000.0,2


### Define High-Risk Cluster

In [5]:
# Mean Recency / Frequency / Monetary for each cluster.
cluster_summary = rfm.groupby("Cluster")[["Recency", "Frequency", "Monetary"]].mean()
print(cluster_summary)

# High-risk cluster = lowest Frequency & Monetary *combined*.
# A plain sort_values(["Frequency", "Monetary"]) is lexicographic: Monetary
# would only matter when two clusters have exactly the same mean Frequency.
# Instead, rank each metric across clusters (1 = lowest) and add the ranks,
# so both metrics contribute to the choice.
risk_rank = (
    cluster_summary[["Frequency", "Monetary"]]
    .rank(method="min")
    .sum(axis=1)
)

# Ties on the combined rank fall back to lowest Frequency, then lowest
# Monetary, which keeps the selection deterministic.
high_risk_cluster = (
    cluster_summary.assign(RiskRank=risk_rank)
    .sort_values(["RiskRank", "Frequency", "Monetary"])
    .index[0]
)

# Binary proxy target: 1 for customers in the high-risk cluster, else 0.
rfm["is_high_risk"] = (rfm["Cluster"] == high_risk_cluster).astype(int)
rfm.head()


           Recency    Frequency      Monetary
Cluster                                      
0        61.859846     7.726699  8.172379e+04
1        29.000000  4091.000000 -1.049000e+08
2        12.716076    34.807692  2.726546e+05


Unnamed: 0,CustomerId,Recency,Frequency,Monetary,Cluster,is_high_risk
0,CustomerId_1,84,1,-10000.0,0,1
1,CustomerId_10,84,1,-10000.0,0,1
2,CustomerId_1001,90,5,20000.0,0,1
3,CustomerId_1002,26,11,4225.0,2,0
4,CustomerId_1003,12,6,20000.0,2,0


### Integrate Target Variable

In [11]:
import warnings

# NOTE(review): processed_df is not created in the cells shown here; it must
# already exist in the kernel. This step also assumes that its column "0"
# holds the customer identifier - confirm against the cell that builds it.
processed_df = processed_df.rename(columns={"0": "CustomerId"})
processed_df["CustomerId"] = processed_df["CustomerId"].astype(str)

# Attach the proxy target. rfm has one row per customer, so the join must be
# many-to-one; validate= makes pandas raise instead of silently duplicating
# rows if that ever stops being true.
final_df = processed_df.merge(
    rfm[["CustomerId", "is_high_risk"]],
    on="CustomerId",
    how="left",
    validate="many_to_one",
)

# A left join leaves is_high_risk as NaN for every row whose CustomerId has no
# match in rfm. Report that loudly rather than writing a partly (or entirely)
# empty target column without notice.
unmatched_rows = int(final_df["is_high_risk"].isna().sum())
if unmatched_rows:
    warnings.warn(
        f"{unmatched_rows} of {len(final_df)} rows have no matching CustomerId "
        "in rfm; is_high_risk is NaN for those rows.",
        stacklevel=2,
    )

# --- Step 5: Save Task 4 output ---
final_df.to_csv("processed_task4.csv", index=False)

In [7]:
# Inspect the column labels of processed_df as a plain Python list.
# NOTE(review): this cell's execution count (7) is lower than that of the
# merge cell above it (11), so the labels printed here may predate the
# rename performed there - re-run top to bottom to confirm.
column_names = processed_df.columns.tolist()
print(column_names)


['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
