# RFM Analysis, Clustering & Customer Segmentation

In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Loading Data
df = pd.read_csv("../data/processed/clean_data.csv")
print(df.head().to_string())

  Invoice StockCode                          Description  Quantity          InvoiceDate  Price  Customer ID         Country  IsCancelled  TotalAmount
0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12  2009-12-01 07:45:00   6.95        13085  United Kingdom        False         83.4
1  489434    79323P                   PINK CHERRY LIGHTS        12  2009-12-01 07:45:00   6.75        13085  United Kingdom        False         81.0
2  489434    79323W                  WHITE CHERRY LIGHTS        12  2009-12-01 07:45:00   6.75        13085  United Kingdom        False         81.0
3  489434     22041         RECORD FRAME 7" SINGLE SIZE         48  2009-12-01 07:45:00   2.10        13085  United Kingdom        False        100.8
4  489434     21232       STRAWBERRY CERAMIC TRINKET BOX        24  2009-12-01 07:45:00   1.25        13085  United Kingdom        False         30.0


In [7]:
#Restoring data types from data analysis file
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
df["Customer ID"] = df["Customer ID"].astype('Int64')

print(f"Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")
print(f"Unique customers: {df['Customer ID'].nunique():,}")

Date range: 2009-12-01 07:45:00 to 2011-12-09 12:50:00
Unique customers: 5,942


## RFM Analysis & Customer Segmentation

### Building RFM Features

For RFM Analysis, we mainly need 3 features:
1. Receny
2. Frequency
3. Monetary

We take the reference date to calculate recency to be the last day of data + 1

In [8]:
# Reference date = 1 day after the last transaction in dataset
reference_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)
print(f"Reference date for Recency: {reference_date.date()}")

# RFM computed from NON-CANCELLED transactions only
# # Recency  = days since last purchase
# # Frequency = number of unique purchase invoices
# # Monetary  = total spend from actual purchases

# purchases = df_clean[~df_clean['IsCancelled']].copy()

# rfm = purchases.groupby('Customer ID').agg(
#     Recency=('InvoiceDate', lambda x: (reference_date - x.max()).days),
#     Frequency=('Invoice', 'nunique'),
#     Monetary=('TotalAmount', 'sum')
# ).reset_index()

# print(f"RFM table: {rfm.shape[0]:,} customers\n")
# print(rfm.describe().to_string())

Reference date for Recency: 2011-12-10


In [11]:
#Computing Recency, Frequency and Monetary features

#Only considering the orders that were not cancelled
purchased = df[~df['IsCancelled']].copy()

rfm = purchased.groupby('Customer ID').agg({
    'InvoiceDate': lambda x: (reference_date - x.max()).days,
    'Invoice': 'nunique',
    'TotalAmount': 'sum'
}).reset_index()

rfm.columns = ['Customer ID', 'Recency', 'Frequency', 'Monetary']
print(rfm.head().to_string())

   Customer ID  Recency  Frequency  Monetary
0        12346      326         12  77556.46
1        12347        2          8   4921.53
2        12348       75          5   2019.40
3        12349       19          4   4428.69
4        12350      310          1    334.40


In [18]:
# RFM distributions
fig = make_subplots(rows=1, cols=3,
                    subplot_titles=['Recency (days)', 'Frequency (orders)', 'Monetary (Â£)'])

fig.add_trace(go.Histogram(x=rfm['Recency'], nbinsx=50, 
                           marker_color='#636EFA'), row=1, col=1)
fig.add_trace(go.Histogram(x=rfm[rfm['Frequency'] <= 50]['Frequency'], nbinsx=50, 
                           marker_color='#EF553B'), row=1, col=2)
fig.add_trace(go.Histogram(x=rfm[rfm['Monetary'] < rfm['Monetary'].quantile(0.99)]['Monetary'], 
                           nbinsx=50, marker_color='#00CC96'), row=1, col=3)

fig.update_layout(title_text='RFM Distributions',
                  template='plotly_white', height=400, width=1100, showlegend=False)
fig.show()

print(f"Recency Mean: {rfm['Recency'].mean():.0f} days, Median: {rfm['Recency'].median():.0f} days")
print(f"Frequency Mean: {rfm['Frequency'].mean():.1f} orders, Median: {rfm['Frequency'].median():.0f} orders")
print(f"Monetary Mean: Â£{rfm['Monetary'].mean():.2f}, Median: Â£{rfm['Monetary'].median():.2f}")

Recency Mean: 201 days, Median: 96 days
Frequency Mean: 6.3 orders, Median: 3 orders
Monetary Mean: Â£2954.40, Median: Â£865.60


### RFM Scoring

In [None]:
# Assigning quartile-based scores (1-4)
# Recency: LOWER is better,  4 = most recent, 1 = least recent
# Frequency: HIGHER is better,  4 = most frequent, 1 = least frequent
# Monetary: HIGHER is better, 4 = highest spender, 1 = lowest spender

rfm['R_Score'] = pd.qcut(rfm['Recency'], q=4, labels=[4, 3, 2, 1])
#since frequency can be tied, ranking them based on order
rfm['F_Score'] = pd.qcut(rfm['Frequency'].rank(method='first'), q=4, labels=[1, 2, 3, 4])
rfm['M_Score'] = pd.qcut(rfm['Monetary'], q=4, labels=[1, 2, 3, 4])

rfm['RFM_Score'] = rfm['R_Score'].astype(int) + rfm['F_Score'].astype(int) + rfm['M_Score'].astype(int)

print("RFM Score Distribution:")
print(rfm['RFM_Score'].value_counts().sort_index().to_string())
print(f"\nScore range: {rfm['RFM_Score'].min()} to {rfm['RFM_Score'].max()}")

RFM Score Distribution:
RFM_Score
3     572
4     572
5     593
6     625
7     592
8     621
9     566
10    556
11    525
12    659

Score range: 3 to 12


In [19]:
# Assigning segment labels based on RFM score combinations
def assign_segment(row):
    r, f, m = int(row['R_Score']), int(row['F_Score']), int(row['M_Score'])
    
    if r >= 3 and f >= 3 and m >= 3:
        return 'Champions'
    elif r >= 3 and f >= 2 and m >= 2:
        return 'Loyal'
    elif r >= 3 and f <= 2:
        return 'New Customers'
    elif r == 2 and f >= 2 and m >= 2:
        return 'At Risk'
    elif r == 2 and f <= 2:
        return 'Need Attention'
    elif r <= 1 and f >= 2:
        return 'Cant Lose Them'
    else:
        return 'Lost'

rfm['Segment'] = rfm.apply(assign_segment, axis=1)

print("Customer Segments:")
segment_counts = rfm['Segment'].value_counts()
for seg, count in segment_counts.items():
    print(f"  {seg:20s} {count:,} customers ({count/len(rfm)*100:.1f}%)")

Customer Segments:
  Champions            1,821 customers (31.0%)
  At Risk              963 customers (16.4%)
  Lost                 812 customers (13.8%)
  Cant Lose Them       688 customers (11.7%)
  Loyal                665 customers (11.3%)
  Need Attention       485 customers (8.2%)
  New Customers        447 customers (7.6%)


In [20]:
# Segmenting profiles average RFM values per segment
segment_profile = rfm.groupby('Segment').agg(
    Customers=('Customer ID', 'count'),
    Avg_Recency=('Recency', 'mean'),
    Avg_Frequency=('Frequency', 'mean'),
    Avg_Monetary=('Monetary', 'mean'),
    Total_Revenue=('Monetary', 'sum')
).sort_values('Avg_Monetary', ascending=False)

segment_profile['Revenue_Share'] = (segment_profile['Total_Revenue'] / 
                                     segment_profile['Total_Revenue'].sum() * 100)

print("Segment Profiles:")
print(segment_profile.round(2).to_string())

Segment Profiles:
                Customers  Avg_Recency  Avg_Frequency  Avg_Monetary  Total_Revenue  Revenue_Share
Segment                                                                                          
Champions            1821        29.30          14.07       7241.26    13186331.03          75.89
At Risk               963       219.52           5.39       2163.41     2083364.38          11.99
Cant Lose Them        688       486.88           3.14       1145.19      787890.88           4.53
Loyal                 665        36.20           2.92       1123.01      746802.29           4.30
New Customers         447        43.59           1.22        346.37      154828.70           0.89
Lost                  812       520.31           1.13        323.93      263033.00           1.51
Need Attention        485       245.35           1.20        314.54      152553.99           0.88


In [21]:
# Visualizing segment profiles
fig = make_subplots(rows=2, cols=2,
                    subplot_titles=['Customers per Segment', 'Avg Recency by Segment',
                                   'Avg Frequency by Segment', 'Avg Monetary by Segment'])

seg_order = segment_profile.index.tolist()
colors = px.colors.qualitative.Set2[:len(seg_order)]

fig.add_trace(go.Bar(x=seg_order, y=segment_profile['Customers'],
                     marker_color=colors), row=1, col=1)
fig.add_trace(go.Bar(x=seg_order, y=segment_profile['Avg_Recency'],
                     marker_color=colors), row=1, col=2)
fig.add_trace(go.Bar(x=seg_order, y=segment_profile['Avg_Frequency'],
                     marker_color=colors), row=2, col=1)
fig.add_trace(go.Bar(x=seg_order, y=segment_profile['Avg_Monetary'],
                     marker_color=colors), row=2, col=2)

fig.update_layout(title_text='Segment Profiles RFM Averages',
                  template='plotly_white', height=700, width=1000, showlegend=False)
fig.show()

In [22]:
# Revenue share by segment treemap
segment_rev = rfm.groupby('Segment').agg(
    Revenue=('Monetary', 'sum'),
    Customers=('Customer ID', 'count')
).reset_index()

fig = px.treemap(segment_rev, path=['Segment'], values='Revenue',
                 color='Revenue', color_continuous_scale='RdYlGn',
                 title='Revenue Share by Customer Segment')
fig.update_layout(height=500, width=800)
fig.show()

Rule based segmentation is great but it somewhat arbitary. Hence, using a clustering approach like K-Means to find natural clusters in data

### K-Means Clustering on RFM

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Log transform to handle skewness (Frequency and Monetary are heavily right skewed)
rfm_for_clustering = rfm[['Recency', 'Frequency', 'Monetary']].copy()
rfm_for_clustering['Frequency'] = np.log1p(rfm_for_clustering['Frequency'])
rfm_for_clustering['Monetary'] = np.log1p(rfm_for_clustering['Monetary'])

# Scale features
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_for_clustering)

print("Scaled RFM stats:")
print(pd.DataFrame(rfm_scaled, columns=['Recency', 'Frequency', 'Monetary']).describe().round(2).to_string())

Scaled RFM stats:
       Recency  Frequency  Monetary
count  5881.00    5881.00   5881.00
mean     -0.00       0.00      0.00
std       1.00       1.00      1.00
min      -0.96      -1.06     -4.89
25%      -0.84      -1.06     -0.70
50%      -0.50      -0.20     -0.04
75%       0.85       0.66      0.65
max       2.57       5.49      4.63


In [24]:
# Elbow Method + Silhouette Score to find optimal K
inertias = []
silhouette_scores = []
K_range = range(2, 11)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(rfm_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(rfm_scaled, labels))
    print(f"K={k}: Inertia={kmeans.inertia_:.0f}, Silhouette={silhouette_score(rfm_scaled, labels):.4f}")

fig = make_subplots(rows=1, cols=2,
                    subplot_titles=['Elbow Method (Inertia)', 'Silhouette Score'])

fig.add_trace(go.Scatter(x=list(K_range), y=inertias, mode='lines+markers',
                         marker_color='#636EFA'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(K_range), y=silhouette_scores, mode='lines+markers',
                         marker_color='#EF553B'), row=1, col=2)

fig.update_xaxes(title_text='Number of Clusters (K)', row=1, col=1)
fig.update_xaxes(title_text='Number of Clusters (K)', row=1, col=2)
fig.update_yaxes(title_text='Inertia', row=1, col=1)
fig.update_yaxes(title_text='Silhouette Score', row=1, col=2)

fig.update_layout(template='plotly_white', height=400, width=1000, showlegend=False,
                  title_text='Optimal Number of Clusters')
fig.show()

K=2: Inertia=8906, Silhouette=0.4182
K=3: Inertia=5744, Silhouette=0.4008
K=4: Inertia=4504, Silhouette=0.3612
K=5: Inertia=3658, Silhouette=0.3658
K=6: Inertia=3102, Silhouette=0.3480
K=7: Inertia=2757, Silhouette=0.3348
K=8: Inertia=2494, Silhouette=0.3155
K=9: Inertia=2314, Silhouette=0.3125
K=10: Inertia=2167, Silhouette=0.3004


From above we can see K = 3 is the elbow point. Hence the optimal k value = 3

In [25]:
optimal_k = 3

kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

print(f"K-Means with K={optimal_k}")
print(f"Silhouette Score: {silhouette_score(rfm_scaled, rfm['Cluster']):.4f}\n")

print("Cluster Distribution:")
for c in sorted(rfm['Cluster'].unique()):
    count = (rfm['Cluster'] == c).sum()
    print(f"  Cluster {c}: {count:,} customers ({count/len(rfm)*100:.1f}%)")

K-Means with K=3
Silhouette Score: 0.4008

Cluster Distribution:
  Cluster 0: 1,682 customers (28.6%)
  Cluster 1: 1,841 customers (31.3%)
  Cluster 2: 2,358 customers (40.1%)


In [26]:
# Cluster profiles compared to RFM segments
cluster_profile = rfm.groupby('Cluster').agg(
    Customers=('Customer ID', 'count'),
    Avg_Recency=('Recency', 'mean'),
    Avg_Frequency=('Frequency', 'mean'),
    Avg_Monetary=('Monetary', 'mean'),
    Total_Revenue=('Monetary', 'sum')
).sort_values('Avg_Monetary', ascending=False)

cluster_profile['Revenue_Share'] = (cluster_profile['Total_Revenue'] / 
                                     cluster_profile['Total_Revenue'].sum() * 100)

print("Cluster Profiles:")
print(cluster_profile.round(2).to_string())

Cluster Profiles:
         Customers  Avg_Recency  Avg_Frequency  Avg_Monetary  Total_Revenue  Revenue_Share
Cluster                                                                                   
0             1682        57.55          15.91       8535.04    14355944.76          82.63
2             2358        91.91           2.93        847.32     1997970.92          11.50
1             1841       473.24           1.79        554.53     1020888.59           5.88


In [27]:
cluster_names = {
    0: 'Champions',      # High frequency, High spend, Recent
    1: 'Lost',       # Very high recency (473 days), Low frequency
    2: 'Active',   # Moderate recency and spending
}

rfm['Cluster_Name'] = rfm['Cluster'].map(cluster_names)

# Cross-tab: RFM segments vs K-Means clusters
print("RFM Segments vs K-Means Clusters:")
cross = pd.crosstab(rfm['Segment'], rfm['Cluster_Name'], margins=True)
print(cross.to_string())

RFM Segments vs K-Means Clusters:
Cluster_Name    Active  Champions  Lost   All
Segment                                      
At Risk            584        243   136   963
Cant Lose Them       2         27   659   688
Champions          417       1404     0  1821
Lost                36          0   776   812
Loyal              657          8     0   665
Need Attention     215          0   270   485
New Customers      447          0     0   447
All               2358       1682  1841  5881


In [28]:
# 3D scatter plot visualizing clusters in RFM space
fig = px.scatter_3d(rfm, 
                    x='Recency', 
                    y='Frequency', 
                    z='Monetary',
                    color='Cluster_Name',
                    # Using log scales makes the clusters much easier to see
                    log_y=True, 
                    log_z=True,
                    title='Customer Clusters in RFM Space (Log Scale for F & M)',
                    opacity=0.6,
                    color_discrete_map={
                        'Champions': '#00CC96',   # Green
                        'Active': '#636EFA', # Blue
                        'Lost': '#EF553B'     # Red
                    })

fig.update_layout(
    height=700, 
    width=1000, 
    template='plotly_white',
    scene=dict(
        xaxis_title='Recency (Days)',
        yaxis_title='Frequency (Log Orders)',
        zaxis_title='Monetary (Log Spend)'
    )
)

fig.show()

### Statistical Testing Across Segments

#### ANOVA
Does monetary value differ significantly across RFM segments?


In [30]:
from scipy import stats

segments = rfm['Segment'].unique()
groups = [rfm[rfm['Segment'] == s]['Monetary'].values for s in segments]

f_stat, p_value = stats.f_oneway(*groups)

all_data = np.concatenate(groups)
grand_mean = all_data.mean()
ss_between = sum(len(g) * (g.mean() - grand_mean)**2 for g in groups)
ss_total = sum((x - grand_mean)**2 for x in all_data)
eta_squared = ss_between / ss_total

print(f"{'='*55}")
print(f"ANOVA: Monetary Value Across RFM Segments")
print(f"{'='*55}")
for seg, g in zip(segments, groups):
    print(f"  {seg:20s}: mean=Â£{g.mean():.2f}, n={len(g)}")
print(f"\nF-statistic: {f_stat:.4f}")
print(f"p-value: {p_value:.2e}")
print(f"Eta-squared: {eta_squared:.4f}")
print(f"Result: {'Reject HÃ¢â significant difference' if p_value < 0.05 else 'Fail to reject HÃ¢'}")

ANOVA: Monetary Value Across RFM Segments
  At Risk             : mean=Â£2163.41, n=963
  Champions           : mean=Â£7241.26, n=1821
  Need Attention      : mean=Â£314.54, n=485
  Loyal               : mean=Â£1123.01, n=665
  Lost                : mean=Â£323.93, n=812
  New Customers       : mean=Â£346.37, n=447
  Cant Lose Them      : mean=Â£1145.19, n=688

F-statistic: 42.1493
p-value: 1.28e-50
Eta-squared: 0.0413
Result: Reject HÃ¢â significant difference


#### t-test
Champions vs Lost â is the monetary difference significant?

In [31]:
champions = rfm[rfm['Segment'] == 'Champions']['Monetary']
lost = rfm[rfm['Segment'] == 'Lost']['Monetary']

stat, p_value = stats.ttest_ind(champions, lost, equal_var=False)
pooled_std = np.sqrt((champions.std()**2 + lost.std()**2) / 2)
cohens_d = (champions.mean() - lost.mean()) / pooled_std
effect = 'Large' if abs(cohens_d) > 0.8 else 'Medium' if abs(cohens_d) > 0.5 else 'Small'

print(f"{'='*55}")
print(f"Welch's t-test: Champions vs Lost")
print(f"{'='*55}")
print(f"Champions: mean=Â£{champions.mean():.2f}, std=Â£{champions.std():.2f}, n={len(champions)}")
print(f"Lost:      mean=Â£{lost.mean():.2f}, std=Â£{lost.std():.2f}, n={len(lost)}")
print(f"t-statistic: {stat:.4f}")
print(f"p-value: {p_value:.2e}")
print(f"Cohen's d: {cohens_d:.4f} ({effect} effect)")

Welch's t-test: Champions vs Lost
Champions: mean=Â£7241.26, std=Â£24798.52, n=1821
Lost:      mean=Â£323.93, std=Â£645.84, n=812
t-statistic: 11.8943
p-value: 1.78e-31
Cohen's d: 0.3943 (Small effect)


#### Chi-square
Is cancellation behavior independent of customer segment?

In [34]:
cancel_by_cust = df[df['IsCancelled']].groupby('Customer ID')['Invoice'].nunique().rename('Cancelled_Invoices')
rfm_cancel = rfm.merge(cancel_by_cust, on='Customer ID', how='left')
rfm_cancel['Cancelled_Invoices'] = rfm_cancel['Cancelled_Invoices'].fillna(0).astype(int)
rfm_cancel['Has_Cancelled'] = (rfm_cancel['Cancelled_Invoices'] > 0).astype(int)

contingency = pd.crosstab(rfm_cancel['Segment'], rfm_cancel['Has_Cancelled'],
                           margins=True)
contingency.columns = ['Never Cancelled', 'Has Cancelled', 'Total']
print("Cancellation by Segment:")
print(contingency.to_string())

chi2, p_val, dof, expected = stats.chi2_contingency(
    pd.crosstab(rfm_cancel['Segment'], rfm_cancel['Has_Cancelled']))

n = len(rfm_cancel)
min_dim = min(pd.crosstab(rfm_cancel['Segment'], rfm_cancel['Has_Cancelled']).shape) - 1
cramers_v = np.sqrt(chi2 / (n * min_dim))

print(f"\nChiÂ² = {chi2:.4f}, p = {p_val:.2e}, Cramer's V = {cramers_v:.4f}")
print(f"Result: {'Reject HÃ¢ cancellation behavior depends on segment' if p_val < 0.05 else 'Fail to reject HÃ¢'}")

Cancellation by Segment:
                Never Cancelled  Has Cancelled  Total
Segment                                              
At Risk                     474            489    963
Cant Lose Them              435            253    688
Champions                   513           1308   1821
Lost                        689            123    812
Loyal                       448            217    665
Need Attention              415             70    485
New Customers               396             51    447
All                        3370           2511   5881

ChiÂ² = 1283.8329, p = 3.42e-274, Cramer's V = 0.4672
Result: Reject HÃ¢ cancellation behavior depends on segment


Champions have higher cancellation rate, that means they buy a lot as well

### Key Takeaways: Customer Segmentation & Statistical Validation

#### 1. RFM & K-Means Integration
* **The "Engine" of the Business:** Both manual RFM scoring and K-Means ($k=3$) identify a "VIP/Champion" group that is the lifeblood of the store. This group accounts for only ~28% of the customer base but generates over **82% of the total revenue**.
* **Model Validation:** The high degree of overlap between manual segments (Champions/Loyal) and K-Means Cluster 0 (VIPs) validates that our behavioral features are highly predictive and consistent.
* **Mathematical vs. Business Logic:** While $k=3$ is mathematically optimal (highest Silhouette score of 0.40), the manual RFM segments provide the granularity needed for specific marketing actions (e.g., distinguishing "New Customers" from "Need Attention").

#### 2. Statistical Significance of Segments
* **Monetary Value (ANOVA):** We successfully rejected the null hypothesis ($p \approx 1.28e-50$), proving that the differences in spending between segments are **statistically significant** and not due to random chance. 
* **The Spending Gap (t-test):** A WelchÃ¢â¬â¢s t-test confirmed a massive gap between "Champions" (mean ~ÃÂ£7,241) and "Lost" customers (mean ~ÃÂ£324). Even though the effect size (Cohen's d: 0.39) is considered small due to the high variance/outliers in the VIP group, the raw financial difference is substantial.

#### 3. Insights on Cancellation Behavior
* **Cancellations are NOT Random:** The Chi-square test ($p \approx 3.42e-274$) strongly indicates that cancellation behavior depends on the customer segment.
* **The "Champion" Paradox:** Interestingly, "Champions" have the highest number of cancellations. This is not necessarily negative; it indicates that our most active customers are also the ones most frequently interacting with the return systemÃ¢â¬âa common pattern in high-volume retail.
* **CramÃÂ©r's V (0.467):** This reflects a **strong association**. It suggests that knowing a customer's segment is a powerful predictor of whether they are likely to cancel or return an item in the future.

#### 4. Strategic Recommendations
* **Protect the VIPs (Cluster 0):** Since they drive 82% of revenue, even a 5% churn in this group would be catastrophic. Implement a loyalty program or dedicated support for this cluster.
* **Convert the "Active / Mid-Value" (Cluster 2):** These are customers who shop recently but not frequently. They represent the biggest "Upsell" opportunity to move them into the Champion tier.
* **Re-evaluate the "Lost" (Cluster 1):** With a mean recency of 473 days, marketing spend on these customers should be minimal. They are likely churned, and reactivation will be high-cost/low-reward.

In [36]:
# Export RFM data 
rfm.to_csv("../data/processed/rfm_segmented.csv", index=False)