In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

In [2]:
# Location of the raw transaction export, relative to the notebook's working directory.
path = "./DataSources/before_analysis.csv"

# Read the CSV into the transaction-level frame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,transaction_id,uid,order_value,created_at,merchant_name,category
0,464050142633304,VOXLAT0006010732025283345,22.08,2024-02-19 03:57:48,MILKRUN New Zealand,Retailers
1,304050138039532,VOXLAT0006010732026096480,36.99,2024-02-19 03:50:07,Cotton On New Zealand,Fashion & Retail
2,384050128359200,VOXLAT0006010732026401300,180.44,2024-02-19 03:33:59,MILKRUN New Zealand,Retailers
3,384050123292049,VOXLAT0006010732021856243,270.0,2024-02-19 03:25:33,Foot Locker New Zealand,Fashion & Retail
4,584050056781674,VOXLAT0006010732020290139,105.95,2024-02-19 01:34:43,Sportsfuel New Zealand,Retailers


## RFM calculation for each unique customer

In [4]:
# Customer-level RFM table: one row per uid with recency, frequency and monetary.

# Parse the timestamp column so it supports date arithmetic.
df['created_at'] = pd.to_datetime(df['created_at'])

# Snapshot date that recency is measured against.
reference_date = datetime(2024, 2, 28)

# One groupby handle, reused for both aggregations below.
per_customer = df.groupby('uid')

# Frequency = number of transactions, Monetary = summed order value, per customer.
fm_values = per_customer.agg(
    frequency=('uid', 'count'),
    monetary=('order_value', 'sum'),
).reset_index()

# Recency = whole days between the customer's latest transaction and the snapshot date.
recency_values = (
    per_customer['created_at']
    .max()
    .rename('most_recent_purchase')
    .reset_index()
)
recency_values['recency'] = (reference_date - recency_values['most_recent_purchase']).dt.days

# Join the two per-customer frames; the raw timestamp was only needed to derive
# 'recency', so it is dropped from the final table.
rfm_table = (
    recency_values
    .merge(fm_values, on='uid', how='left')
    .drop(columns=['most_recent_purchase'])
)

rfm_table

Unnamed: 0,uid,recency,frequency,monetary
0,1897bfcf9c2bcab8462930b3d0d952f35cb6,66,1,317.00
1,56cc9e626b422ecbe87247f6f0dffc289bce,341,2,94.08
2,VOXLAT0006010732020002203,151,1,63.92
3,VOXLAT0006010732020002641,19,1,290.00
4,VOXLAT0006010732020003193,227,1,92.55
...,...,...,...,...
4463,VOXLAT0006010732026626773,26,2,138.51
4464,VOXLAT0006010732026628894,47,1,24.99
4465,VOXLAT0006010732026629546,76,1,127.42
4466,VOXLAT0006010732026630502,63,4,926.48


# **RFM calculation for every transaction**
Recency is computed per transaction, while frequency and monetary remain the customer-level totals joined back onto each transaction row.

In [5]:
# Transaction-level RFM: every transaction row gets its own recency plus the
# frequency / monetary totals of the customer (uid) who made it.

# Parse the timestamp column so it supports date arithmetic (no-op if already parsed).
df['created_at'] = pd.to_datetime(df['created_at'])

# Define the reference date
reference_date = datetime(2024, 2, 28)

# Work on a derived frame instead of writing new columns onto `df`.
# Any RFM columns already present on `df` are dropped first: if they are there
# (e.g. another cell has written them onto `df`), the merge below would emit
# frequency_x / frequency_y and the column selection would raise KeyError,
# making this cell impossible to re-run.
transactions = df.drop(columns=['recency', 'frequency', 'monetary'], errors='ignore')

# Calculate Frequency and Monetary values for each customer
fm_values = (
    transactions
    .groupby('uid')
    .agg(frequency=('uid', 'count'), monetary=('order_value', 'sum'))
    .reset_index()
)

# Recency for each transaction: whole days between the transaction and the reference date
transactions = transactions.assign(
    recency=(reference_date - transactions['created_at']).dt.days
)

# Join the customer-level totals back onto every transaction of that customer.
# validate='many_to_one' makes pandas raise if `fm_values` ever held a duplicated
# uid, which would otherwise silently multiply transaction rows.
df_rfm = transactions.merge(fm_values, on='uid', how='left', validate='many_to_one')

# Selecting relevant columns to showcase the result
df_rfm_selected = df_rfm[['transaction_id', 'uid', 'created_at', 'recency', 'frequency', 'monetary']]

df_rfm_selected

Unnamed: 0,transaction_id,uid,created_at,recency,frequency,monetary
0,464050142633304,VOXLAT0006010732025283345,2024-02-19 03:57:48,8,1,22.08
1,304050138039532,VOXLAT0006010732026096480,2024-02-19 03:50:07,8,1,36.99
2,384050128359200,VOXLAT0006010732026401300,2024-02-19 03:33:59,8,1,180.44
3,384050123292049,VOXLAT0006010732021856243,2024-02-19 03:25:33,8,3,611.13
4,584050056781674,VOXLAT0006010732020290139,2024-02-19 01:34:43,8,12,768.28
...,...,...,...,...,...,...
9995,583072224012725,VOXLAT0006010732024789201,2023-03-13 06:13:27,351,1,200.00
9996,383072140617909,VOXLAT0006010732021659027,2023-03-13 03:54:30,351,2,196.93
9997,463072111137846,VOXLAT0006010732021914109,2023-03-13 03:05:24,351,8,618.87
9998,463072068007932,VOXLAT0006010732021426575,2023-03-13 01:53:26,351,42,6135.33


# Merging the RFM values with the original dataframe


In [10]:
# Attach the RFM columns to a copy of the transaction frame.
# `df.copy()` instead of `df_rfm = df`: plain assignment only creates a second name
# for the same object, so the column writes below would also land on `df` and
# change what every cell reading `df` sees when it is re-run.
df_rfm = df.copy()

# Column assignment aligns on the index, so each value lands on the row with the
# same index label in `df_rfm_selected`.
for rfm_column in ("recency", "frequency", "monetary"):
    df_rfm[rfm_column] = df_rfm_selected[rfm_column]

# Validate that the merge happened correctly: same number of rows, and every row
# still refers to the same transaction. Fail loudly instead of only printing counts.
assert len(df_rfm) == len(df_rfm_selected), "row count changed while attaching RFM columns"
transaction_ids_match = df_rfm.transaction_id == df_rfm_selected.transaction_id
assert transaction_ids_match.all(), "transaction_id mismatch between df_rfm and df_rfm_selected"
print(transaction_ids_match.value_counts())

# Save csv
# df_rfm.to_csv("./DataSources/df_rfm.csv", index=False)



transaction_id
True    10000
Name: count, dtype: int64


# END