In [11]:
import numpy as np
import pandas as pd

from pymc_marketing.clv import ParetoNBDModel
from pymc_marketing.clv import utils
from pymc_marketing.prior import Prior

First fit a Pareto/NBD model to the CDNOW transactions (it is reused further down), then create a simple dataset for testing:

In [17]:
# Load the CDNOW transaction log from the pymc-marketing repository.
url_cdnow = "https://raw.githubusercontent.com/pymc-labs/pymc-marketing/main/data/cdnow_transactions.csv"
cdnow_transactions = pd.read_csv(url_cdnow)

# Build the per-customer RFM summary. Dates are parsed at daily resolution
# ("D"); time_scaler=7 presumably rescales the time columns from days to
# weeks -- confirm against the rfm_summary docs.
rfm_data = utils.rfm_summary(
    cdnow_transactions,
    customer_id_col="id",
    datetime_col="date",
    datetime_format="%Y%m%d",
    time_unit="D",
    observation_period_end="19970930",
    time_scaler=7,
)

# ParetoNBDModel comes from the notebook's top import cell.
model = ParetoNBDModel(rfm_data)
model.fit()

# Last expression of the cell: the fitted parameter table.
model.fit_summary()

Output()

alpha    10.591
beta      9.756
r         0.560
s         0.550
Name: value, dtype: float64

In [18]:
# Hand-written transaction log, one tuple per purchase:
# (customer id, purchase date, amount spent).
transaction_columns = ["id", "date", "monetary_value"]
transaction_rows = [
    (1, "2015-01-01", 1),
    (1, "2015-02-06", 2),
    (2, "2015-01-01", 2),
    (3, "2015-01-01", 3),
    (3, "2015-01-02", 1),
    (3, "2015-01-05", 5),
    (4, "2015-01-16", 6),
    (4, "2015-02-02", 3),
    (4, "2015-02-05", 3),
    (4, "2015-02-05", 2),  # second purchase by customer 4 on the same day
    (5, "2015-01-16", 3),
    (5, "2015-01-17", 1),
    (5, "2015-01-18", 8),
    (6, "2015-02-02", 5),
]
test_data = pd.DataFrame(transaction_rows, columns=transaction_columns)

Note that customer 4 made two purchases on 2015-02-05.

`_find_first_transactions` flags the first purchase each customer has made, which must be excluded for modeling. It is called internally by `rfm_summary`.

In [19]:
# Flag each customer's first purchase in the toy data.
# The result is shown with its original row labels on purpose: a bare
# `.reindex()` (no arguments) would not change the index anyway, and the gap
# left by the dropped same-day duplicate is exactly what this cell illustrates.
utils._find_first_transactions(
    transactions=test_data,
    customer_id_col="id",
    datetime_col="date",
)

Unnamed: 0,id,date,first
0,1,2015-01-01,True
1,1,2015-02-06,False
2,2,2015-01-01,True
3,3,2015-01-01,True
4,3,2015-01-02,False
5,3,2015-01-05,False
6,4,2015-01-16,True
7,4,2015-02-02,False
8,4,2015-02-05,False
10,5,2015-01-16,True


Notice how **9** is missing from the dataframe index. Multiple transactions in the same time period are treated as a single purchase, so the indices for those additional transactions are skipped. 

`rfm_summary` is the primary data preprocessing step for CLV modeling in the continuous, non-contractual domain:

In [20]:
# Weekly RFM summary of the toy transactions, observed through 2015-02-06.
# NOTE(review): include_first_transaction=True presumably keeps each
# customer's first purchase in the frequency / monetary_value figures --
# confirm against the rfm_summary docs.
rfm_df = utils.rfm_summary(
    test_data,
    customer_id_col="id",
    datetime_col="date",
    datetime_format="%Y-%m-%d",
    monetary_value_col="monetary_value",
    observation_period_end="2015-02-06",
    time_unit="W",
    include_first_transaction=True,
)

rfm_df.head()

Unnamed: 0,customer_id,frequency,recency,monetary_value
0,1,2.0,0.0,1.5
1,2,1.0,5.0,2.0
2,3,2.0,4.0,4.5
3,4,2.0,0.0,7.0
4,5,1.0,3.0,12.0


For MAP fits and covariate models, `rfm_train_test_split` can be used to evaluate models on unseen data. It is also useful for identifying the impact of a time-based event like a marketing campaign.

In [21]:
# Split the toy transactions into a training and a test period;
# train_period_end marks the boundary between the two. The result carries the
# usual RFM columns plus test_* columns for the holdout period.
train_test = utils.rfm_train_test_split(
    test_data,
    customer_id_col="id",
    datetime_col="date",
    monetary_value_col="monetary_value",
    train_period_end="2015-02-01",
)

train_test.head()

Unnamed: 0,customer_id,frequency,recency,T,monetary_value,test_frequency,test_monetary_value,test_T
0,1,0.0,0.0,31.0,0.0,1.0,2.0,5.0
1,2,0.0,0.0,31.0,0.0,0.0,0.0,5.0
2,3,2.0,4.0,31.0,3.0,0.0,0.0,5.0
3,4,0.0,0.0,16.0,0.0,2.0,4.0,5.0
4,5,2.0,2.0,16.0,4.5,0.0,0.0,5.0


`rfm_segments` assigns customers to segments based on their recency, frequency, and monetary value. It uses a quartile-based RFM score approach that is very computationally efficient, but defining custom segments is a rather subjective exercise. The returned dataframe also cannot be used for modeling because it does not zero out the initial transactions.

In [22]:
# Score and segment the toy customers over a weekly window ending 2015-02-06.
segments = utils.rfm_segments(
    test_data,
    customer_id_col="id",
    datetime_col="date",
    datetime_format="%Y-%m-%d",
    monetary_value_col="monetary_value",
    observation_period_end="2015-02-06",
    time_unit="W",
)



In [23]:
# Show the whole segment table; the toy data only has six customers.
segments

Unnamed: 0,customer_id,frequency,recency,monetary_value,rfm_score,segment
0,1,2.0,0.0,1.5,321,Other
1,2,1.0,5.0,2.0,111,Inactive Customer
2,3,2.0,4.0,4.5,122,At Risk Customer
3,4,2.0,0.0,7.0,324,Top Spender
4,5,1.0,3.0,12.0,214,At Risk Customer
5,6,1.0,0.0,5.0,313,Top Spender


`_expected_cumulative_transactions` is a utility function for plotting cumulative transactions over time, which is a useful tool for model evaluation.

In [24]:
# ParetoNBDModel and Prior come from the notebook's top import cell.

# Daily data is rescaled by this factor, and the forecast horizon below is
# expressed as a whole number of these steps.
DAYS_PER_WEEK = 7
N_WEEKS = 25

# The raw transaction log is kept around because the cumulative comparison
# below needs it in addition to the RFM summary.
url_cdnow = "https://raw.githubusercontent.com/pymc-labs/pymc-marketing/main/data/cdnow_transactions.csv"
raw_trans = pd.read_csv(url_cdnow)

rfm_data = utils.rfm_summary(
    raw_trans,
    customer_id_col="id",
    datetime_col="date",
    datetime_format="%Y%m%d",
    time_unit="D",
    observation_period_end="19970930",
    time_scaler=DAYS_PER_WEEK,
)

# Same HalfFlat prior on every model parameter.
model_config = {
    "r_prior": Prior("HalfFlat"),
    "alpha_prior": Prior("HalfFlat"),
    "s_prior": Prior("HalfFlat"),
    "beta_prior": Prior("HalfFlat"),
}

pnbd = ParetoNBDModel(data=rfm_data, model_config=model_config)

pnbd.fit()

# Actual vs. predicted cumulative transactions over N_WEEKS * DAYS_PER_WEEK
# daily periods, reported once per DAYS_PER_WEEK-day step.
df_cum = utils._expected_cumulative_transactions(
    model=pnbd,
    transactions=raw_trans,
    customer_id_col="id",
    datetime_col="date",
    t=N_WEEKS * DAYS_PER_WEEK,
    datetime_format="%Y%m%d",
    time_unit="D",
    time_scaler=DAYS_PER_WEEK,
)
df_cum

Output()

Unnamed: 0,actual,predicted
0,0,4.215266452404725
1,19,16.569582730412932
2,42,37.21457069950916
3,81,66.72145572436291
4,119,105.39241669189728
5,192,153.73378006987227
6,261,210.405988578752
7,351,275.41732383249246
8,428,349.0693288059874
9,504,431.0790584173569


In [31]:
# Pull rows 19-24 (the last six of the 25) from each column as NumPy arrays.
tail_rows = slice(19, 25)
actual = df_cum["actual"].iloc[tail_rows].to_numpy()
predicted = df_cum["predicted"].iloc[tail_rows].to_numpy()

# Inspect the column dtypes of the cumulative-transactions frame.
df_cum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   actual     25 non-null     int64 
 1   predicted  25 non-null     object
dtypes: int64(1), object(1)
memory usage: 528.0+ bytes


Internals of `_expected_cumulative_transactions`:

In [37]:
# `np` comes from the notebook's top import cell.
# Appending a scalar to an empty array gives a 1-D float64 array.
z = np.append(np.array([]), 1)

z

array([1.])

In [40]:
# Step-by-step walk through the body of `utils._expected_cumulative_transactions`.
# `np` / `pd` come from the notebook's top import cell; `pnbd` is the
# Pareto/NBD model fitted in the previous section.
url_cdnow = "https://raw.githubusercontent.com/pymc-labs/pymc-marketing/main/data/cdnow_transactions.csv"
raw_trans = pd.read_csv(url_cdnow)

# function args
transactions = raw_trans
customer_id_col = "id"
datetime_col = "date"
datetime_format = "%Y%m%d"
t = 7
time_unit = "D"
time_scaler = 7
set_index_date = True

# The observation window starts at the earliest transaction and spans `t` periods.
start_date = pd.to_datetime(
    transactions[datetime_col], format=datetime_format
).min()
start_period = start_date.to_period(time_unit)
observation_period_end = start_period + t

# Flag first vs. repeat purchases within the observation window.
repeated_and_first_transactions = utils._find_first_transactions(
    transactions=transactions,
    customer_id_col=customer_id_col,
    datetime_col=datetime_col,
    datetime_format=datetime_format,
    time_unit=time_unit,
    observation_period_end=observation_period_end,
)

first_trans_mask = repeated_and_first_transactions["first"]
repeated_transactions = repeated_and_first_transactions[~first_trans_mask]
first_transactions = repeated_and_first_transactions[first_trans_mask]

# t + 1 consecutive periods, beginning at the first transaction date.
date_range = pd.date_range(start_date, periods=t + 1, freq=time_unit)
date_periods = date_range.to_period(time_unit)

# Accumulated as a float array (see the np.append experiment above).
pred_cum_transactions = np.array([])

# First Transactions on Each Day/Freq
first_trans_size = first_transactions.groupby(datetime_col).size()

# TODO: This loop will require additional work due to modeling differences between pymc-marketing and lifetimes
#       Will need to create a dataframe with unique T for each customer
for i, period in enumerate(date_periods):  # index of period and its date
    # Only evaluate at every `time_scaler`-th period, skipping the start.
    if i % time_scaler == 0 and i > 0:
        # Periods before the one being evaluated
        times = np.array([d.n for d in period - first_trans_size.index])
        times = times[times > 0].astype(float) / time_scaler

        # NOTE(review): the elapsed times double as customer ids here --
        # presumably only a placeholder label; confirm what the model expects.
        pred_data = pd.DataFrame(
            {
                "customer_id": times,
                "t": times,
            }
        )

        expected_trans_agg = pnbd.expected_purchases_new_customer(pred_data)

        # Mask for the number of customers with 1st transactions up to the period
        mask = first_trans_size.index < period
        # Reshaped to (1, 1, n) so it broadcasts along the last axis of
        # `expected_trans_agg` -- assumes that axis is the per-row one; TODO confirm.
        masked_first_trans = first_trans_size[mask].values.reshape(1, 1, -1)
        # ``expected_trans`` is a float with the cumulative sum of expected transactions
        expected_trans = (expected_trans_agg * masked_first_trans).sum()

        pred_cum_transactions = np.append(pred_cum_transactions, expected_trans.values)

# Repeat purchases per period, with zeros for periods that had none.
act_trans = repeated_transactions.groupby(datetime_col).size()
act_tracking_transactions = act_trans.reindex(date_periods, fill_value=0)

# Running total of repeat purchases at the end of each `time_scaler` step.
act_cum_transactions = [
    act_tracking_transactions.iloc[: step * time_scaler].sum()
    for step in range(1, t // time_scaler + 1)
]

# Label each step either by the period it ends on or by a plain counter.
if set_index_date:
    index = date_periods[time_scaler - 1 : -1 : time_scaler]
else:
    index = range(0, t // time_scaler)

df_cum_transactions = pd.DataFrame(
    {"actual": act_cum_transactions, "predicted": pred_cum_transactions},
    index=index,
)

df_cum_transactions.info()


<class 'pandas.core.frame.DataFrame'>
PeriodIndex: 1 entries, 1997-01-07 to 1997-01-07
Freq: D
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   actual     1 non-null      int64  
 1   predicted  1 non-null      float64
dtypes: float64(1), int64(1)
memory usage: 24.0 bytes
