In [1]:
import numpy as np
import pandas as pd
from pymc_marketing.clv import utils

import pytensor
# Tell PyTensor which C++ compiler binary to use.
# NOTE(review): this is an absolute, machine-specific path — confirm it exists
# on the machine running the notebook, or adjust it for your environment.
pytensor.config.cxx = '/usr/bin/clang++'



Load the CDNOW transactions and fit a `BetaGeoModel` for testing (a simple hand-built dataset is created further below):

In [9]:
from pymc_marketing.clv import BetaGeoModel, ParetoNBDModel

# Read the raw CDNOW transaction log from the pymc-marketing repository.
url_cdnow = "https://raw.githubusercontent.com/pymc-labs/pymc-marketing/main/data/cdnow_transactions.csv"
cdnow_transactions = pd.read_csv(url_cdnow)

# Build the RFM summary that is handed to the model below. Dates are parsed
# as YYYYMMDD and the observation window is closed at 1997-09-30.
rfm_data = utils.rfm_summary(
    cdnow_transactions,
    customer_id_col="id",
    datetime_col="date",
    datetime_format="%Y%m%d",
    observation_period_end="19970930",
    time_unit="D",
    time_scaler=1,
)

# Fit the BetaGeoModel with the "map" fit method.
model = BetaGeoModel(data=rfm_data)
model.fit(fit_method="map")

# Last expression of the cell: show the fitted parameter summary.
model.fit_summary()

Output()

a         0.793
alpha    30.895
b         2.426
r         0.243
Name: value, dtype: float64

In [10]:
# Actual vs. predicted cumulative transactions for the fitted model, using the
# same date format and time-unit settings as the RFM summary it was fitted on.
df_cum = utils._expected_cumulative_transactions(
    model=model,
    transactions=cdnow_transactions,
    customer_id_col="id",
    datetime_col="date",
    datetime_format="%Y%m%d",
    time_unit="D",
    time_scaler=1,
    t=14,
    set_index_date=True,
)
df_cum

Unnamed: 0,actual,predicted
1997-01-01,0,0.140646
1997-01-02,0,0.451834
1997-01-03,0,0.892874
1997-01-04,0,1.486012
1997-01-05,0,2.253299
1997-01-06,0,3.208783
1997-01-07,0,4.405474
1997-01-08,3,5.731812
1997-01-09,6,7.233624
1997-01-10,7,8.862547


In [None]:
# Toy transaction log: one [customer id, purchase date, amount] row per purchase.
d = [
    # customer 1
    [1, "2015-01-01", 1],
    [1, "2015-02-06", 2],
    # customer 2
    [2, "2015-01-01", 2],
    # customer 3
    [3, "2015-01-01", 3],
    [3, "2015-01-02", 1],
    [3, "2015-01-05", 5],
    # customer 4 (two purchases on 2015-02-05)
    [4, "2015-01-16", 6],
    [4, "2015-02-02", 3],
    [4, "2015-02-05", 3],
    [4, "2015-02-05", 2],
    # customer 5
    [5, "2015-01-16", 3],
    [5, "2015-01-17", 1],
    [5, "2015-01-18", 8],
    # customer 6
    [6, "2015-02-02", 5],
]
test_data = pd.DataFrame(
    data=d,
    columns=["id", "date", "monetary_value"],
)

Note customer 4 made two purchases on 2015-02-05. 

`_find_first_transactions` flags the first purchase each customer has made, which must be excluded for modeling. It is called internally by `rfm_summary`.

In [None]:
# Flag each customer's first purchase in the toy transaction log.
# The result is displayed with its original row index on purpose: the
# argument-less `.reindex()` that used to follow this call was a no-op and has
# been dropped, so gaps in the index (same-period purchases that were merged)
# remain visible.
utils._find_first_transactions(
    transactions=test_data,
    customer_id_col="id",
    datetime_col="date",
)

Notice how **9** is missing from the dataframe index. Multiple transactions in the same time period are treated as a single purchase, so the indices for those additional transactions are skipped. 

`rfm_summary` is the primary data preprocessing step for CLV modeling in the continuous, non-contractual domain:

In [None]:
# RFM summary of the toy data in weekly time units, observed through
# 2015-02-06, with include_first_transaction switched on.
rfm_df = utils.rfm_summary(
    test_data,
    customer_id_col="id",
    datetime_col="date",
    monetary_value_col="monetary_value",
    datetime_format="%Y-%m-%d",
    observation_period_end="2015-02-06",
    time_unit="W",
    include_first_transaction=True,
)

rfm_df.head()

For MAP fits and covariate models, `rfm_train_test_split` can be used to evaluate models on unseen data. It is also useful for identifying the impact of a time-based event like a marketing campaign.

In [None]:
# Split the toy transactions into train/test RFM data, with the training
# period ending on 2015-02-01.
train_test = utils.rfm_train_test_split(
    test_data,
    customer_id_col="id",
    datetime_col="date",
    monetary_value_col="monetary_value",
    train_period_end="2015-02-01",
)

train_test.head()

`rfm_segments` assigns customers to segments based on their recency, frequency, and monetary value. It uses a quartile-based RFM score approach that is very computationally efficient, but defining custom segments is a rather subjective exercise. Note also that the returned dataframe cannot be used for modeling because it does not zero out the initial transactions.

In [None]:
# Assign RFM segments to the toy customers, using weekly time units and an
# observation window that ends on 2015-02-06.
segments = utils.rfm_segments(
    test_data,
    customer_id_col="id",
    datetime_col="date",
    monetary_value_col="monetary_value",
    datetime_format="%Y-%m-%d",
    observation_period_end="2015-02-06",
    time_unit="W",
)

In [None]:
# Display the dataframe returned by `utils.rfm_segments` in the previous cell.
segments

`_expected_cumulative_transactions` is a utility function for building cumulative transaction plots over time, which are a useful tool for model evaluation.

In [None]:
from pymc_marketing.clv import ParetoNBDModel
from pymc_marketing.prior import Prior

# Raw CDNOW transaction log.
url_cdnow = "https://raw.githubusercontent.com/pymc-labs/pymc-marketing/main/data/cdnow_transactions.csv"
raw_trans = pd.read_csv(url_cdnow)

# RFM summary with a daily time unit and time_scaler=7.
rfm_data = utils.rfm_summary(
    raw_trans,
    customer_id_col="id",
    datetime_col="date",
    datetime_format="%Y%m%d",
    observation_period_end="19970930",
    time_unit="D",
    time_scaler=7,
)

# Same HalfFlat prior for each of the r, alpha, s and beta parameters.
model_config = {
    f"{param}_prior": Prior("HalfFlat") for param in ("r", "alpha", "s", "beta")
}

pnbd = ParetoNBDModel(data=rfm_data, model_config=model_config)

# No fit_method is passed, so the model's default fitting procedure is used.
pnbd.fit()

# Cumulative transactions over t = 25 * 7 daily periods, with the same
# time_scaler as the RFM summary above.
df_cum = utils._expected_cumulative_transactions(
    model=pnbd,
    transactions=raw_trans,
    customer_id_col="id",
    datetime_col="date",
    datetime_format="%Y%m%d",
    time_unit="D",
    time_scaler=7,
    t=25 * 7,
)
df_cum

In [None]:
# Positional rows 19..24 of the actual and predicted columns, as NumPy arrays.
actual = df_cum["actual"].to_numpy()[19:25]
predicted = df_cum["predicted"].to_numpy()[19:25]

df_cum.info()

Internals of `_expected_cumulative_transactions`:

In [None]:
import numpy as np

# Step-by-step walkthrough of what `utils._expected_cumulative_transactions`
# does. Requires `pnbd` (the fitted ParetoNBDModel) from the earlier cell.

url_cdnow = "https://raw.githubusercontent.com/pymc-labs/pymc-marketing/main/data/cdnow_transactions.csv"
raw_trans = pd.read_csv(url_cdnow)

# function args
transactions = raw_trans
customer_id_col = "id"
datetime_col = "date"
datetime_format = "%Y%m%d"
t = 7
time_unit = "D"
observation_period_end = None  # "19970930"; overwritten below from start date + t
time_scaler = 7
set_index_date = True

# The observation window starts at the earliest transaction and spans `t` periods.
start_date = pd.to_datetime(
    transactions[datetime_col], format=datetime_format
).min()
start_period = start_date.to_period(time_unit)
observation_period_end = start_period + t

repeated_and_first_transactions = utils._find_first_transactions(
    transactions=transactions,
    customer_id_col=customer_id_col,
    datetime_col=datetime_col,
    datetime_format=datetime_format,
    time_unit=time_unit,
    observation_period_end=observation_period_end,
)

# Split the flagged transactions into first purchases and repeat purchases.
first_trans_mask = repeated_and_first_transactions["first"]
repeated_transactions = repeated_and_first_transactions[~first_trans_mask]
first_transactions = repeated_and_first_transactions[first_trans_mask]

# t + 1 consecutive periods beginning at the start date.
date_range = pd.date_range(start_date, periods=t + 1, freq=time_unit)
date_periods = date_range.to_period(time_unit)

pred_cum_transactions = np.array([])

# First Transactions on Each Day/Freq
first_trans_size = first_transactions.groupby(datetime_col).size()

# TODO: This loop will require additional work due to modeling differences between pymc-marketing and lifetimes
#       Will need to create a dataframe with unique T for each customer
for i, period in enumerate(date_periods):  # index of period and its date
    # Only evaluate every `time_scaler`-th period, skipping the very first one.
    if i % time_scaler == 0 and i > 0:
        # Number of periods elapsed between each first-purchase date and the
        # period being evaluated; only strictly earlier dates are kept.
        # (Fix: use the `np` alias — the bare name `numpy` is never imported.)
        times = np.array([offset.n for offset in period - first_trans_size.index])
        times = times[times > 0].astype(float) / time_scaler

        pred_data = pd.DataFrame(
            {
                "customer_id": times,
                "t": times,
            }
        )

        # Average the model's expected purchases over the ("chain", "draw") dims.
        expected_trans_agg = pnbd.expected_purchases_new_customer(pred_data).mean(
            dim=("chain", "draw")
        )

        # Mask for the number of customers with 1st transactions up to the period
        mask = first_trans_size.index < period
        masked_first_trans = first_trans_size[mask].values
        # ``expected_trans`` holds the cumulative sum of expected transactions,
        # weighted by how many customers made their first purchase on each date.
        expected_trans = (expected_trans_agg * masked_first_trans).sum()

        pred_cum_transactions = np.append(pred_cum_transactions, expected_trans.values)

# Observed repeat transactions per period; periods without any count as 0.
act_trans = repeated_transactions.groupby(datetime_col).size()
act_tracking_transactions = act_trans.reindex(date_periods, fill_value=0)

# Cumulative observed transactions at the end of each block of `time_scaler` periods.
act_cum_transactions = [
    act_tracking_transactions.iloc[: j * time_scaler].sum()
    for j in range(1, t // time_scaler + 1)
]

if set_index_date:
    # Label each row with the last period of its `time_scaler`-sized block.
    index = date_periods[time_scaler - 1 : -1 : time_scaler]
else:
    index = range(0, t // time_scaler)

df_cum_transactions = pd.DataFrame(
    {"actual": act_cum_transactions, "predicted": pred_cum_transactions},
    index=index,
)


df_cum_transactions.info()


In [None]:
# `expected_trans_agg` was already averaged over the ("chain", "draw")
# dimensions when it was computed in the loop, so those dimensions no longer
# exist and reducing over them again fails. Display the aggregated values as-is.
expected_trans_agg

In [None]:

# Counts of first transactions per date, restricted by the boolean `mask`
# left over from the last evaluated period of the loop.
first_trans_size.loc[mask].to_numpy()