In [5]:
import numpy as np
import pandas as pd
from pymc_marketing.clv import utils
from pymc_marketing.clv import ParetoNBDModel
from pymc_marketing.prior import Prior

In [6]:
import pytensor

# Override the C++ compiler that PyTensor uses (workaround for an open issue).
# NOTE(review): '/usr/bin/clang++' is a hardcoded, machine-specific path -- confirm it
# exists on the machine running this notebook, or remove the override.
pytensor.config.cxx = '/usr/bin/clang++'

Create a simple dataset for testing:

In [7]:
# Toy transaction log, grouped by customer id for readability.
# Each purchase is a (date, amount spent) pair, listed in the order it is recorded.
purchases_by_customer = {
    1: [("2015-01-01", 1), ("2015-02-06", 2)],
    2: [("2015-01-01", 2)],
    3: [("2015-01-01", 3), ("2015-01-02", 1), ("2015-01-05", 5)],
    4: [("2015-01-16", 6), ("2015-02-02", 3), ("2015-02-05", 3), ("2015-02-05", 2)],
    5: [("2015-01-16", 3), ("2015-01-17", 1), ("2015-01-18", 8)],
    6: [("2015-02-02", 5)],
}

# Flatten to one [id, date, monetary_value] row per purchase, preserving the order above.
d = [
    [customer, purchase_date, amount]
    for customer, purchases in purchases_by_customer.items()
    for purchase_date, amount in purchases
]

test_data = pd.DataFrame(d, columns=["id", "date", "monetary_value"])

Note that customer 4 made two purchases on 2015-02-05.

`_find_first_transactions` flags the first purchase each customer has made, which must be excluded for modeling. It is called internally by `rfm_summary`.

In [8]:
# Flag each customer's first purchase in the `first` column.
# The result is shown with the index exactly as returned: the original call ended in
# `.reindex()` with no arguments, which leaves the index unchanged, so it was dropped.
utils._find_first_transactions(
    transactions=test_data,
    customer_id_col="id",
    datetime_col="date",
)

Unnamed: 0,id,date,first
0,1,2015-01-01,True
1,1,2015-02-06,False
2,2,2015-01-01,True
3,3,2015-01-01,True
4,3,2015-01-02,False
5,3,2015-01-05,False
6,4,2015-01-16,True
7,4,2015-02-02,False
8,4,2015-02-05,False
10,5,2015-01-16,True


Notice how **9** is missing from the dataframe index. Multiple transactions in the same time period are treated as a single purchase, so the indices for those additional transactions are skipped. 

`rfm_summary` is the primary data preprocessing step for CLV modeling in the continuous, non-contractual domain:

In [9]:
# Collapse the transaction log to one row per customer, using weekly periods
# (time_unit="W") up to the end of the observation window.
# NOTE(review): include_first_transaction=True counts each customer's first purchase in
# `frequency` -- confirm this is intended for a dataframe described as modeling input.
rfm_df = utils.rfm_summary(
    test_data,
    customer_id_col="id",
    datetime_col="date",
    datetime_format="%Y-%m-%d",
    monetary_value_col="monetary_value",
    time_unit="W",
    observation_period_end="2015-02-06",
    include_first_transaction=True,
)

rfm_df.head()

Unnamed: 0,customer_id,frequency,recency,monetary_value
0,1,2.0,0.0,1.5
1,2,1.0,5.0,2.0
2,3,2.0,4.0,4.5
3,4,2.0,0.0,7.0
4,5,1.0,3.0,12.0


For MAP fits and covariate models, `rfm_train_test_split` can be used to evaluate models on unseen data. It is also useful for identifying the impact of a time-based event like a marketing campaign.

In [10]:
# Split the transaction log at train_period_end: the summary before the cutoff fills the
# training columns, and activity after it is reported in the test_* columns.
train_test = utils.rfm_train_test_split(
    test_data,
    train_period_end="2015-02-01",
    customer_id_col="id",
    datetime_col="date",
    monetary_value_col="monetary_value",
)

train_test.head()

Unnamed: 0,customer_id,frequency,recency,T,monetary_value,test_frequency,test_monetary_value,test_T
0,1,0.0,0.0,31.0,0.0,1.0,2.0,5.0
1,2,0.0,0.0,31.0,0.0,0.0,0.0,5.0
2,3,2.0,4.0,31.0,3.0,0.0,0.0,5.0
3,4,0.0,0.0,16.0,0.0,2.0,4.0,5.0
4,5,2.0,2.0,16.0,4.5,0.0,0.0,5.0


`rfm_segments` assigns customers to segments based on their recency, frequency, and monetary value. It uses a quartile-based RFM scoring approach that is very computationally efficient, although defining custom segments is a rather subjective exercise. The returned dataframe cannot be used for modeling because it does not zero out the initial transactions.

In [11]:
# Assign each customer to an RFM segment, using weekly periods (time_unit="W") up to the
# end of the observation window.
segments = utils.rfm_segments(
    test_data,
    customer_id_col="id",
    datetime_col="date",
    monetary_value_col="monetary_value",
    observation_period_end="2015-02-06",
    datetime_format="%Y-%m-%d",
    time_unit="W",
)

# Display the first rows so the returned dataframe is visible in the notebook;
# previously the result was assigned but never shown.
segments.head()



### Functions Requiring a Fitted Model
`expected_cumulative_transactions` and all plotting functions require a fitted model:

In [119]:
# Load the CDNOW transaction log published in the pymc-marketing repository.
url_cdnow = "https://raw.githubusercontent.com/pymc-labs/pymc-marketing/main/data/cdnow_transactions.csv"
raw_trans = pd.read_csv(url_cdnow)

# Collapse the log to one row per customer, using weekly periods (time_unit="W") up to
# the end of the observation window.
rfm_data = utils.rfm_summary(
    raw_trans,
    customer_id_col="id",
    datetime_col="date",
    datetime_format="%Y%m%d",
    time_unit="W",
    observation_period_end="19970930",
)

# Build and fit the Pareto/NBD model. Later cells read `pnbd.idata` and pass `pnbd` to
# functions that need a fitted model, so the fit must run here for the notebook to work
# from a fresh kernel (it was previously commented out).
pnbd = ParetoNBDModel(data=rfm_data)
pnbd.fit()

In [198]:
import pymc as pm
import xarray as xr

# Draw 100 prior predictive samples from the model graph.
with pnbd.model:
    # `draws` replaces the deprecated `samples` keyword of sample_prior_predictive;
    # the number of draws and the seed are unchanged.
    prior_idata = pm.sample_prior_predictive(draws=100, random_seed=45)

# obs_var must be obtained from prior_idata in case of an unfit model
obs_freq = prior_idata.observed_data["recency_frequency"].sel(obs_var="frequency")
# this is a redundant line when using arviz.hdi
# Mean simulated frequency per customer, averaged over all chains and draws.
ppc_freq = (
    prior_idata.prior_predictive["recency_frequency"]
    .sel(obs_var="frequency")
    .mean(("chain", "draw"))
)

# TODO: Resolve this merging. May need to rename a var somewhere
#xr.merge([obs_freq,ppc_freq],compat='override')

Sampling: [alpha, beta, r, recency_frequency, s]


In [157]:
# Tabulate how many customers share each observed frequency value, ordered by value.
(
    obs_freq
    .to_pandas()
    .value_counts()
    .sort_index()
)

0.0     1422
1.0      443
2.0      221
3.0       98
4.0       56
5.0       35
6.0       37
7.0       15
8.0        4
9.0        5
10.0       3
11.0       7
12.0       3
13.0       3
14.0       1
15.0       1
16.0       2
23.0       1
Name: count, dtype: int64

In [200]:
from arviz.stats import hdi

# 90% highest-density interval over the prior predictive group, restricted to the
# frequency component of the observed variable.
calc_hdi = hdi(
    prior_idata,
    group="prior_predictive",  # posterior
    hdi_prob=0.9,  # param
).sel(obs_var="frequency")

# Pull out the lower and upper bounds, flattening each Dataset into a squeezed array.
hdi_lo, hdi_hi = (
    calc_hdi.sel(hdi=bound).to_array().squeeze() for bound in ("lower", "higher")
)

# TODO: Join back to ppc_freq & obs var
hdi_hi

In [174]:
# Tabulate how many customers share each upper HDI bound, ordered by bound value.
(
    hdi_hi
    .to_pandas()
    .value_counts()
    .sort_index()
)

2.0       1
3.0      17
4.0     123
5.0     402
6.0     534
7.0     561
8.0     371
9.0     189
10.0     92
11.0     40
12.0     13
13.0     11
14.0      2
15.0      1
Name: count, dtype: int64

In [83]:
# Observed frequencies read from the model's own inference data.
# NOTE(review): this rebinds `obs_freq` and `ppc_freq` from the prior predictive cell,
# and needs `pnbd.idata` to be populated -- presumably by fitting the model first.
obs_freq = pnbd.idata.observed_data["recency_frequency"].sel(obs_var="frequency")

# 100 simulated recency/frequency samples for the customers in rfm_data, keeping only
# the frequency component.
ppc_freq = pnbd.distribution_new_customer_recency_frequency(
    rfm_data,
    n_samples=100,
    random_seed=42,
).sel(obs_var="frequency")

ppc_freq

Sampling: [recency_frequency]


Output()

### `expected_cumulative_transactions`

In [None]:
# Build the table of cumulative actual vs. predicted transactions, indexed by date period.
# NOTE(review): t evaluates to 175 while time_unit is "W" -- presumably t is counted in
# time_unit periods, so confirm that 175 weeks (rather than 25 weeks) is intended.
df_cum = utils._expected_cumulative_transactions(
    model=pnbd,
    transactions=raw_trans,
    customer_id_col="id",
    datetime_col="date",
    datetime_format="%Y%m%d",
    time_unit="W",
    t=25 * 7,
    set_index_date=True,
)

df_cum.info()

In [10]:
# Display the cumulative `actual` and `predicted` transaction counts per period
# (the notebook's rich display truncates the middle rows).
df_cum

Unnamed: 0,actual,predicted
1996-12-30/1997-01-05,0,4.232350
1997-01-06/1997-01-12,3,15.415502
1997-01-13/1997-01-19,17,33.593211
1997-01-20/1997-01-26,44,59.287376
1997-01-27/1997-02-02,67,92.716297
...,...,...
2000-04-03/2000-04-09,4004,6563.077204
2000-04-10/2000-04-16,4004,6586.673815
2000-04-17/2000-04-23,4004,6610.195558
2000-04-24/2000-04-30,4004,6633.643085
