In [1]:
from pymc_marketing.clv import utils

import pandas as pd

Create a simple dataset for testing:

In [2]:
d = [
    [1, "2015-01-01", 1],
    [1, "2015-02-06", 2],
    [2, "2015-01-01", 2],
    [3, "2015-01-01", 3],
    [3, "2015-01-02", 1],
    [3, "2015-01-05", 5],
    [4, "2015-01-16", 6],
    [4, "2015-02-02", 3],
    [4, "2015-02-05", 3],
    [4, "2015-02-05", 2],
    [5, "2015-01-16", 3],
    [5, "2015-01-17", 1],
    [5, "2015-01-18", 8],
    [6, "2015-02-02", 5],
]
test_data = pd.DataFrame(d, columns=["id", "date", "monetary_value"])

Note customer 4 made two purchases on 2015-02-05. 

`_find_first_transactions` flags the first purchase each customer has made, which must be excluded for modeling. It is called internally by `rfm_summary`.

In [3]:
utils._find_first_transactions(
    transactions=test_data, 
    customer_id_col = "id", 
    datetime_col = "date",
    #monetary_value_col = "monetary_value", 
    #datetime_format = "%Y%m%d",
).reindex()

Unnamed: 0,id,date,first
0,1,2015-01-01,True
1,1,2015-02-06,False
2,2,2015-01-01,True
3,3,2015-01-01,True
4,3,2015-01-02,False
5,3,2015-01-05,False
6,4,2015-01-16,True
7,4,2015-02-02,False
8,4,2015-02-05,False
10,5,2015-01-16,True


Notice how **9** is missing from the dataframe index. Multiple transactions in the same time period are treated as a single purchase, so the indices for those additional transactions are skipped. 

`rfm_summary` is the primary data preprocessing step for CLV modeling in the continuous, non-contractual domain:

In [4]:
rfm_df = utils.rfm_summary(
    test_data, 
    customer_id_col = "id", 
    datetime_col = "date", 
    monetary_value_col = "monetary_value",
    observation_period_end = "2015-02-06",
    datetime_format = "%Y-%m-%d",
    time_unit = "W",
    include_first_transaction=True,
)

rfm_df.head()

Unnamed: 0,customer_id,frequency,recency,monetary_value
0,1,2.0,0.0,1.5
1,2,1.0,5.0,2.0
2,3,2.0,4.0,4.5
3,4,2.0,0.0,7.0
4,5,1.0,3.0,12.0


For MAP fits and covariate models, `rfm_train_test_split` can be used to evaluate models on unseen data. It is also useful for identifying the impact of a time-based event like a marketing campaign.

In [5]:
train_test = utils.rfm_train_test_split(
    test_data, 
    customer_id_col = "id", 
    datetime_col = "date", 
    train_period_end = "2015-02-01",
    monetary_value_col = "monetary_value",
)

train_test.head()

Unnamed: 0,customer_id,frequency,recency,T,monetary_value,test_frequency,test_monetary_value,test_T
0,1,0.0,0.0,31.0,0.0,1.0,2.0,5.0
1,2,0.0,0.0,31.0,0.0,0.0,0.0,5.0
2,3,2.0,4.0,31.0,3.0,0.0,0.0,5.0
3,4,0.0,0.0,16.0,0.0,2.0,4.0,5.0
4,5,2.0,2.0,16.0,4.5,0.0,0.0,5.0


`rfm_segments` will assign customer to segments based on their recency, frequency, and monetary value. It uses a quartile-based RFM score approach that is very computationally efficient, but defining custom segments is a rather subjective exercise. The returned dataframe also cannot be used for modeling because it does not zero out the initial transactions.

In [6]:
segments = utils.rfm_segments(
    test_data, 
    customer_id_col = "id", 
    datetime_col = "date", 
    monetary_value_col = "monetary_value",
    observation_period_end = "2015-02-06",
    datetime_format = "%Y-%m-%d",
    time_unit = "W",
)



In [7]:
segments

Unnamed: 0,customer_id,frequency,recency,monetary_value,rfm_score,segment
0,1,2.0,0.0,1.5,321,Other
1,2,1.0,5.0,2.0,111,Inactive Customer
2,3,2.0,4.0,4.5,122,At Risk Customer
3,4,2.0,0.0,7.0,324,Top Spender
4,5,1.0,3.0,12.0,214,At Risk Customer
5,6,1.0,0.0,5.0,313,Top Spender


`_expected_cumulative_transactions` is a utility function for cumulative plots over time, a useful model evaluation technique

In [19]:
from lifetimes import BetaGeoFitter
from lifetimes.datasets import load_transaction_data
from lifetimes.utils import expected_cumulative_transactions

data = load_transaction_data()
rfm_data = utils.rfm_summary(
    data, 
    customer_id_col = "id", 
    datetime_col = "date", 
    #datetime_format = "%Y-%m-%d",
    time_unit = "W",
)

bg_lt = BetaGeoFitter()
bg_lt = bg_lt.fit(
    rfm_data["frequency"],
    rfm_data["recency"],
    rfm_data["T"],
)

expected_cumulative_transactions(
    bg_lt,
    data,
    "id",
    "date",
    52,
    freq="W",
).head(10)

Unnamed: 0,actual,predicted
0,0,59.837794
1,0,101.193989
2,0,133.137625
3,0,159.250062
4,0,181.360674
5,0,200.542027
6,0,217.481015
7,0,232.645705
8,0,246.370154
9,0,258.901499


In [25]:
from lifetimes import utils, BetaGeoFitter, ParetoNBDFitter
from lifetimes.datasets import load_dataset

def cdnow_transactions():
    transactions = load_dataset("CDNOW_sample.txt", header=None, sep=r"\s+")
    transactions.columns = ["id_total", "id_sample", "date", "num_cd_purc", "total_value"]
    return transactions[["id_sample", "date"]]

def df_cum_transactions(cdnow_transactions):
    datetime_col = "date"
    customer_id_col = "id_sample"
    t = 25 * 7
    datetime_format = "%Y%m%d"
    freq = "D"
    observation_period_end = "19970930"
    freq_multiplier = 7

    transactions_summary = utils.summary_data_from_transaction_data(
        cdnow_transactions,
        customer_id_col,
        datetime_col,
        datetime_format=datetime_format,
        freq=freq,
        freq_multiplier=freq_multiplier,
        observation_period_end=observation_period_end,
    )

    transactions_summary = transactions_summary.reset_index()

    model = ParetoNBDFitter()
    model.fit(transactions_summary["frequency"], transactions_summary["recency"], transactions_summary["T"])

    df_cum = utils.expected_cumulative_transactions(
        model,
        cdnow_transactions,
        datetime_col,
        customer_id_col,
        t,
        datetime_format,
        freq,
        set_index_date=False,
        freq_multiplier=freq_multiplier,
    )
    return df_cum

trans_df = cdnow_transactions()

df_cum_transactions(trans_df).to_csv("clv_cumulative_transactions.csv",index=False)

In [23]:
pd.read_csv("cum_trans.csv")

Unnamed: 0,actual,predicted
0,0,4.213966
1,19,16.564562
2,42,37.203477
3,81,66.701849
4,119,105.361838
5,192,153.689679
6,261,210.346282
7,351,275.339943
8,428,348.972129
9,504,430.959998
