# Meridian Geo-Level MMM

In this post, we build a geo-level Marketing Mix Model (MMM) using `meridian` on a large-scale dataset to evaluate its performance in estimating true marketing contributions. We'll compare the model's inferred contributions against known ground truth, providing a realistic assessment of its accuracy. Additionally, we profile the model's runtime and memory usage to understand its scalability in production-like settings.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp
import arviz as az
import seaborn as sns
import matplotlib.pyplot as plt

import IPython

from meridian import constants
from meridian.data import load
from meridian.data import test_utils
from meridian.data import data_frame_input_data_builder
from meridian.model import model
from meridian.model import spec
from meridian.model import prior_distribution
from meridian.analysis import optimizer
from meridian.analysis import analyzer
from meridian.analysis import visualizer
from meridian.analysis import summarizer
from meridian.analysis import formatter

from pymc_marketing.paths import data_dir

import warnings
warnings.simplefilter("ignore")

In [2]:
# Derive a fixed integer seed from the character codes of the notebook tag.
seed: int = sum(ord(char) for char in "mmm_multidimensional")
# NumPy random generator initialised from that seed.
rng: np.random.Generator = np.random.default_rng(seed)

### The Data

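For now we use the small `mmm_example.csv` dataset shipped with `pymc_marketing`, duplicated into two geos. It is to be replaced with a larger dataset, generated by Luca.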

In [3]:
# Load the example CSV that ships with pymc_marketing, parsing the weekly date
# column and renaming it to "date".
data_path = data_dir / "mmm_example.csv"
raw_data_df = pd.read_csv(data_path, parse_dates=["date_week"])
raw_data_df = raw_data_df.rename(columns={"date_week": "date"})

# Build two geos from the same rows. `assign` returns a new frame, so
# `raw_data_df` itself is left untouched.
a_data_df = raw_data_df.assign(geo="geo_a")
b_data_df = raw_data_df.assign(geo="geo_b")

# Perturb the second geo's target with Gaussian noise (standard deviation 500)
# so the two geos are not exact duplicates.
geo_b_noise = 500 * rng.normal(size=len(b_data_df))
b_data_df["y"] = b_data_df["y"] + geo_b_noise

# Stack both geos into a single long-format frame.
data_df = pd.concat([a_data_df, b_data_df])

# Constant population so that all geos are scaled equally.
data_df["population"] = 1

# Follow Meridian's naming convention for the time column.
data_df = data_df.rename(columns={"date": "time"})

data_df.head()

Unnamed: 0,time,y,x1,x2,event_1,event_2,dayofyear,t,geo,population
0,2018-04-02,3984.662237,0.31858,0.0,0.0,0.0,92,0,geo_a,1
1,2018-04-09,3762.871794,0.112388,0.0,0.0,0.0,99,1,geo_a,1
2,2018-04-16,4466.967388,0.2924,0.0,0.0,0.0,106,2,geo_a,1
3,2018-04-23,3864.219373,0.071399,0.0,0.0,0.0,113,3,geo_a,1
4,2018-04-30,4441.625278,0.386745,0.0,0.0,0.0,120,4,geo_a,1


In [4]:
# Assemble the Meridian input data one component at a time. Each `with_*`
# result is re-bound to `builder`, mirroring a fluent call chain.
builder = data_frame_input_data_builder.DataFrameInputDataBuilder(
    kpi_type="revenue"
)
builder = builder.with_kpi(data_df, kpi_col="y")
builder = builder.with_population(data_df)
builder = builder.with_controls(data_df, control_cols=["event_1", "event_2"])
# The same columns (x1, x2) are passed as both media and media-spend columns.
builder = builder.with_media(
    data_df,
    media_cols=["x1", "x2"],
    media_spend_cols=["x1", "x2"],
    media_channels=["x1", "x2"],
)

data = builder.build()

### Prior Specification 

- Using spend shares as prior for beta parameters, independent across all geos.
- Hierarchical structure across the saturation parameters - Meridian does this by default.
- Setting knots to occur every 26 weeks, to best align with seasonality of order 2.

In [5]:
channel_columns = ["x1", "x2"]
n_channels = len(channel_columns)

# Total spend per geo (rows) and per channel (columns).
sum_spend_geo_channel = data_df.groupby("geo")[channel_columns].sum()

# Share of each geo's total spend that went to each channel.
# `div(..., axis=0)` aligns the per-geo totals with the rows. Dividing the raw
# (n_geo, n_channels) array by an (n_geo,) vector would instead broadcast the
# totals across columns, which is wrong whenever geos differ in spend and
# fails outright when n_geo != n_channels.
spend_share = sum_spend_geo_channel.div(
    sum_spend_geo_channel.sum(axis=1), axis=0
).to_numpy()

# Prior scale per geo and channel: spend share multiplied by the channel
# count, so each geo's row sums to `n_channels`.
prior_sigma = n_channels * spend_share

In [20]:
# Inspect the prior scales: one row per geo, one column per channel.
prior_sigma

array([[1.31263903, 0.68736097],
       [1.31263903, 0.68736097]])

In [6]:
# Number of time periods in the built input data.
n_time = len(data.time)
# One knot every 26 periods, starting at index 0 (~ seasonality of order 2).
knots = list(range(0, n_time, 26))

In [7]:
# NOTE: an unfinished per-channel ROI-prior template previously sat here. It
# was removed because it did not parse (`x1=,`), referenced the un-imported
# `input_data` module and the undefined `roi_m`, and the `prior` it assigned
# was immediately overwritten by the definition below.
# TODO: if per-channel priors are wanted, build them in channel order from the
# already-built `data` object rather than a placeholder InputData.

# Half-normal prior on the media coefficients, scaled by spend share.
# NOTE(review): `prior_sigma` has one row per geo. Confirm whether this should
# target constants.BETA_GM, or be reduced to one scale per channel for BETA_M.
prior = prior_distribution.PriorDistribution(
    beta_m=tfp.distributions.HalfNormal(
        prior_sigma,
        name=constants.BETA_M,  # or BETA_GM
    )
)

In [9]:
# Model specification for the geo-level fit.
# NOTE(review): a default PriorDistribution is passed here, so the custom
# `prior` built from spend shares earlier in the notebook is not used by this
# spec. Confirm whether that is intentional.
model_spec = spec.ModelSpec(
    # Priors and how the media priors are parameterised.
    prior=prior_distribution.PriorDistribution(),
    media_prior_type="coefficient",
    rf_prior_type="coefficient",
    # Media-effect structure.
    media_effects_dist="log_normal",
    hill_before_adstock=False,
    max_lag=8,
    # Geo-level noise and baseline settings.
    unique_sigma_for_each_geo=True,
    baseline_geo=None,
    # Time-varying intercept knots (defined in an earlier cell).
    knots=knots,
    # Optional calibration / holdout / scaling masks are left unset.
    roi_calibration_period=None,
    rf_roi_calibration_period=None,
    holdout_id=None,
    control_population_scaling_id=None,
)

In [10]:
# Instantiate the Meridian model from the built input data and the spec.
meridian = model.Meridian(input_data=data, model_spec=model_spec)

# Draw 1000 samples from the prior. The notebook-level `seed` is passed so
# that re-running the cell reproduces the same draws.
meridian.sample_prior(1000, seed=seed)

# TODO: enable posterior sampling once the prior checks look reasonable.
#meridian.sample_posterior(n_chains=4, n_adapt=1000, n_burnin=500, n_keep=1000, seed=seed)

I0000 00:00:1751361728.786180 9535117 service.cc:148] XLA service 0x60000141c200 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1751361728.786209 9535117 service.cc:156]   StreamExecutor device (0): Host, Default Version
I0000 00:00:1751361728.794673 9535117 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


In [19]:
# Display the model's `inference_data` attribute (prior draws were requested
# in the sampling cell above; posterior sampling is currently disabled).
meridian.inference_data