In [2]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(42)

# Define time range (5 years of monthly data)
n_months = 60
dates = [datetime(2019, 1, 1) + timedelta(days=30 * i) for i in range(n_months)]

# Define base spend per channel (ensuring Video has highest ROI, Affiliates lowest)
base_spend = {
    "Video": 50000,
    "TV": 40000,
    "SEM": 30000,
    "Radio": 20000,
    "Affiliates": 10000
}

# Generate seasonality pattern (sinusoidal)
seasonality = 1 + 0.3 * np.sin(np.linspace(0, 2 * np.pi, n_months))

# Generate baseline revenue (at least 70% of total revenue)
baseline_revenue = np.random.normal(200000, 30000, n_months) * seasonality

# Generate data
data = {
    "Date": dates,
    "Competitor_Sales": np.random.normal(200000, 30000, n_months),  # Random competitor impact
    "Discounts": np.random.uniform(5000, 15000, n_months)  # Random discounts
}

# Generate spend, impressions, reach & frequency
marketing_contribution = np.zeros(n_months)
for channel, spend in base_spend.items():
    data[f"Spend_{channel}"] = spend * (0.8 + 0.4 * np.random.rand(n_months)) * seasonality  # Adding seasonality
    data[f"Impressions_{channel}"] = data[f"Spend_{channel}"] * np.random.uniform(20, 30)  # Impressions scale with spend

    # Marketing impact (ensuring total contribution is at most 30% of revenue)
    roi = {"Video": 3.0, "TV": 2.5, "SEM": 2.0, "Radio": 1.5, "Affiliates": 1.2}[channel]
    marketing_contribution += data[f"Spend_{channel}"] * roi * np.random.uniform(0.9, 1.1)

# Ensure baseline revenue is at least 70% of total revenue
total_revenue = baseline_revenue + marketing_contribution
data["Revenue"] = np.maximum(total_revenue, baseline_revenue * 1.3)  # Ensure marketing impact does not overpower baseline

# Generate conversions and revenue per conversion
data["Conversions"] = data["Revenue"] / np.random.uniform(200, 300, n_months)
data["Revenue_per_Conversion"] = data["Revenue"] / data["Conversions"]

# Pick one channel to have Reach & Frequency
selected_channel = "TV"
data[f"Reach_{selected_channel}"] = data[f"Impressions_{selected_channel}"] * np.random.uniform(0.1, 0.3)
data[f"Frequency_{selected_channel}"] = np.random.uniform(2, 5, n_months)

# Convert to DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv("mmm_synthetic_data.csv", index=False)

# Display first few rows
df.head()


Unnamed: 0,Date,Competitor_Sales,Discounts,Spend_Video,Impressions_Video,Spend_TV,Impressions_TV,Spend_SEM,Impressions_SEM,Spend_Radio,Impressions_Radio,Spend_Affiliates,Impressions_Affiliates,Revenue,Conversions,Revenue_per_Conversion,Reach_TV,Frequency_TV
0,2019-01-01,185624.772865,7376.37544,52992.657981,1431844.0,46240.085469,1152125.0,26078.422439,653330.296848,18560.396808,549790.979142,9142.848345,220783.284907,591451.494004,2876.184311,205.63755,144717.501408,2.455708
1,2019-01-31,194430.2307,12282.163486,58801.591536,1588800.0,38600.788052,961783.4,30137.549062,755021.662876,23902.846444,708043.555672,11840.292593,285921.694671,597659.98094,2086.275396,286.472238,120808.807825,2.416482
2,2019-03-02,166809.95078,8677.831327,56522.896961,1527230.0,40419.680637,1007103.0,30607.264818,766789.227983,20325.699156,602082.280685,9458.418781,228403.741331,619231.596298,2201.398465,281.290101,126501.392247,3.922624
3,2019-04-01,164113.801278,11323.058306,56206.047596,1518669.0,36660.531658,913439.6,34347.880529,860501.091843,17602.438679,521414.606412,12969.705188,313194.970285,652381.253657,2174.808848,299.971767,114736.391338,2.54564
4,2019-05-01,224375.774672,11335.297108,47064.286919,1271661.0,46366.290764,1155270.0,35540.993916,890391.593279,26124.34998,773848.323577,9046.353997,218453.120572,606036.224898,2022.387956,299.663684,145112.485864,3.037002


In [5]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(42)

# Define base spend per channel (ensuring Video has highest ROI, Affiliates lowest)
base_spend = {
    "Video": 50000,
    "TV": 40000,
    "SEM": 30000,
    "Radio": 20000,
    "Affiliates": 10000
}

# Generate seasonality pattern (sinusoidal)
seasonality = 1 + 0.3 * np.sin(np.linspace(0, 2 * np.pi, n_months))

# Generate baseline conversions (ensuring at least 80% contribution to total conversions)
baseline_conversions = np.random.normal(800, 100, n_months) * seasonality

# Generate data
data = {
    "Date": dates,
    "Competitor_Sales": np.random.normal(200000, 30000, n_months),  # Random competitor impact
    "Discounts": np.random.uniform(5000, 15000, n_months)  # Random discounts
}

# Generate spend, impressions, reach & frequency
marketing_contribution = np.zeros(n_months)
for channel, spend in base_spend.items():
    data[f"Spend_{channel}"] = spend * (0.8 + 0.4 * np.random.rand(n_months)) * seasonality  # Adding seasonality
    data[f"Impressions_{channel}"] = data[f"Spend_{channel}"] * np.random.uniform(20, 30)  # Impressions scale with spend

    # Marketing impact (ensuring total contribution is at most 20% of conversions)
    roi = {"Video": 3.0, "TV": 2.5, "SEM": 2.0, "Radio": 1.5, "Affiliates": 1.2}[channel]
    marketing_contribution += data[f"Spend_{channel}"] * roi * np.random.uniform(0.9, 1.1) / 1000  # Scale appropriately

# Ensure baseline conversions contribute at least 80% of total conversions
total_conversions = baseline_conversions + marketing_contribution
data["Conversions"] = np.maximum(total_conversions, baseline_conversions * 1.25)  # Adjust to maintain baseline dominance
baseline_adjustment = np.maximum(data["Conversions"] * 0.8, baseline_conversions)
data["Baseline_Conversions"] = baseline_adjustment  # Ensuring baseline remains dominant

# Generate revenue per conversion
data["Revenue_per_Conversion"] = np.random.uniform(200, 300, n_months)
data["Revenue"] = data["Conversions"] * data["Revenue_per_Conversion"]

# Pick one channel to have Reach & Frequency
selected_channel = "TV"
data[f"Reach_{selected_channel}"] = data[f"Impressions_{selected_channel}"] * np.random.uniform(0.1, 0.3)
data[f"Frequency_{selected_channel}"] = np.random.uniform(2, 5, n_months)

# Convert to DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv("mmm_synthetic_data.csv", index=False)

# Display first few rows
df.head()



Unnamed: 0,Date,Competitor_Sales,Discounts,Spend_Video,Impressions_Video,Spend_TV,Impressions_TV,Spend_SEM,Impressions_SEM,Spend_Radio,Impressions_Radio,Spend_Affiliates,Impressions_Affiliates,Conversions,Baseline_Conversions,Revenue_per_Conversion,Revenue,Reach_TV,Frequency_TV
0,2019-01-01,185624.772865,7376.37544,52992.657981,1431844.0,46240.085469,1152125.0,26078.422439,653330.296848,18560.396808,549790.979142,9142.848345,220783.284907,1226.221485,980.977188,205.63755,252157.181463,144717.501408,2.455708
1,2019-01-31,194430.2307,12282.163486,58801.591536,1588800.0,38600.788052,961783.4,30137.549062,755021.662876,23902.846444,708043.555672,11840.292593,285921.694671,1206.805679,965.444543,286.472238,345716.323182,120808.807825,2.416482
2,2019-03-02,166809.95078,8677.831327,56522.896961,1527230.0,40419.680637,1007103.0,30607.264818,766789.227983,20325.699156,602082.280685,9458.418781,228403.741331,1305.493778,1044.395023,281.290101,367222.476659,126501.392247,3.922624
3,2019-04-01,164113.801278,11323.058306,56206.047596,1518669.0,36660.531658,913439.6,34347.880529,860501.091843,17602.438679,521414.606412,12969.705188,313194.970285,1425.572464,1140.457971,299.971767,427631.491532,114736.391338,2.54564
4,2019-05-01,224375.774672,11335.297108,47064.286919,1271661.0,46366.290764,1155270.0,35540.993916,890391.593279,26124.34998,773848.323577,9046.353997,218453.120572,1261.991827,1009.593461,299.663684,378173.119544,145112.485864,3.037002
