In [1]:
import random
import uuid
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from google.colab import files


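**Data Generator**

Defines `make_synthetic_email_data`, which builds a seeded table of synthetic email campaigns: subject, preheader, body text, send date, audience segment, and open/click/unsubscribe rates.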

In [2]:
def make_synthetic_email_data(n_rows: int = 780, seed: int = 42) -> pd.DataFrame:
    """Generate a reproducible table of synthetic email-campaign records.

    Each row combines a randomly chosen theme, tone, and audience segment into
    a subject line, preheader, and body text, and attaches noisy open, click,
    and unsubscribe rates whose means depend on the tone, theme, and segment.

    All randomness, including ``campaign_id``, comes from private generators
    seeded with ``seed``. Two calls with the same arguments therefore return
    equal frames, and the global ``random`` / ``np.random`` state of the caller
    is left untouched.

    Args:
        n_rows: Number of campaign rows to generate. Zero (or a negative
            value) yields an empty frame that still has all output columns.
        seed: Seed for both private random generators.

    Returns:
        A DataFrame with one row per campaign and the columns
        ``campaign_id``, ``send_date`` (ISO ``YYYY-MM-DD`` string),
        ``audience_segment``, ``tone_seed``, ``subject``, ``preheader``,
        ``body_text``, ``open_rate``, ``click_rate``, ``unsubscribe_rate``,
        and ``category_seed``.
    """
    # Private generators instead of random.seed()/np.random.seed(): same draw
    # sequences for a given seed, but no side effect on global RNG state.
    # RandomState (the legacy NumPy generator) is deliberate: it yields the
    # same normal draws that np.random.seed(seed) + np.random.normal would.
    py_rng = random.Random(seed)
    np_rng = np.random.RandomState(seed)

    # (theme name, keywords that can be substituted into the copy)
    themes = [
        ("AI marketing", ["gen ai", "automation", "llm", "productivity", "copilot"]),
        ("Measurement & analytics", ["ga4", "attribution", "incrementality", "kpi", "dashboards"]),
        ("Brand & creative", ["storytelling", "creative", "brand lift", "cannes", "craft"]),
        ("Retail & commerce", ["shopping", "retail", "ecommerce", "seasonal", "promotions"]),
        ("App growth", ["mobile", "app", "retention", "lifecycle", "engagement"]),
        ("Search & performance", ["search", "bidding", "pmax", "roi", "conversion"]),
    ]

    segments = ["US", "EMEA", "APAC", "SMB", "Enterprise"]
    tones = ["informational", "thought_leadership", "promotional", "how_to", "case_study"]

    # Earliest possible send date; each row adds 0-900 days (inclusive) to it.
    base_date = datetime(2023, 1, 1)

    subject_templates = {
        "informational": [
            "New insights on {k1}",
            "What’s changing in {k1}",
            "Key trends in {k1} you should know",
        ],
        "thought_leadership": [
            "Why {k1} matters in 2025",
            "A new perspective on {k1}",
            "The future of {k1}",
        ],
        "promotional": [
            "Unlock better {k1} results",
            "Get the {k1} playbook",
            "Don’t miss our latest on {k1}",
        ],
        "how_to": [
            "How to improve {k1} in 5 steps",
            "A practical guide to {k1}",
            "Checklist: optimizing your {k1}",
        ],
        "case_study": [
            "Case study: winning with {k1}",
            "How teams improved {k1}",
            "Inside a successful {k1} strategy",
        ],
    }

    # Mean open / click rate per tone, before theme and segment adjustments.
    # Defined once here rather than rebuilt on every loop iteration.
    base_open_by_tone = {
        "informational": 0.27,
        "thought_leadership": 0.25,
        "promotional": 0.31,
        "how_to": 0.29,
        "case_study": 0.26,
    }
    base_click_by_tone = {
        "informational": 0.035,
        "thought_leadership": 0.028,
        "promotional": 0.030,
        "how_to": 0.042,
        "case_study": 0.033,
    }

    # Fixed output schema, so an empty result still carries every column.
    columns = [
        "campaign_id",
        "send_date",
        "audience_segment",
        "tone_seed",
        "subject",
        "preheader",
        "body_text",
        "open_rate",
        "click_rate",
        "unsubscribe_rate",
        "category_seed",
    ]

    rows = []
    for _ in range(n_rows):
        theme_name, keywords = py_rng.choice(themes)
        tone = py_rng.choice(tones)
        segment = py_rng.choice(segments)

        # Two independent picks from the same list, so k2 may equal k1.
        k1 = py_rng.choice(keywords)
        k2 = py_rng.choice(keywords)

        subject = py_rng.choice(subject_templates[tone]).format(k1=k1.upper())
        preheader = f"{theme_name} insights for {segment} teams"

        body_text = (
            f"This email explores {theme_name} with a focus on {k1} and {k2}. "
            f"It highlights what’s changing, why it matters, and how teams are applying these ideas. "
            f"Recommended for {segment} audiences looking to improve marketing effectiveness."
        )

        # randint is inclusive on both ends: roughly 2.5 years of send dates.
        send_date = base_date + timedelta(days=py_rng.randint(0, 900))

        base_open = base_open_by_tone[tone]
        base_click = base_click_by_tone[tone]

        if theme_name in ["AI marketing", "Measurement & analytics"]:
            base_click += 0.005

        if segment == "Enterprise":
            base_open -= 0.01
            base_click += 0.002
        elif segment == "SMB":
            base_open += 0.005

        # Gaussian noise around each mean, clipped to a fixed plausible range.
        open_rate = float(np.clip(np_rng.normal(base_open, 0.03), 0.05, 0.60))
        click_rate = float(np.clip(np_rng.normal(base_click, 0.012), 0.002, 0.20))
        unsubscribe_rate = float(np.clip(np_rng.normal(0.0025, 0.001), 0.0001, 0.02))

        rows.append({
            "send_date": send_date.date().isoformat(),
            "audience_segment": segment,
            "tone_seed": tone,
            "subject": subject,
            "preheader": preheader,
            "body_text": body_text,
            "open_rate": round(open_rate, 4),
            "click_rate": round(click_rate, 4),
            "unsubscribe_rate": round(unsubscribe_rate, 5),
            "category_seed": theme_name,  # theme the row's copy was built from
        })

    # IDs are drawn from the seeded generator (uuid.uuid4() would ignore the
    # seed) and only after the loop, so the draws that feed the other columns
    # are not shifted. The `seen_ids` check guarantees uniqueness of the
    # truncated 10-hex-digit IDs.
    seen_ids = set()
    for row in rows:
        while True:
            token = uuid.UUID(int=py_rng.getrandbits(128), version=4).hex[:10]
            campaign_id = f"cmp_{token}"
            if campaign_id not in seen_ids:
                break
        seen_ids.add(campaign_id)
        row["campaign_id"] = campaign_id

    # `columns` fixes the column order (campaign_id first) and the empty case.
    return pd.DataFrame(rows, columns=columns)

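**Generate, Save, and Download**

Generate the dataset and preview it, then write it to CSV and download the file from the Colab runtime.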

In [3]:
# Size and seed of the generated dataset, named so the check below can refer to them.
N_ROWS = 780
SEED = 42

df = make_synthetic_email_data(n_rows=N_ROWS, seed=SEED)

# Fail fast if the generated table does not have the expected basic shape.
assert len(df) == N_ROWS, f"expected {N_ROWS} rows, got {len(df)}"
assert df["campaign_id"].is_unique, "campaign_id values must be unique"
assert not df.isna().any().any(), "generated data should contain no missing values"

print(df.shape)
df.head()

(780, 11)


Unnamed: 0,campaign_id,send_date,audience_segment,tone_seed,subject,preheader,body_text,open_rate,click_rate,unsubscribe_rate,category_seed
0,cmp_b7f4ccc86c,2023-05-23,US,informational,New insights on PMAX,Search & performance insights for US teams,This email explores Search & performance with ...,0.2849,0.0333,0.00315,Search & performance
1,cmp_f57b010b9c,2023-02-02,Enterprise,informational,What’s changing in SEARCH,Search & performance insights for Enterprise t...,This email explores Search & performance with ...,0.3057,0.0342,0.00227,Search & performance
2,cmp_92d1ba18e7,2023-01-28,EMEA,informational,Key trends in AUTOMATION you should know,AI marketing insights for EMEA teams,This email explores AI marketing with a focus ...,0.3174,0.0492,0.00203,AI marketing
3,cmp_ced176138c,2024-08-26,Enterprise,thought_leadership,A new perspective on LIFECYCLE,App growth insights for Enterprise teams,This email explores App growth with a focus on...,0.2563,0.0244,0.00203,App growth
4,cmp_583bd98902,2023-06-09,EMEA,informational,What’s changing in CANNES,Brand & creative insights for EMEA teams,This email explores Brand & creative with a fo...,0.2773,0.012,0.00078,Brand & creative


In [4]:
# Derive the filename from the actual row count so it cannot drift from the
# data if the generator is re-run with a different n_rows.
out_csv = f"email_campaigns_synth_{len(df)}.csv"
df.to_csv(out_csv, index=False)  # index=False: do not write the row index as a column

# `files` is imported from google.colab in the notebook's import cell, so it is
# not re-imported here. files.download is called to fetch the CSV from the
# Colab runtime.
files.download(out_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>