<a href="https://colab.research.google.com/github/ryan-miles/stellationharness/blob/main/SyntheticMarketingDataGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import random
import datetime
import uuid
import json
import csv
from io import StringIO # To build CSVs in memory

# --- Configuration ---
# Directory (relative to the working directory) that receives the generated files.
OUTPUT_DIR = "acme_co_synthetic_marketing_data"
# How many files a single run of the main script produces.
NUM_FILES_TO_CREATE = 1000

# Facts about the fictional company that every generator draws from.
COMPANY_INFO = dict(
    name="Acme Co.",
    tagline="Innovating for a Brighter Future.",
    pitch="Acme Co. delivers cutting-edge solutions for modern challenges.",
)

# Service catalogue keyed by a short service ID.
SERVICES = {
    "S1": dict(
        name="Strategic Consulting",
        desc="Expert advice to drive your business forward.",
    ),
    "S2": dict(
        name="Technology Solutions",
        desc="Implementing the latest tech for optimal performance.",
    ),
    "S3": dict(
        name="Customer Support",
        desc="Dedicated support to ensure your success.",
    ),
}

# Call-to-action lines the generators pick from at random.
CALLS_TO_ACTION = [
    "Learn More",
    "Contact Us for a Consultation",
    "Visit Our Website Today!",
    "Discover How Acme Co. Can Help You",
    "Explore Our Services",
    "Get Started with Acme Co.",
]

# Short slogans the generators pick from at random.
MARKETING_PHRASES = [
    "Unlock your potential.",
    "Driving innovation.",
    "Your success is our priority.",
    "Partner with the best.",
    "Future-proof your business.",
    "Solutions tailored for you.",
]

# Each extension is repeated `weight` times, so a uniform random.choice over
# this list selects formats in a 40/20/20/15/5 ratio (weighted towards txt).
FILE_FORMATS_WEIGHTED = [
    extension
    for extension, weight in (
        ("txt", 40),
        ("md", 20),
        ("json", 20),
        ("csv", 15),
        ("html", 5),
    )
    for _ in range(weight)
]

# --- Helper Functions ---

def get_random_date_string():
    """Return a random date near today formatted as ``YYYY-MM-DD``.

    The date is today's date shifted by a whole number of days drawn
    uniformly from -365 (a year ago) to +30 (a month ahead), inclusive.
    """
    shift = datetime.timedelta(days=random.randint(-365, 30))
    return (datetime.date.today() + shift).isoformat()

def generate_filename(file_format):
    """Build a randomized, loosely descriptive file name.

    The stem joins five underscore-separated parts: the lower-cased company
    name (spaces turned into underscores), a random theme, a random service
    ID or ``general``, a random date string, and the first six hex digits
    of a fresh UUID. ``file_format`` is appended as the extension.
    """
    themes = ["campaign_ideas", "service_info", "internal_memo", "web_content", "social_draft"]
    company_slug = COMPANY_INFO["name"].lower().replace(" ", "_")
    chosen_theme = random.choice(themes)
    subject = random.choice([*SERVICES.keys(), "general"])
    short_id = uuid.uuid4().hex[:6]
    date_part = get_random_date_string()
    stem = "_".join([company_slug, chosen_theme, subject, date_part, short_id])
    return f"{stem}.{file_format}"

def generate_text_content():
    """Compose a randomized plain-text marketing note.

    The note is a newline-joined sequence of: an optional company headline
    and pitch, zero or more service highlights (each optionally followed by
    a reworded description), one to three marketing phrases, an optional
    call to action, a generation timestamp, an internal reference, and an
    optional campaign idea.
    """
    # Applied in order to the lower-cased description to produce the
    # "we help you by ..." variant of each service blurb.
    rephrasings = (
        ('expert advice to', 'providing expert advice to'),
        ('implementing the latest tech for', 'ensuring'),
        ('dedicated support to ensure', 'guaranteeing'),
    )
    parts = []

    # Company intro (each piece included with its own probability).
    if random.random() < 0.8:
        parts.append(f"{COMPANY_INFO['name']} - {COMPANY_INFO['tagline']}")
    if random.random() < 0.6:
        parts.append(COMPANY_INFO['pitch'])

    # Service mentions: anywhere from none to all of the services.
    service_count = random.randint(0, len(SERVICES))
    for key in random.sample(list(SERVICES.keys()), service_count):
        info = SERVICES[key]
        parts.append(f"\nService Highlight: {info['name']} ({key})")
        parts.append(info['desc'])
        if random.random() >= 0.5:
            continue
        reworded = info['desc'].lower()
        for old, new in rephrasings:
            reworded = reworded.replace(old, new)
        parts.append(f"With {info['name']}, we help you by {reworded}")

    # Marketing phrases and call to action.
    phrase_count = random.randint(1, 3)
    parts.extend(random.choice(MARKETING_PHRASES) for _ in range(phrase_count))
    if random.random() < 0.9:
        parts.append(f"\nCTA: {random.choice(CALLS_TO_ACTION)}")

    # Filler metadata.
    parts.append(f"\nGenerated on: {datetime.datetime.now().isoformat()}")
    parts.append(f"Internal Ref: {uuid.uuid4().hex[:8]}")
    if random.random() < 0.2:
        campaign = random.choice(['SpringForward', 'TechRefresh2025', 'ClientSuccessInitiative'])
        parts.append(f"Campaign Idea: {campaign}")

    return "\n".join(parts)

def generate_md_content():
    """Compose a randomized Markdown document about the company.

    Layout: a title and italic tagline, an optional "About Us" section, an
    "Our Services" section describing one or more randomly chosen services,
    a closing section with one marketing phrase and one bold call to action,
    and a trailing italic document ID.
    """
    chunks = [
        f"# {COMPANY_INFO['name']}\n",
        f"*{COMPANY_INFO['tagline']}*\n\n",
    ]
    if random.random() < 0.7:
        chunks.append(f"## About Us\n{COMPANY_INFO['pitch']}\n\n")

    # At least one service is always requested.
    how_many = random.randint(1, len(SERVICES))
    picked = random.sample(list(SERVICES.keys()), how_many)
    if picked:
        chunks.append("## Our Services\n")
    for key in picked:
        entry = SERVICES[key]
        chunks.append(f"### {entry['name']} ({key})\n")
        chunks.append(f"> {entry['desc']}\n\n")

    closing_heading = random.choice(['Why Choose Us?', 'Learn More', 'Next Steps'])
    chunks.append(f"## {closing_heading}\n")
    chunks.append(f"* {random.choice(MARKETING_PHRASES)}\n")
    chunks.append(f"* **{random.choice(CALLS_TO_ACTION)}**\n")
    chunks.append(f"\n_Document ID: {uuid.uuid4().hex[:10]}_")
    return "".join(chunks)

def generate_json_content():
    """Generate one synthetic marketing document serialized as JSON.

    The document always carries a UUID, a random creation date, the company
    name, a main subject, a list of content snippets, a keyword list and a
    call to action (``None`` when omitted). The tagline and company pitch
    are added with fixed probabilities. One service, or none, is chosen as
    the focus; when present it contributes a ``service_description``
    snippet and two keywords. One to three ``marketing_blurb`` snippets are
    appended, each contributing the lower-cased first word of its own text
    as a keyword.

    Returns:
        str: The document encoded with ``json.dumps(..., indent=2)``.
    """
    doc = {
        "document_id": str(uuid.uuid4()),
        "creation_date": get_random_date_string(),
        "company_name": COMPANY_INFO["name"],
        "main_subject": "",
        "content_snippets": [],
        "keywords": [COMPANY_INFO["name"].lower()],
        "call_to_action": None
    }

    if random.random() < 0.5:
        doc["tagline"] = COMPANY_INFO["tagline"]
    if random.random() < 0.7:
        doc["company_pitch"] = COMPANY_INFO["pitch"]
        doc["keywords"].append("solutions")

    # None in the candidate list stands for "no particular service".
    service_focus = random.choice(list(SERVICES.keys()) + [None])
    if service_focus is not None:
        service = SERVICES[service_focus]
        doc["main_subject"] = f"Focus on {service['name']}"
        doc["content_snippets"].append({
            "type": "service_description",
            "service_id": service_focus,
            "name": service["name"],
            "details": service["desc"]
        })
        doc["keywords"].extend([service["name"].lower().replace(" ", "_"), service_focus])
    else:
        doc["main_subject"] = "General Company Information"

    for _ in range(random.randint(1, 3)):
        # Pick the phrase once and derive the keyword from that same phrase,
        # so every blurb keyword refers to text actually in the document.
        phrase = random.choice(MARKETING_PHRASES)
        doc["content_snippets"].append({"type": "marketing_blurb", "text": phrase})
        doc["keywords"].append(phrase.split(" ")[0].lower().replace(".", ""))

    if random.random() < 0.8:
        doc["call_to_action"] = random.choice(CALLS_TO_ACTION)

    # Drop repeated keywords (e.g. "solutions" from both the pitch and a
    # phrase); dict.fromkeys keeps the first occurrence of each in order.
    doc["keywords"] = list(dict.fromkeys(doc["keywords"]))

    return json.dumps(doc, indent=2)

def generate_csv_content():
    """Compose a small randomized CSV table and return it as a string.

    One of three layouts is chosen at random:

    * ``campaign_ideas``: 3 to 10 rows of campaign name, target service,
      proposed date, primary message and call to action.
    * ``service_keywords``: one row per service with three keywords and a
      related phrase.
    * ``contact_points``: 3 to 15 fictional inquiry rows.

    The header row is always written first, followed by the data rows.
    """
    buffer = StringIO()
    out = csv.writer(buffer)
    service_ids = list(SERVICES.keys())

    layout = random.choice(["campaign_ideas", "service_keywords", "contact_points"])

    if layout == "campaign_ideas":
        out.writerow(["CampaignName", "TargetService", "ProposedDate", "PrimaryMessage", "CTA"])
        for _ in range(random.randint(3, 10)):
            sid = random.choice(service_ids)
            campaign_name = f"{sid}_Campaign_{uuid.uuid4().hex[:4]}"
            proposed = get_random_date_string()
            message = random.choice(MARKETING_PHRASES)
            cta = random.choice(CALLS_TO_ACTION)
            out.writerow([campaign_name, SERVICES[sid]["name"], proposed, message, cta])
    elif layout == "service_keywords":
        out.writerow(["ServiceID", "ServiceName", "Keyword1", "Keyword2", "Keyword3", "RelatedPhrase"])
        company_keyword = COMPANY_INFO["name"].lower().replace(" ", "")
        for sid, info in SERVICES.items():
            first_word = info["name"].split(" ")[0].lower()
            generic = random.choice(["solutions", "expert", "support", "technology", "future"])
            # Description text up to (not including) its first period.
            related = info["desc"].split(".")[0]
            out.writerow([sid, info["name"], first_word, generic, company_keyword, related])
    elif layout == "contact_points":
        # Purely fictional rows; only the structure matters.
        out.writerow(["ContactID", "Date", "InquiryType", "ServiceOfInterest", "Notes"])
        for _ in range(random.randint(3, 15)):
            interest = random.choice(service_ids + ["General"])
            contact_id = uuid.uuid4().hex[:8]
            contact_date = get_random_date_string()
            channel = random.choice(["WebForm", "Email", "Call"])
            if interest != 'General':
                topic = interest
            else:
                topic = 'our services'
            out.writerow([contact_id, contact_date, channel, interest, f"Client interested in {topic}."])

    return buffer.getvalue()

def generate_html_content():
    """Compose a minimal randomized HTML page about the company.

    The page has a title with a random suffix, the company heading, tagline
    and pitch, a bullet list of one or more randomly chosen services, a
    random call-to-action heading, a random marketing phrase, a placeholder
    link, and a footer carrying the generation timestamp.
    """
    page_title = f"{COMPANY_INFO['name']} - {random.choice(['Welcome', 'Our Services', 'Contact Us'])}"

    fragments = [
        f"<h1>{COMPANY_INFO['name']}</h1>\n",
        f"<p><em>{COMPANY_INFO['tagline']}</em></p>\n",
        f"<p>{COMPANY_INFO['pitch']}</p>\n",
        "<h2>Our Services</h2>\n<ul>\n",
    ]

    catalogue = list(SERVICES.items())
    shown = random.sample(catalogue, random.randint(1, len(SERVICES)))
    fragments.extend(
        f"  <li><strong>{svc['name']} ({key}):</strong> {svc['desc']}</li>\n"
        for key, svc in shown
    )
    fragments.append("</ul>\n")

    fragments.append(f"<h2>{random.choice(CALLS_TO_ACTION)}</h2>\n")
    fragments.append(f"<p>{random.choice(MARKETING_PHRASES)}</p>\n")
    fragments.append("<p><a href='#'>Learn More Link (Placeholder)</a></p>\n")
    fragments.append(f"\n<hr><p><small>Page generated: {datetime.datetime.now().isoformat()}</small></p>")
    page_body = "".join(fragments)

    return (
        "<!DOCTYPE html>\n<html>\n<head>\n"
        f"  <title>{page_title}</title>\n"
        "</head>\n<body>\n"
        f"{page_body}\n"
        "</body>\n</html>"
    )

# --- Main Script ---
if __name__ == "__main__":
    # Entry point: create OUTPUT_DIR if needed, then write
    # NUM_FILES_TO_CREATE files whose format is drawn from
    # FILE_FORMATS_WEIGHTED and whose content comes from the matching
    # generator. Progress is printed every 100 files.

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Generating {NUM_FILES_TO_CREATE} files in '{OUTPUT_DIR}'...")

    # One content generator per supported file extension.
    generators = {
        "txt": generate_text_content,
        "md": generate_md_content,
        "json": generate_json_content,
        "csv": generate_csv_content,
        "html": generate_html_content,
    }
    files_written = 0

    for i in range(NUM_FILES_TO_CREATE):
        file_format = random.choice(FILE_FORMATS_WEIGHTED)
        filename = generate_filename(file_format)
        filepath = os.path.join(OUTPUT_DIR, filename)

        # A format without a registered generator yields an empty file.
        generator = generators.get(file_format)
        content = generator() if generator is not None else ""

        try:
            # newline="" writes the content's line endings untouched; without
            # it, Windows text mode would expand the csv module's "\r\n" row
            # terminators into "\r\r\n".
            with open(filepath, "w", encoding="utf-8", newline="") as f:
                f.write(content)
        except OSError as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error writing file {filepath}: {e}")
        else:
            files_written += 1

        if (i + 1) % 100 == 0:
            print(f"Generated {i+1}/{NUM_FILES_TO_CREATE} files...")

    # Report the number of files actually written, not the number attempted.
    print(f"Done! {files_written} files generated in '{OUTPUT_DIR}'.")

Generating 1000 files in 'acme_co_synthetic_marketing_data'...
Generated 100/1000 files...
Generated 200/1000 files...
Generated 300/1000 files...
Generated 400/1000 files...
Generated 500/1000 files...
Generated 600/1000 files...
Generated 700/1000 files...
Generated 800/1000 files...
Generated 900/1000 files...
Generated 1000/1000 files...
Done! 1000 files generated in 'acme_co_synthetic_marketing_data'.
