In [1]:
import os
import random
import sqlite3
from datetime import datetime, timedelta, timezone

import pandas as pd

# --- Reproducibility ---------------------------------------------------------
# Seed the stdlib `random` module once, up front.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# --- Files -------------------------------------------------------------------
SRC_DB = "b2b_source.sqlite"
OUT_LOG = "weblogs_combined.log"

# --- Volume ------------------------------------------------------------------
# Big enough to show a "large-ish" transform + reporting workload.
N_LINES_INITIAL = 240_000
N_LINES_INCREMENTAL = 25_000

# --- Request vocabulary ------------------------------------------------------
PATHS = [
    "/",
    "/login",
    "/catalog",
    "/search?q=gloves",
    "/search?q=labels",
    "/cart",
    "/checkout",
    "/product/123",
    "/product/456",
    "/orders",
    "/account",
    "/help",
]
METHODS = ["GET", "POST"]

# (status code, weight) pairs; split below into the two parallel lists that
# random.choices() expects. The weights sum to 1.0.
# NOTE(review): 200 is listed four times (combined weight 0.90). Possibly these
# were meant to be distinct codes — confirm before collapsing them.
_STATUS_TABLE = [
    (200, 0.74),
    (200, 0.06),
    (200, 0.06),
    (200, 0.04),
    (302, 0.04),
    (404, 0.04),
    (500, 0.02),
]
STATUS_POOL = [code for code, _ in _STATUS_TABLE]
STATUS_W = [weight for _, weight in _STATUS_TABLE]

# User-agent strings sampled uniformly for each log line.
UA_POOL = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 16_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
]

def connect(db_path: str) -> sqlite3.Connection:
    """Open the SQLite database at ``db_path`` with foreign keys switched on.

    Returns:
        The open connection. Closing it is left to the caller.
    """
    connection = sqlite3.connect(db_path)
    # The pragma applies to this connection only, so it is set right after opening.
    connection.execute("PRAGMA foreign_keys=ON;")
    return connection


In [2]:
# Read the reference tables into DataFrames. The try/finally guarantees the
# database handle is released even when one of the queries raises (for example
# a missing table), so a failed cell run does not leave the file open.
src = connect(SRC_DB)
try:
    ip_geo = pd.read_sql_query("SELECT ip, country_code FROM ip_geo_map", src)
    # NOTE(review): `companies` is not referenced by the cells visible here —
    # confirm it is still needed before removing the query.
    companies = pd.read_sql_query("SELECT c.company_id, c.country_code FROM companies c", src)
finally:
    # Calling close() again on an already-closed sqlite3 connection is a no-op,
    # so a later src.close() in this cell stays harmless.
    src.close()

# Usernames are synthetic identifiers that look like platform user handles.
# NOTE(review): an earlier note said usernames are kept "country-ish" so that
# "country with most logins" stays stable, but this function only formats the
# number — confirm where (or whether) a country association is made.
def make_username(i: int) -> str:
    """Return the username for user number ``i``.

    The number is zero-padded to at least six digits, e.g. ``1`` becomes
    ``"user_000001"``.
    """
    return "user_{:06d}".format(i)

# A fixed pool of active users that log lines draw from, rather than one
# fresh user per line. Numbering starts at 1.
N_USERS = 18_000
usernames = list(map(make_username, range(1, N_USERS + 1)))

# Traffic mix: mostly logged-in requests, the rest anonymous ("-").
def pick_user() -> str:
    """Return a random entry of ``usernames`` with probability 0.68, else ``"-"``."""
    if random.random() < 0.68:
        return random.choice(usernames)
    return "-"

# Plain Python list of the `ip` column, sampled with random.choice() later on.
ips = ip_geo.loc[:, "ip"].tolist()

# Everything needed is in memory now; release the database handle.
src.close()

print("Loaded IPs:", len(ips), "Users:", len(usernames))


Loaded IPs: 557 Users: 18000


In [3]:
# English month abbreviations as used in Apache log timestamps. A fixed table
# is used instead of strftime("%b"), whose output follows the process locale.
_APACHE_MONTHS = (
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
)


def fmt_apache_time(dt: datetime) -> str:
    """Format ``dt`` as an Apache log timestamp, e.g. ``10/Oct/2000:13:55:36 +0000``.

    Args:
        dt: The moment to format. A naive datetime (no UTC offset) is treated
            as UTC; otherwise "%z" would render as an empty string and the
            result would end in a stray space with no offset.

    Returns:
        The timestamp text without the surrounding square brackets.
    """
    if dt.tzinfo is None or dt.utcoffset() is None:
        dt = dt.replace(tzinfo=timezone.utc)
    month = _APACHE_MONTHS[dt.month - 1]
    # The month text contains no "%" so it is safe to embed in the format string.
    return dt.strftime(f"%d/{month}/%Y:%H:%M:%S %z")

def weighted_status() -> int:
    """Draw one HTTP status code from ``STATUS_POOL`` using the weights in ``STATUS_W``."""
    (status,) = random.choices(STATUS_POOL, weights=STATUS_W, k=1)
    return status

def generate_lines(n_lines: int, start_dt: datetime, spread_days: int) -> list:
    """Generate ``n_lines`` synthetic access-log lines in Apache combined format.

    Every line gets a random IP from ``ips``, a user from ``pick_user()``, a
    timestamp at or before ``start_dt``, and a random method, path, status,
    response size, referer and user agent.

    Args:
        n_lines: Number of lines to produce. Zero or a negative value yields
            an empty list.
        start_dt: Latest possible timestamp; each line is dated backwards
            from it.
        spread_days: Each line is moved back by a whole number of days in
            ``[0, spread_days]`` plus 0..86399 seconds.

    Returns:
        A list of log-line strings without trailing newlines, in generation
        order (not sorted by timestamp).

    Raises:
        ValueError: If ``spread_days`` is negative and ``n_lines`` is positive
            (raised by ``random.randint``).
    """
    # Loop-invariant, so built once instead of on every iteration.
    referers = ["-", "https://google.com/", "https://partner.example.com/", "https://b2b.example.com/home"]
    proto = "HTTP/1.1"

    lines = []
    for _ in range(n_lines):
        ip = random.choice(ips)
        ident = "-"
        user = pick_user()

        # Spread over recent history. randint() is inclusive at both ends, so
        # the upper bound is 86_399 (last second of a day); 86_400 would push
        # a line a full extra day beyond spread_days.
        dt = start_dt - timedelta(days=random.randint(0, spread_days), seconds=random.randint(0, 86_399))
        ts = fmt_apache_time(dt)

        method = random.choice(METHODS)
        path = random.choice(PATHS)
        req = f'{method} {path} {proto}'

        status = weighted_status()
        # Server errors (5xx) get a small body; everything else a larger one.
        bytes_sent = random.randint(200, 50_000) if status < 500 else random.randint(0, 1500)

        referer = random.choice(referers)
        ua = random.choice(UA_POOL)

        # Combined log format:
        # %h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i"
        line = f'{ip} {ident} {user} [{ts}] "{req}" {status} {bytes_sent} "{referer}" "{ua}"'
        lines.append(line)
    return lines


# Initial load: lines dated backwards from the current UTC time, spread over
# the preceding 365 days.
now = datetime.now(timezone.utc)
initial_lines = generate_lines(N_LINES_INITIAL, start_dt=now, spread_days=365)

# Mode "w" truncates an existing file, so re-running this cell starts the log over.
with open(OUT_LOG, "w", encoding="utf-8") as f:
    f.writelines(f"{ln}\n" for ln in initial_lines)

print("Wrote log file:", OUT_LOG, "lines:", len(initial_lines))


Wrote log file: weblogs_combined.log lines: 240000


In [4]:
# Incremental batch: lines dated within the 20 days before the current UTC time.
incr_lines = generate_lines(N_LINES_INCREMENTAL, start_dt=datetime.now(timezone.utc), spread_days=20)

# Mode "a" appends, so each re-run of this cell adds another batch to the log.
with open(OUT_LOG, "a", encoding="utf-8") as f:
    f.writelines(f"{ln}\n" for ln in incr_lines)

print("Appended incremental lines:", len(incr_lines))


Appended incremental lines: 25000
