# In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# ---------------------------------------------------------------------------
# Synthetic website-traffic dataset
#
# Builds one row per session from seeded pseudo-random draws, orders the rows
# chronologically and writes them to 'website_traffic_dataset.csv'.
# ---------------------------------------------------------------------------

# Seed both generators so repeated runs produce the same draws. The stdlib
# `random` stream and NumPy's global stream are independent of each other,
# but the order of draws *within* each stream determines the output, so the
# statements below must not be reordered within a stream.
random.seed(42)
np.random.seed(42)

num_rows = 20000


def _random_timestamp(base):
    """Return `base` shifted by a random number of whole days, hours and minutes."""
    # Three draws per timestamp, always in the order days -> hours -> minutes.
    day_offset = random.randint(0, 365)
    hour_offset = random.randint(0, 23)
    minute_offset = random.randint(0, 59)
    return base + timedelta(
        days=day_offset, hours=hour_offset, minutes=minute_offset
    )


# One unique identifier per session (no randomness involved).
session_ids = [f'sess_{i}' for i in range(num_rows)]

# --- stdlib `random` stream -------------------------------------------------

# User identifiers are drawn from 5000 possible values, so the same user can
# appear in several sessions.
user_ids = []
for _ in range(num_rows):
    user_number = random.randint(1, 5000)
    user_ids.append(f'user_{user_number}')

# Timestamps: day offsets 0..365 (inclusive) from 1 Jan 2024, at minute
# resolution.
start_date = datetime(2024, 1, 1)
dates = [_random_timestamp(start_date) for _ in range(num_rows)]

# Acquisition channel, sampled with the relative weights listed alongside.
source_names = ['organic', 'direct', 'paid', 'social', 'referral']
source_weights = [0.4, 0.2, 0.15, 0.15, 0.1]
traffic_sources = random.choices(source_names, weights=source_weights, k=num_rows)

# Device category, likewise weighted.
device_names = ['desktop', 'mobile', 'tablet']
device_weights = [0.5, 0.4, 0.1]
devices = random.choices(device_names, weights=device_weights, k=num_rows)

# --- NumPy global stream ----------------------------------------------------

# Pageviews: Poisson(5) shifted up by one so every session has at least one.
pageviews = 1 + np.random.poisson(5, num_rows)

# Session length in seconds: exponential with a mean of 300 s.
session_durations = np.random.exponential(300, num_rows)

# Bounce flag: single Bernoulli trial per session, p = 0.4.
bounces = np.random.binomial(1, 0.4, num_rows)

# Conversion flag: single Bernoulli trial per session, p = 0.05.
conversions = np.random.binomial(1, 0.05, num_rows)

# Number of events in the session: Poisson with mean 3.
events_counts = np.random.poisson(3, num_rows)

# --- Assemble, order and persist -------------------------------------------

# Column order in the output follows the insertion order of this mapping.
columns = {
    'session_id': session_ids,
    'user_id': user_ids,
    'timestamp': dates,
    'traffic_source': traffic_sources,
    'device': devices,
    'pageviews': pageviews,
    'session_duration_seconds': session_durations,
    'bounce': bounces,
    'conversion': conversions,
    'events_count': events_counts,
}
df = pd.DataFrame(columns)

# Put rows in chronological order, then renumber the index from zero.
df = df.sort_values(by='timestamp')
df = df.reset_index(drop=True)

# Write the table without the index column.
df.to_csv('website_traffic_dataset.csv', index=False)