In [9]:
import random
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

# Per-content-type profile: payload size (emitted as `data_mb` by
# simulate_request) and the baseline latency in milliseconds when the
# request is answered by the cache versus the origin.
# NOTE: key order matters -- generate_data pairs these keys with its
# sampling weights.
CONTENT = {
    'html': {
        'size': 0.5,
        'latency_cache': 20,
        'latency_origin': 80,
    },
    'video': {
        'size': 5,
        'latency_cache': 40,
        'latency_origin': 200,
    },
    'image': {
        'size': 2,
        'latency_cache': 25,
        'latency_origin': 100,
    },
}

# Geographic user clusters. Each value is a
# (center latitude, center longitude, radius) tuple, all in degrees.
# NOTE: insertion order matters -- generate_user_location indexes the
# values by `user_id % number_of_clusters`.
USER_CLUSTERS = {
    'Budapest': (
        47.4979, 19.0402, 0.05,
    ),
    'Debrecen': (
        47.5316, 21.6273, 0.05,
    ),
    'Szeged': (
        46.2530, 20.1414, 0.05,
    ),
    'Miskolc': (
        48.1030, 20.7784, 0.05,
    ),
}

def generate_user_location(user_id):
    """Return a random ``"lat,lon"`` string near the cluster of *user_id*.

    The cluster is picked deterministically: the values of USER_CLUSTERS
    are indexed by ``user_id`` modulo the number of clusters. Latitude
    and longitude are then each shifted by an independent uniform offset
    of at most the cluster radius (degrees), so points land in a square
    box around the center. Both coordinates are formatted with five
    decimal places.
    """
    cluster_specs = list(USER_CLUSTERS.values())
    slot = user_id % len(cluster_specs)
    lat0, lon0, spread = cluster_specs[slot]

    def scatter(center):
        # One uniform draw per coordinate, within +/- spread of the center.
        return center + random.uniform(-spread, spread)

    # Latitude is drawn before longitude.
    lat = scatter(lat0)
    lon = scatter(lon0)
    return f"{lat:.5f},{lon:.5f}"

def get_request_intensity(hour):
    """Return the traffic multiplier for the given hour of day.

    * 1.5 for the peak windows, hours 8 through 10 and 18 through 20
      (both bounds inclusive),
    * 0.5 for the quiet window, hours 0 through 5 (inclusive),
    * 1.0 for every other value.
    """
    morning_peak = 8 <= hour <= 10
    evening_peak = 18 <= hour <= 20
    if morning_peak or evening_peak:
        return 1.5  # 50% more requests

    overnight = 0 <= hour <= 5
    if overnight:
        return 0.5  # 50% fewer requests

    return 1.0  # normal

def simulate_request(user_id, content_type, timestamp, *,
                     hit_rate=0.7, jitter_ms=10):
    """Simulate a single CDN request and return it as a flat record.

    Args:
        user_id: Integer user id. It is stored in the record and passed
            to generate_user_location to pick the user's cluster.
        content_type: A key of CONTENT (e.g. ``'html'``).
        timestamp: Stored in the record unchanged.
        hit_rate: Probability that the request is served from the cache.
            Defaults to 0.7.
        jitter_ms: Half-width, in milliseconds, of the uniform network
            variability added to the base latency. Defaults to 10.

    Returns:
        A dict with the keys ``timestamp``, ``user_id``, ``content_type``,
        ``is_cache_hit``, ``cache_status`` (``'cache'`` or ``'origin'``),
        ``latency_ms``, ``data_mb`` and ``user_location``.

    Raises:
        KeyError: If *content_type* is not a key of CONTENT.
    """
    # Look the profile up once; an unknown content type fails here.
    profile = CONTENT[content_type]

    # Simulate cache hit probability
    hit = random.random() < hit_rate
    source = 'cache' if hit else 'origin'

    # Base latency plus random network variability of +/- jitter_ms,
    # clamped so that a large jitter can never yield a negative latency.
    base_latency = profile[f'latency_{source}']
    latency = max(base_latency + random.uniform(-jitter_ms, jitter_ms), 0)

    # A fresh location is drawn on every call, so the same user_id can
    # report different coordinates across requests.
    user_location = generate_user_location(user_id)

    return {
        'timestamp': timestamp,
        'user_id': user_id,
        'content_type': content_type,
        'is_cache_hit': hit,
        'cache_status': source,
        'latency_ms': latency,
        'data_mb': profile['size'],
        'user_location': user_location,
    }

def generate_data(n_users=100, duration_secs=3600, start_time=None):
    """Generate simulated CDN request records, one batch per second.

    For every second in the simulated window, the hour of day selects a
    traffic multiplier via get_request_intensity, and
    ``int(n_users * multiplier)`` requests are simulated, one per user id
    in ``range(that count)``. Note that a multiplier above 1 therefore
    produces user ids greater than or equal to *n_users*.

    Args:
        n_users: Baseline number of requests per second.
        duration_secs: Length of the simulated window in seconds.
        start_time: ``datetime`` at which the window starts. Defaults to
            ``datetime.now()`` at call time; pass an explicit value to
            simulate a particular time of day.

    Returns:
        A pandas DataFrame with one row per simulated request. When no
        request is generated, the frame is empty but still carries the
        record columns.
    """
    if start_time is None:
        start_time = datetime.now()

    # Sampling weights keyed by content type, so the pairing no longer
    # depends on the key order of CONTENT. Built once, outside the loops.
    type_weights = {'html': 0.2, 'video': 0.6, 'image': 0.2}
    content_types = list(type_weights)
    weights = list(type_weights.values())

    results = []
    for sec in range(duration_secs):
        current_time = start_time + timedelta(seconds=sec)
        intensity = get_request_intensity(current_time.hour)

        # Adjust number of active users based on intensity (scaled)
        active_users = int(n_users * intensity)

        # Every request within the same second shares one timestamp.
        timestamp = current_time.isoformat()
        for user_id in range(active_users):
            content_type = random.choices(content_types, weights=weights)[0]
            results.append(simulate_request(user_id, content_type, timestamp))

    if not results:
        # Keep the schema on an empty result so downstream column access
        # and the CSV header still work.
        return pd.DataFrame(columns=[
            'timestamp',
            'user_id',
            'content_type',
            'is_cache_hit',
            'cache_status',
            'latency_ms',
            'data_mb',
            'user_location',
        ])
    return pd.DataFrame(results)

# Save data if run as script
if __name__ == "__main__":
    import os

    # DataFrame.to_csv does not create missing directories, so make sure
    # the target folder exists before spending time generating the data.
    os.makedirs("output", exist_ok=True)

    df = generate_data()
    df.to_csv("output/cdn_requests.csv", index=False)
    print("Data saved to output/cdn_requests.csv")


Data saved to output/cdn_requests.csv
