In [2]:
import os
import json

from pathlib import Path
from datetime import datetime

import pytz
import matplotlib.pyplot as plt
from matplotlib import rcParams
from collections import defaultdict
import numpy as np

# Root directory that is searched for measurement ``*.json`` files.
webMsms_dir = "../webMsms"

# Matplotlib text setup: LaTeX rendering, CMU Sans Serif at size 10.
plt.rc("text", usetex=True)
plt.rc("font", family="CMU Sans Serif", size=10)

# Colour sequence of the "tab10" colormap.
cols = plt.get_cmap("tab10").colors

# Series key -> label printed for that series.
country_mapping = dict(
    [
        ("Lusaka-NBO", "ZM (NBO PoP)"),
        ("Lusaka-JNB", "ZM (JNB PoP)"),
        ("Accra", "GH (LOS PoP)"),
        ("Berlin", "DE (FRA PoP)"),
        ("Vancouver", "CA (SEA PoP)"),
        ("Calgary", "CA (YYC PoP)"),
        ("Toronto", "CA (YUL PoP)"),
    ]
)

# City names (as stored in the measurement files) that are analysed.
unique_cities = [
    "Lusaka",
    "Accra",
    "Berlin",
    "Vancouver",
    "Calgary",
    "Toronto",
]

# Cache-status values counted as a hit and as a miss, respectively.
hit_statuses = set(("HIT", "REVALIDATED"))
miss_statuses = set(("MISS", "DYNAMIC", "EXPIRED", "BYPASS"))

In [3]:
def parse_allinone(filename: str):
    """Flatten one measurement JSON file into per-(domain, IP) curl records.

    The capture time is read from the file name: the text after the last
    underscore and before the next dot must match ``%Y%m%dT%H%M%SZ``. It is
    returned as a UTC-aware ``datetime``.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.

    Returns
    -------
    tuple
        ``(records, timestamp)``. ``records`` is a list with one dict per
        ``(domain, ip)`` pair found under ``web_measurements[domain]["curl"]``.
        Every dict carries ``domain``, ``timestamp``, ``ip``, ``city`` and
        ``country`` followed by the curl fields listed in ``curl_fields``;
        a curl field missing from the response is stored as ``None``.

    Raises
    ------
    ValueError
        If the file-name suffix does not match the timestamp format.
    KeyError
        If ``user_details``, its ``City``/``Country`` entries,
        ``web_measurements`` or a per-domain ``curl`` entry is missing.
    """
    # Close the handle deterministically instead of leaving it to the garbage
    # collector; many files are parsed in a row.
    with open(filename, "r", encoding="utf-8") as handle:
        data = json.load(handle)

    stamp_text = filename.split("_")[-1].split(".")[0]
    timestamps = datetime.strptime(stamp_text, "%Y%m%dT%H%M%SZ").replace(
        tzinfo=pytz.UTC
    )

    user_details = data["user_details"]
    city = user_details["City"]
    country = user_details["Country"]

    # Copied verbatim from each curl response, in this order.
    curl_fields = (
        "initconnect_time",
        "appconnect_time",
        "pretransfer_time",
        "redirect_time",
        "starttransfer_time",
        "total_time",
        "cdn_server_id_key",
        "cdn_server_id_value",
        "cf_cache_status",
        "x_cache",
        "x_cache_remote",
    )

    cache_result = []
    for domain, result in data["web_measurements"].items():
        for ip, curl_resp in result["curl"].items():
            record = {
                "domain": domain,
                "timestamp": timestamps,
                "ip": ip,
                "city": city,
                "country": country,
            }
            record.update((field, curl_resp.get(field)) for field in curl_fields)
            cache_result.append(record)
    return cache_result, timestamps

In [4]:
def plot(daily_city_hitrates):
    """Print the mean and standard deviation of daily hit rates per series.

    Despite its name this function draws nothing; it writes one line per
    series to stdout, in a fixed order, labelled via ``country_mapping``.

    Parameters
    ----------
    daily_city_hitrates : dict
        Maps a series key (e.g. ``"Lusaka-NBO"``) to a list of daily hit
        rates. A missing or empty series is reported with mean and std 0;
        a single-day series is reported with std 0. Otherwise the std is
        ``np.std`` with its default arguments.
    """
    city_order = (
        "Vancouver",
        "Lusaka-NBO",
        "Lusaka-JNB",
        "Accra",
        "Berlin",
        "Calgary",
        "Toronto",
    )

    mean_by_city = {}
    std_by_city = {}
    for name in city_order:
        rates = daily_city_hitrates.get(name, [])
        n_days = len(rates)
        mean_by_city[name] = sum(rates) / n_days if n_days else 0
        std_by_city[name] = np.std(rates) if n_days > 1 else 0

    # Report only after every statistic has been computed.
    for name in city_order:
        mean_hit, std_hit = mean_by_city[name], std_by_city[name]
        print(f"{country_mapping[name]} mean={mean_hit:.3f} std={std_hit:.3f}")

In [5]:
# Collect every ``*.json`` file found anywhere below the measurement directory.
filename = []
path = Path(webMsms_dir)
for dirpath, dirnames, files in os.walk(path):
    filename.extend(Path(dirpath).joinpath(f) for f in files if f.endswith(".json"))

# Series keys, in reporting order. "Lusaka" is split into two series depending
# on the measurement date (see ``lusaka_pop_cutoff``).
series_names = [
    "Vancouver",
    "Lusaka-NBO",
    "Lusaka-JNB",
    "Accra",
    "Berlin",
    "Calgary",
    "Toronto",
]

# series -> date -> {"hit": n, "miss": n}; each series owns its own defaultdict.
per_day_counts = {
    name: defaultdict(lambda: {"hit": 0, "miss": 0}) for name in series_names
}

# Lusaka files stamped before this instant go to "Lusaka-NBO", later ones to
# "Lusaka-JNB". Hoisted out of the loop because it never changes.
lusaka_pop_cutoff = datetime(2025, 4, 1, tzinfo=pytz.UTC)

# Substring that the CDN server id value must contain for a record of that
# city to be counted (presumably the serving PoP's airport code -- confirm).
required_pop_code = {"Calgary": "YYC", "Toronto": "YUL"}

for measurement_file in filename:
    per_file_results, timestamp = parse_allinone(str(measurement_file))
    if per_file_results is None:
        continue

    for record in per_file_results:
        city = record["city"]
        if city not in unique_cities:
            continue

        if city == "Lusaka":
            city = "Lusaka-NBO" if timestamp < lusaka_pop_cutoff else "Lusaka-JNB"

        # Only records identified by a ``cf-ray`` server id are counted.
        if record["cdn_server_id_key"] != "cf-ray":
            continue

        pop_code = required_pop_code.get(city)
        # ``or ""`` keeps a missing (None) id value from raising TypeError;
        # such a record simply fails the PoP check and is skipped.
        if pop_code is not None and pop_code not in (
            record["cdn_server_id_value"] or ""
        ):
            continue

        cache_status = record["cf_cache_status"]
        date_key = record["timestamp"].date()
        # Statuses in neither set are ignored and create no per-day entry.
        if cache_status in hit_statuses:
            per_day_counts[city][date_key]["hit"] += 1
        elif cache_status in miss_statuses:
            per_day_counts[city][date_key]["miss"] += 1

# One hit-rate value per (series, day) that has at least one counted record.
daily_city_hitrates = {name: [] for name in per_day_counts}
for city, day_map in per_day_counts.items():
    for counts in day_map.values():
        total = counts["hit"] + counts["miss"]
        if total > 0:
            daily_city_hitrates[city].append(counts["hit"] / total)

plot(daily_city_hitrates)

CA (SEA PoP) mean=0.950 std=0.033
ZM (NBO PoP) mean=0.646 std=0.076
ZM (JNB PoP) mean=0.937 std=0.055
GH (LOS PoP) mean=0.869 std=0.117
DE (FRA PoP) mean=0.959 std=0.048
CA (YYC PoP) mean=0.973 std=0.014
CA (YUL PoP) mean=0.981 std=0.012
