## Load M-Lab data

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid", context="notebook")

df = pd.read_csv("../../data/raw/mlab_ndt_us_30days_20251111_004612.csv",
                 parse_dates=["date"])

df.head()
df.columns


Index(['date', 'client_lat', 'client_lon', 'client_city', 'client_country',
       'client_asn', 'client_isp', 'server_lat', 'server_lon', 'server_site',
       'server_city', 'download_mbps', 'min_rtt_ms', 'packet_loss_rate'],
      dtype='object')

## 1. Load RIPE processed data and merge by time/region

In [4]:
ripe_path = ROOT / "notebooks" / "data" / "processed" / "ripe_processed.csv"
ripe = pd.read_csv(ripe_path, parse_dates=["date"])

ripe.head()
ripe.columns


FileNotFoundError: [Errno 2] No such file or directory: '/Users/wajihanaveed/Desktop/REC YT/cdn-multimetric-selection-main/notebooks/exploratory/notebooks/data/processed/ripe_processed.csv'

## Create a common region key. Easiest: use country.

In [None]:
# M-Lab region
mlab["region"] = mlab["client_country"]

# RIPE region (change 'country' → whatever your column is)
ripe["region"] = ripe["country"]  # or 'probe_country', etc.

# Keep only relevant columns
mlab_cols = [
    "date", "region",
    "download_mbps",    # throughput
    "min_rtt_ms",
    "packet_loss_rate",
]

ripe_cols = [
    "date", "region",
    "rtt_ms",           # RTT measure from RIPE (if duplicated you can drop later)
    "ttfb_ms"           # <-- make sure this is your TTFB column name
]

mlab_sub = mlab[mlab_cols]
ripe_sub = ripe[ripe_cols]

merged = pd.merge(
    mlab_sub,
    ripe_sub,
    on=["date", "region"],
    how="inner"        # inner join: only pairs where both have data
)

merged.head()
len(merged)


## 2. Build features & clean up
We want:
Throughput (target)
RTT, TTFB, loss (features)

In [None]:
df = merged.copy()

# Drop rows with missing values in the core metrics
df = df.dropna(subset=["download_mbps", "min_rtt_ms", "ttfb_ms", "packet_loss_rate"])

# Build transformed features
df["inv_rtt"] = 1.0 / df["min_rtt_ms"]
df["inv_ttfb"] = 1.0 / df["ttfb_ms"]
df["loss"] = df["packet_loss_rate"]  # just rename for convenience

df[["download_mbps", "min_rtt_ms", "ttfb_ms", "loss"]].describe()


## 3. Fit α, β, γ via linear regression
Interpret throughput as approximately linear in 1/RTT, 1/TTFB, and loss.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import numpy as np

X = df[["inv_rtt", "inv_ttfb", "loss"]].values
y = df["download_mbps"].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

reg = LinearRegression()
reg.fit(X_scaled, y)

alpha_raw, beta_raw, gamma_raw = reg.coef_
intercept = reg.intercept_

alpha_raw, beta_raw, gamma_raw, intercept


In [None]:
alpha = abs(alpha_raw)
beta = abs(beta_raw)
gamma = abs(gamma_raw)  # we’ll subtract this later

# Optional: normalize so they sum to 1
s = alpha + beta + gamma
alpha, beta, gamma = alpha/s, beta/s, gamma/s

alpha, beta, gamma


In [None]:
df["score"] = (
    alpha * df["inv_rtt"] +
    beta  * df["inv_ttfb"] -
    gamma * df["loss"]
)
df[["score", "download_mbps"]].head()


4. Evaluate Score vs “Lowest RTT” baseline (real data)
We need a grouping key that represents “one client choosing among many servers”.
A reasonable guess with what you have is:
group_cols = ["date", "client_city"] or ["date", "client_asn"]
Pick whichever actually gives multiple rows per group.

In [None]:
# If needed, bring in identifiers from original M-Lab frame
id_cols = ["client_city", "client_asn", "server_site", "server_city"]
df = df.merge(
    mlab[id_cols + ["date", "region"]],
    on=["date", "region"],
    how="left"
)

group_cols = ["date", "client_city"]  # change if needed

df[group_cols].head()


In [None]:
def pick_lowest_rtt(g):
    return g.loc[g["min_rtt_ms"].idxmin()]

def pick_highest_score(g):
    return g.loc[g["score"].idxmax()]


In [None]:
baseline_rtt = df.groupby(group_cols, group_keys=False).apply(pick_lowest_rtt)
score_choice = df.groupby(group_cols, group_keys=False).apply(pick_highest_score)

baseline_rtt.shape, score_choice.shape


In [None]:
def summarize_throughput(series):
    return {
        "median": np.median(series),
        "p90": np.percentile(series, 90)
    }

base_stats = summarize_throughput(baseline_rtt["download_mbps"])
score_stats = summarize_throughput(score_choice["download_mbps"])

improvement_median = (score_stats["median"] - base_stats["median"]) / base_stats["median"] * 100
improvement_p90 = (score_stats["p90"] - base_stats["p90"]) / base_stats["p90"] * 100

base_stats, score_stats, improvement_median, improvement_p90


## 5. Evaluate on synthetic dataset
Your synthetic data file is at synthetic_cdn/synthetic_cdn_data.csv. I’ll assume columns:
scenario_id – each client/time scenario
server_id
rtt_ms, ttfb_ms, loss, throughput_mbps
Adjust names if different.

In [None]:
syn_path = ROOT / "synthetic_cdn" / "synthetic_cdn_data.csv"
syn = pd.read_csv(syn_path)

syn["inv_rtt"] = 1.0 / syn["rtt_ms"]
syn["inv_ttfb"] = 1.0 / syn["ttfb_ms"]
syn["loss"] = syn["loss"]

syn["score"] = (
    alpha * syn["inv_rtt"] +
    beta  * syn["inv_ttfb"] -
    gamma * syn["loss"]
)

group_cols_syn = ["scenario_id"]

baseline_rtt_syn = syn.groupby(group_cols_syn, group_keys=False).apply(
    lambda g: g.loc[g["rtt_ms"].idxmin()]
)

score_choice_syn = syn.groupby(group_cols_syn, group_keys=False).apply(
    lambda g: g.loc[g["score"].idxmax()]
)

base_stats_syn = summarize_throughput(baseline_rtt_syn["throughput_mbps"])
score_stats_syn = summarize_throughput(score_choice_syn["throughput_mbps"])

improvement_median_syn = (score_stats_syn["median"] - base_stats_syn["median"]) / base_stats_syn["median"] * 100
improvement_p90_syn = (score_stats_syn["p90"] - base_stats_syn["p90"]) / base_stats_syn["p90"] * 100

base_stats_syn, score_stats_syn, improvement_median_syn, improvement_p90_syn
