In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from skylark.utils import logger

import pickle
import requests
import json
import re
from pathlib import Path

# Resolve the data directory relative to the notebook's location.
# `_dh` is IPython's directory-history list (entry 0 is the directory the kernel
# started in), so this cell only works inside an IPython/Jupyter kernel.
data_dir = Path(globals()["_dh"][0]).parent / "data"
figure_dir = data_dir / "figures" / "get_transfer_costs"
figure_dir.mkdir(exist_ok=True, parents=True)

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and newer
# releases no longer ship the old names; use whichever one this installation has.
plot_style = "seaborn-v0_8-bright" if "seaborn-v0_8-bright" in plt.style.available else "seaborn-bright"
plt.style.use(plot_style)
plt.set_cmap("plasma")

data_link = "https://b0.p.awsstatic.com/pricing/2.0/meteredUnitMaps/datatransfer/USD/current/datatransfer.json"
# Bound the request so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error instead of trying to decode an error page as JSON.
pricing_response = requests.get(data_link, timeout=30)
pricing_response.raise_for_status()
data = pricing_response.json()

In [None]:
# Inspect the keys under data["regions"]; the parsing cell below looks each of
# these up in region_name_map to obtain a region code.
data["regions"].keys()

In [None]:
# Map AWS region display names (and the spelling variants generated below)
# to their region codes.
region_name_map = {
    "Africa (Cape Town)": "af-south-1",
    "Asia Pacific (Hong Kong)": "ap-east-1",
    "Asia Pacific (Mumbai)": "ap-south-1",
    "Asia Pacific (Osaka)": "ap-northeast-3",
    "Asia Pacific (Seoul)": "ap-northeast-2",
    "Asia Pacific (Singapore)": "ap-southeast-1",
    "Asia Pacific (Sydney)": "ap-southeast-2",
    "Asia Pacific (Jakarta)": "ap-southeast-3",
    "Asia Pacific (Tokyo)": "ap-northeast-1",
    "AWS GovCloud (US-East)": "us-gov-east-1",
    "AWS GovCloud (US-West)": "us-gov-west-1",
    "Canada (Central)": "ca-central-1",
    "Europe (Frankfurt)": "eu-central-1",
    "Europe (Ireland)": "eu-west-1",
    "Europe (London)": "eu-west-2",
    "Europe (Milan)": "eu-south-1",
    "Europe (Paris)": "eu-west-3",
    "Europe (Stockholm)": "eu-north-1",
    "Middle East (Bahrain)": "me-south-1",
    "South America (São Paulo)": "sa-east-1",
    "US East (N. Virginia)": "us-east-1",
    "US East (Ohio)": "us-east-2",
    "US West (N. California)": "us-west-1",
    "US West (Oregon)": "us-west-2",
}

# Derive alias spellings for every canonical name:
#   * "Europe ..." entries also get an "EU ..." form, with and without parentheses;
#   * every parenthesised name also gets a form with the parentheses removed.
new_region_name_map_items = {}
for display_name, code in region_name_map.items():
    aliases = []
    if display_name.startswith("Europe"):
        eu_name = display_name.replace("Europe", "EU")
        aliases.append(eu_name)
        aliases.append(eu_name.replace("(", "").replace(")", ""))
    if "(" in display_name:
        aliases.append(display_name.replace("(", "").replace(")", ""))
    for alias in aliases:
        new_region_name_map_items[alias] = code
region_name_map.update(new_region_name_map_items)

# Hand-written spellings: unaccented "Sao Paulo" and "N." without the period.
region_name_map.update(
    {
        "South America Sao Paulo": "sa-east-1",
        "South America (Sao Paulo)": "sa-east-1",
        "South America (São Paulo)": "sa-east-1",
        "South America São Paulo": "sa-east-1",
        "US East N. Virginia": "us-east-1",
        "US East N Virginia": "us-east-1",
        "US West N. California": "us-west-1",
        "US West N California": "us-west-1",
    }
)

# parse json
# Walk the AWS pricing document and collect per-GB egress prices as
# dict(src=<region code>, dst=<region code or "internet">, cost=<float>) records.
# Names that cannot be mapped or recognised are collected in `unparsed_regions`.

# Compile the SKU-name patterns once rather than on every loop iteration.
external_outbound_re = re.compile(r"DataTransfer External Outbound (?P<volume>.*)")
inter_region_re = re.compile(r"DataTransfer InterRegion Outbound to (?P<dst_region>.*)")
# SKU name prefixes that are recognised but deliberately not recorded.
ignored_sku_prefixes = ("Cloudfrontless", "DirectoryService", "Backup", "RDS", "FSX")

cost_per_gb = []
unparsed_regions = []
for src_region, region_data in data["regions"].items():
    src = region_name_map.get(src_region)
    if src is None:
        # `src` is always None here, so log the display name that failed to map.
        logger.info(f"Missing region {src_region}")
        unparsed_regions.append(src_region)
        continue
    for dst_region, dst_region_data in region_data.items():
        cost = float(dst_region_data["price"])
        if dst_region.startswith("DataTransfer External Inbound"):
            # Inbound transfer is expected to be free; stop if that ever changes.
            assert cost == 0.0, f"Unexpected non-zero inbound price for {src}: {cost}"
        elif dst_region.startswith("DataTransfer External Outbound"):
            match = external_outbound_re.search(dst_region)
            if match is None:
                logger.error(f"Could not parse {dst_region}")
            elif match.group("volume") == "Next 10 TB":
                # Only the "Next 10 TB" volume tier is recorded as the internet
                # egress price; every other volume tier is skipped.
                cost_per_gb.append(dict(src=src, dst="internet", cost=cost))
        elif dst_region.startswith("DataTransfer InterRegion Outbound to"):
            match = inter_region_re.search(dst_region)
            if match is None:
                logger.error(f"Could not parse {dst_region}")
            else:
                dst_name = match.group("dst_region")
                dst = region_name_map.get(dst_name)
                if dst is None:
                    unparsed_regions.append(dst_name)
                else:
                    cost_per_gb.append(dict(src=src, dst=dst, cost=cost))
        elif dst_region.startswith(ignored_sku_prefixes):
            pass
        else:
            unparsed_regions.append(dst_region)
df = pd.DataFrame(cost_per_gb)
df.to_csv(data_dir / ".." / "profiles" / "aws_transfer_costs.csv", index=False)

In [None]:
# Lookup table from GCP region display name to GCP region code.
df_gcp_regions = pd.read_csv(data_dir / ".." / "profiles" / "gcp_regions.csv")
gcp_region_map = dict(zip(df_gcp_regions["GCP name"], df_gcp_regions["GCP code"]))

# Keep only Compute Engine network-egress SKUs (excluding the "Premium > PD"
# taxonomy) and derive the tier from the last taxonomy component.
# Built as a single chain ending in `.assign` so the new column is added to a
# fresh frame; assigning a column onto a filtered slice raises pandas'
# SettingWithCopyWarning.
df_gcp = (
    pd.read_csv(data_dir / ".." / "profiles" / "gcp_raw_pricing_api.csv")
    .loc[lambda frame: frame["Service description"] == "Compute Engine"]
    .loc[lambda frame: frame["Product taxonomy"].str.startswith("GCP > Network > Egress")]
    .loc[lambda frame: frame["Product taxonomy"] != "GCP > Network > Egress > GCE > Premium > PD"]
    # NOTE(review): the taxonomy is " > "-separated, so splitting on ">" leaves a
    # leading space in "Egress tier"; left as-is because the value is written out.
    .assign(**{"Egress tier": lambda frame: frame["Product taxonomy"].str.split(">").str[-1]})
)
# Column preview; it is not the last expression of the cell, so Jupyter does not
# render it, but it still raises KeyError if an expected column is missing.
df_gcp[["SKU description", "Egress tier", "List price ($)", "Tiered usage start", "Unit description", "Per unit quantity"]]

# Parse each GCP SKU description into an egress record (src, dst, price, tier).
# Three description shapes are recognised:
#   1. "Network Internet<tier> Egress from <region> to <dst_region>"
#   2. "Network Internet<tier> Egress from <region>"   (dst recorded as "internet")
#   3. "... from <from> to <to>"
# Rows that fit none of the shapes are collected in `out_rows`.

# Compile the patterns once instead of once per row.
internet_to_region_re = re.compile(r"Network Internet(?P<tier>.*) Egress from (?P<region>.*) to (?P<dst_region>.*)")
internet_only_re = re.compile(r"Network Internet(?P<tier>.*) Egress from (?P<region>.*)")
generic_from_to_re = re.compile(r".*from (?P<from>.*) to (?P<to>.*)")


def _egress_record(row, src, dst):
    """Build one bandwidth-cost record for `row` with the given endpoints."""
    return dict(
        src=src,
        dst=dst,
        cost=row["List price ($)"],
        unit=row["Unit description"],
        quantity=row["Per unit quantity"],
        tier=row["Egress tier"],
        tier_start=row["Tiered usage start"],
    )


out_rows = []
bw_tuples = []
mismatches = []
for _, row in df_gcp.iterrows():
    description = row["SKU description"]
    is_internet_egress = "Internet" in description and "from" in description
    # Look for the word " to ", not the bare substring "to": a region name that
    # merely contains "to" (e.g. "Toronto") would otherwise send a
    # destination-less Internet SKU to the from/to pattern, which cannot match.
    # Both from/to patterns contain a literal " to ", so rows that parsed
    # successfully before are classified exactly as they were.
    has_destination = " to " in description

    if is_internet_egress and has_destination:
        match = internet_to_region_re.search(description)
        endpoints = (match.group("region"), match.group("dst_region")) if match else None
    elif is_internet_egress:
        match = internet_only_re.search(description)
        endpoints = (match.group("region"), "internet") if match else None
    elif "from" in description and has_destination:
        match = generic_from_to_re.search(description)
        endpoints = (match.group("from"), match.group("to")) if match else None
    else:
        out_rows.append(row)
        continue

    if endpoints is None:
        logger.error(f'Could not parse {row["SKU description"]}')
    else:
        bw_tuples.append(_egress_record(row, *endpoints))
df_gcp_bw = pd.DataFrame(bw_tuples)
df_gcp_out = pd.DataFrame(out_rows)
# df_gcp_bw.to_csv(data_dir / '..' / 'profiles' / 'gcp_bw_costs.csv', index=False)

In [None]:
# Replace the src/dst display names in df_gcp_bw with GCP region codes.
# A name missing from gcp_region_map is left untouched in the output row and
# the (src, dst) pair is remembered in `unmatched_pairs`; "internet" is a valid
# destination and is kept verbatim.
unmatched_pairs = []
out_rows = []
print(gcp_region_map.keys())
for _, row in df_gcp_bw.iterrows():
    src, dst = row["src"], row["dst"]

    if src not in gcp_region_map:
        unmatched_pairs.append((src, dst))
    else:
        row["src"] = gcp_region_map[src]

    if dst in gcp_region_map:
        row["dst"] = gcp_region_map[dst]
    elif dst != "internet":
        unmatched_pairs.append((src, dst))

    out_rows.append(row)

# Rebuild the frame from the translated rows and keep only the rows whose
# tiered usage starts at 0 before writing the CSV.
df_gcp_bw = pd.DataFrame(out_rows).loc[lambda frame: frame["tier_start"] == 0.0]
df_gcp_bw.to_csv(data_dir / ".." / "profiles" / "gcp_bw_costs.csv", index=False)

In [None]:
# Show the distinct (src, dst) pairs for which the src or dst name was not
# found in gcp_region_map while translating names to region codes.
set(unmatched_pairs)