In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from loguru import logger

import pickle
import requests
import json
import re
from pathlib import Path

data_dir = Path(globals()['_dh'][0]).parent / 'data'
figure_dir = data_dir / 'figures' / 'get_transfer_costs'
figure_dir.mkdir(exist_ok=True)

plt.style.use('seaborn-bright')
plt.set_cmap('plasma')

data_link = "https://b0.p.awsstatic.com/pricing/2.0/meteredUnitMaps/datatransfer/USD/current/datatransfer.json"
data = requests.get(data_link).json()

<Figure size 432x288 with 0 Axes>

In [4]:
data['regions'].keys()

dict_keys(['AWS GovCloud (US)', 'AWS GovCloud (US-East)', 'Africa (Cape Town)', 'Asia Pacific (Hong Kong)', 'Asia Pacific (KDDI) - Osaka', 'Asia Pacific (KDDI) - Tokyo', 'Asia Pacific (Mumbai)', 'Asia Pacific (Osaka)', 'Asia Pacific (SKT) - Daejeon', 'Asia Pacific (Seoul)', 'Asia Pacific (Singapore)', 'Asia Pacific (Sydney)', 'Asia Pacific (Tokyo)', 'Canada (Central)', 'EU (Frankfurt)', 'EU (Ireland)', 'EU (London)', 'EU (Milan)', 'EU (Paris)', 'EU (Stockholm)', 'EU (Vodafone) - Berlin', 'EU (Vodafone) - Dortmund', 'EU (Vodafone) - London', 'EU (Vodafone) - Munich', 'Middle East (Bahrain)', 'South America (Sao Paulo)', 'US East (Boston)', 'US East (Chicago)', 'US East (Dallas)', 'US East (Houston)', 'US East (Kansas City 2)', 'US East (Miami)', 'US East (Minneapolis)', 'US East (N. Virginia)', 'US East (New York City)', 'US East (Ohio)', 'US East (Philadelphia)', 'US East (Verizon) - Atlanta', 'US East (Verizon) - Boston', 'US East (Verizon) - Chicago', 'US East (Verizon) - Dallas', 'U

In [10]:
# get region names
region_name_map = {'Africa (Cape Town)': 'af-south-1', 'Asia Pacific (Hong Kong)': 'ap-east-1', 'Asia Pacific (Mumbai)': 'ap-south-1', 'Asia Pacific (Osaka)': 'ap-northeast-3', 'Asia Pacific (Seoul)': 'ap-northeast-2', 'Asia Pacific (Singapore)': 'ap-southeast-1', 'Asia Pacific (Sydney)': 'ap-southeast-2', 'Asia Pacific (Tokyo)': 'ap-northeast-1', 'AWS GovCloud (US-East)': 'us-gov-east-1', 'AWS GovCloud (US-West)': 'us-gov-west-1', 'Canada (Central)': 'ca-central-1', 'Europe (Frankfurt)': 'eu-central-1', 'Europe (Ireland)': 'eu-west-1', 'Europe (London)': 'eu-west-2', 'Europe (Milan)': 'eu-south-1', 'Europe (Paris)': 'eu-west-3', 'Europe (Stockholm)': 'eu-north-1', 'Middle East (Bahrain)': 'me-south-1', 'South America (São Paulo)': 'sa-east-1', 'US East (N. Virginia)': 'us-east-1', 'US East (Ohio)': 'us-east-2', 'US West (N. California)': 'us-west-1', 'US West (Oregon)': 'us-west-2'}
new_region_name_map_items = {}
for region_english, region_code in region_name_map.items():
    if region_english.startswith('Europe'):
        new_region_name_map_items[region_english.replace('Europe', 'EU')] = region_code
        new_name = region_english.replace('Europe', 'EU').replace('(', '').replace(')', '')
        new_region_name_map_items[new_name] = region_code
    if '(' in region_english:
        new_name = region_english.replace('(', '').replace(')', '')
        new_region_name_map_items[new_name] = region_code
region_name_map.update(new_region_name_map_items)
region_name_map['South America Sao Paulo'] = 'sa-east-1'
region_name_map['South America (Sao Paulo)'] = 'sa-east-1'
region_name_map['South America (São Paulo)'] = 'sa-east-1'
region_name_map['South America São Paulo'] = 'sa-east-1'
region_name_map['US East N. Virginia'] = 'us-east-1'
region_name_map['US East N Virginia'] = 'us-east-1'
region_name_map['US West N. California'] = 'us-west-1'
region_name_map['US West N California'] = 'us-west-1'

# parse json
cost_per_gb = []
unparsed_regions = []
for src_region, region_data in data['regions'].items():
    src = region_name_map.get(src_region)
    if src is None:
        logger.info(f"Missing region {src} ({src_region})")
        unparsed_regions.append(src_region)
    else:
        for dst_region, dst_region_data in region_data.items():
            cost = float(dst_region_data['price'])
            if dst_region.startswith('DataTransfer External Inbound'):
                assert cost == 0.
            elif dst_region.startswith('DataTransfer External Outbound'):
                regex = re.compile(r'DataTransfer External Outbound (?P<volume>.*)')
                match = regex.search(dst_region)
                if match:
                    if match.group('volume') == 'Next 10 TB':
                        cost_per_gb.append(dict(src=src, dst='internet', cost=cost))
                else:
                    logger.error(f'Could not parse {dst_region}')
            elif dst_region.startswith('DataTransfer InterRegion Outbound to'):
                regex = re.compile(r'DataTransfer InterRegion Outbound to (?P<dst_region>.*)')
                match = regex.search(dst_region)
                if match:
                    dst = region_name_map.get(match.group('dst_region'))
                    if dst is None:
                        unparsed_regions.append(match.group('dst_region'))
                    else:
                        cost_per_gb.append(dict(src=src, dst=dst, cost=cost))
                else:
                    logger.error(f'Could not parse {dst_region}')
            elif dst_region.startswith('Cloudfrontless') or dst_region.startswith('DirectoryService') or dst_region.startswith('Backup') or dst_region.startswith('RDS') or dst_region.startswith('FSX'):
                pass
            else:
                unparsed_regions.append(dst_region)
df = pd.DataFrame(cost_per_gb)
df.to_csv(data_dir / '..' / 'profiles' / 'aws_transfer_costs.csv', index=False)


2021-12-12 01:39:26.709 | INFO     | __main__:<module>:28 - Missing region None (AWS GovCloud (US))
2021-12-12 01:39:26.711 | INFO     | __main__:<module>:28 - Missing region None (Asia Pacific (KDDI) - Osaka)
2021-12-12 01:39:26.712 | INFO     | __main__:<module>:28 - Missing region None (Asia Pacific (KDDI) - Tokyo)
2021-12-12 01:39:26.714 | INFO     | __main__:<module>:28 - Missing region None (Asia Pacific (SKT) - Daejeon)
2021-12-12 01:39:26.718 | INFO     | __main__:<module>:28 - Missing region None (EU (Vodafone) - Berlin)
2021-12-12 01:39:26.719 | INFO     | __main__:<module>:28 - Missing region None (EU (Vodafone) - Dortmund)
2021-12-12 01:39:26.721 | INFO     | __main__:<module>:28 - Missing region None (EU (Vodafone) - London)
2021-12-12 01:39:26.726 | INFO     | __main__:<module>:28 - Missing region None (EU (Vodafone) - Munich)
2021-12-12 01:39:26.729 | INFO     | __main__:<module>:28 - Missing region None (US East (Boston))
2021-12-12 01:39:26.731 | INFO     | __main__:<m

In [144]:
df_gcp_regions = pd.read_csv(data_dir / '..' / 'profiles' / 'gcp_regions.csv')
gcp_region_map = dict(zip(df_gcp_regions['GCP name'], df_gcp_regions['GCP code']))

df_gcp = pd.read_csv(data_dir / '..' / 'profiles' / 'gcp_raw_pricing_api.csv')
df_gcp = df_gcp[df_gcp['Service description'] == 'Compute Engine']
df_gcp = df_gcp[df_gcp['Product taxonomy'].str.startswith('GCP > Network > Egress')]
df_gcp = df_gcp[df_gcp['Product taxonomy'] != 'GCP > Network > Egress > GCE > Premium > PD']
df_gcp['Egress tier'] = df_gcp['Product taxonomy'].str.split('>').str[-1]
df_gcp[['SKU description', 'Egress tier', 'List price ($)', 'Tiered usage start', 'Unit description', 'Per unit quantity']]

out_rows = []
bw_tuples = []
mismatches = []
for row in df_gcp.iterrows():
    row = row[1]
    if 'Internet' in row['SKU description'] and 'from' in row['SKU description'] and 'to' in row['SKU description']:
        regex = re.compile(r'Network Internet(?P<tier>.*) Egress from (?P<region>.*) to (?P<dst_region>.*)')
        match = regex.search(row['SKU description'])
        if match:
            src = match.group('region')
            dst = match.group('dst_region')
            bw_tuples.append(dict(src=src, dst=dst, cost=row['List price ($)'], unit=row['Unit description'], quantity=row['Per unit quantity'], tier=row['Egress tier'], tier_start=row['Tiered usage start']))
        else:
            logger.error(f'Could not parse {row["SKU description"]}')
    elif 'Internet' in row['SKU description'] and 'from' in row['SKU description'] and 'to' not in row['SKU description']:
        regex = re.compile(r'Network Internet(?P<tier>.*) Egress from (?P<region>.*)')
        match = regex.search(row['SKU description'])
        if match:
            src = match.group('region')
            dst = 'internet'
            bw_tuples.append(dict(src=src, dst=dst, cost=row['List price ($)'], unit=row['Unit description'], quantity=row['Per unit quantity'], tier=row['Egress tier'], tier_start=row['Tiered usage start']))
        else:
            logger.error(f'Could not parse {row["SKU description"]}')
    elif 'from' in row['SKU description'] and 'to' in row['SKU description']:
        regex = re.compile(r'.*from (?P<from>.*) to (?P<to>.*)')
        match = regex.search(row['SKU description'])
        if match:
            src = match.group('from')
            dst = match.group('to')
            bw_tuples.append(dict(src=src, dst=dst, cost=row['List price ($)'], unit=row['Unit description'], quantity=row['Per unit quantity'], tier=row['Egress tier'], tier_start=row['Tiered usage start']))
        else:
            logger.error(f'Could not parse {row["SKU description"]}')
    else:
        out_rows.append(row)
df_gcp_bw = pd.DataFrame(bw_tuples)
df_gcp_out = pd.DataFrame(out_rows)
# df_gcp_bw.to_csv(data_dir / '..' / 'profiles' / 'gcp_bw_costs.csv', index=False)

2021-12-11 20:37:11.331 | ERROR    | __main__:<module>:24 - Could not parse Network Internet Standard Tier Egress from Toronto
2021-12-11 20:37:11.332 | ERROR    | __main__:<module>:24 - Could not parse Network Internet Standard Tier Egress from Toronto
2021-12-11 20:37:11.333 | ERROR    | __main__:<module>:24 - Could not parse Network Internet Standard Tier Egress from Toronto


In [161]:
# match src, dst to region names
unmatched_pairs = []
out_rows = []
print(gcp_region_map.keys())
for row in df_gcp_bw.iterrows():
    row = row[1]
    src = row['src']
    dst = row['dst']
    if src in gcp_region_map:
        row['src'] = gcp_region_map[src]
    else:
        unmatched_pairs.append((src, dst))
    if dst in gcp_region_map:
        row['dst'] = gcp_region_map[dst]
    elif dst == 'internet':
        row['dst'] = 'internet'
    else:
        unmatched_pairs.append((src, dst))
    out_rows.append(row)
df_gcp_bw = pd.DataFrame(out_rows)
df_gcp_bw = df_gcp_bw[df_gcp_bw['tier_start'] == 0.]
df_gcp_bw.to_csv(data_dir / '..' / 'profiles' / 'gcp_bw_costs.csv', index=False)

dict_keys(['Belgium', 'Delhi', 'Finland', 'Frankfurt', 'Hong Kong', 'Iowa', 'Jakarta', 'Las Vegas', 'London', 'Los Angeles', 'Melbourne', 'Montreal', 'Mumbai', 'Netherlands', 'Oregon', 'Osaka', 'Salt Lake City', 'Sao Paulo', 'Seoul', 'Singapore', 'South Carolina', 'Sydney', 'Taiwan', 'Tokyo', 'Toronto', 'Virginia', 'Warsaw', 'Zurich'])


In [152]:
set(unmatched_pairs)

{('APAC', 'APAC'),
 ('APAC', 'Africa'),
 ('APAC', 'Americas'),
 ('APAC', 'Australia'),
 ('APAC', 'Central America'),
 ('APAC', 'China'),
 ('APAC', 'Delhi'),
 ('APAC', 'EMEA'),
 ('APAC', 'Eastern Europe'),
 ('APAC', 'Finland'),
 ('APAC', 'Frankfurt'),
 ('APAC', 'Hong Kong'),
 ('APAC', 'India'),
 ('APAC', 'Jakarta'),
 ('APAC', 'Japan'),
 ('APAC', 'Las Vegas'),
 ('APAC', 'London'),
 ('APAC', 'Los Angeles'),
 ('APAC', 'Melbourne'),
 ('APAC', 'Middle East'),
 ('APAC', 'Montreal'),
 ('APAC', 'Mumbai'),
 ('APAC', 'Netherlands'),
 ('APAC', 'Osaka'),
 ('APAC', 'Salt Lake City'),
 ('APAC', 'Sao Paulo'),
 ('APAC', 'Seoul'),
 ('APAC', 'Singapore'),
 ('APAC', 'South America'),
 ('APAC', 'Sydney'),
 ('APAC', 'Toronto'),
 ('APAC', 'Virginia'),
 ('APAC', 'Warsaw'),
 ('APAC', 'Western Europe'),
 ('APAC', 'Zurich'),
 ('Americas', 'APAC'),
 ('Americas', 'Africa'),
 ('Americas', 'Americas'),
 ('Americas', 'Australia'),
 ('Americas', 'Central America'),
 ('Americas', 'China'),
 ('Americas', 'Delhi'),
 ('Am