In [1]:
# Globals

# Measurement metadata we keep for each input measurement ID, from the RIPE ATLAS API GET call
KEEP_FIELDS = [
    'af',  # [4, 6] [Not for wifi] IPv4 of IPv6 Address family of the measurement
    'id',  # The unique identifier that RIPE Atlas assigned to this measurement
    'participant_count',  # Number of participating probes
    'probes',  # Probes involved in this measurement
    'target_asn',  # The number of the Autonomous System the IP address of the target belongs to
    'type',  # ["ping", "traceroute", "dns", "sslcert", "http", "ntp", "wifi"] The type of the measurement
]

# Name of the file where the data for all probes are saved
PROBES_DATA_CSV_NAME = '../data/probes_data.csv'

# Bias dimensions
BIAS_DIMENSIONS = [
    'RIR region', 
    'Location (country)', 
    'Location (continent)',
    'Customer cone (#ASNs)', 
    'Customer cone (#prefixes)',
    'Customer cone (#addresses)', 
    'AS hegemony',
    'Country influence (CTI origin)', 
    'Country influence (CTI top)',
    '#neighbors (total)', 
    '#neighbors (peers)', 
    '#neighbors (customers)',
    '#neighbors (providers)',
    '#IXPs (PeeringDB)',
    '#facilities (PeeringDB)', 
    'Peering policy (PeeringDB)', 
    'ASDB C1L1',
    'ASDB C1L2', 
    'Network type (PeeringDB)', 
    'Traffic ratio (PeeringDB)',
    'Traffic volume (PeeringDB)', 
    'Scope (PeeringDB)', 
    'Personal ASN'
]

In [34]:
# Module imports
import requests
import os
import warnings
import time
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from project_files.data_aggregation_tools import load_aggregated_dataframe
from project_files.map_probes import map_probes
from project_files.calculate_bias_for_list import calc_bias
# from globals import KEEP_FIELDS



In [3]:
# Input data and warnings setup

# C:\Users\thdia\Desktop\Thesis\ai4netmon-atlas-patterns\project_files\data_aggregation_tools.py:732
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)
# C:\Users\thdia\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\arraylike.py:396
warnings.filterwarnings("ignore", category=RuntimeWarning, message="divide by zero encountered in log")
# C:\Users\thdia\Desktop\Thesis\ai4netmon-atlas-patterns\project_files\bias_utils.py:355
warnings.filterwarnings("ignore", category=RuntimeWarning, message="invalid value encountered in divide")

def import_probes_data_df():
    if os.path.isfile(PROBES_DATA_CSV_NAME):
        return pd.read_csv(PROBES_DATA_CSV_NAME)
    else:
        raise FileNotFoundError('probes_data.csv does not exist in the /data/ directory.')
PROBES_DF = import_probes_data_df()

ASN_AGG_DF = load_aggregated_dataframe(preprocess=True)

input_meas_ids = [1018338, 1004340, 1017820, 1019139, 1017005, 1019222, 1007976, 1035173, 1010732, 1036065]

In [4]:
# Get url and reponse
def get_url(input_meas_ids):
    # Define base uri
    base_uri = 'https://atlas.ripe.net/api/v2'

    # Specify measurements IDs we want data for
    input_meas_ids_str = (',').join([str(x) for x in input_meas_ids])
    measurements_url = f'/measurements/?id__in={input_meas_ids_str}'

    # Specify which fields of the RIPE Atlas API response we want to keep
    fields = '&fields=' + (',').join(KEEP_FIELDS)

    # Construct final URI
    return base_uri + measurements_url + fields

def get_input_meas_data(input_meas_uri):
    return requests.get(input_meas_uri).json()

uri = get_url(input_meas_ids)
response = get_input_meas_data(uri)

In [5]:
response

{'count': 10,
 'next': None,
 'previous': None,
 'results': [{'af': 4,
   'id': 1004340,
   'participant_count': 15,
   'probes': [{'id': 5, 'url': 'https://atlas.ripe.net/api/v2/probes/5/'},
    {'id': 7, 'url': 'https://atlas.ripe.net/api/v2/probes/7/'},
    {'id': 17, 'url': 'https://atlas.ripe.net/api/v2/probes/17/'},
    {'id': 30, 'url': 'https://atlas.ripe.net/api/v2/probes/30/'},
    {'id': 32, 'url': 'https://atlas.ripe.net/api/v2/probes/32/'},
    {'id': 34, 'url': 'https://atlas.ripe.net/api/v2/probes/34/'},
    {'id': 74, 'url': 'https://atlas.ripe.net/api/v2/probes/74/'},
    {'id': 135, 'url': 'https://atlas.ripe.net/api/v2/probes/135/'},
    {'id': 181, 'url': 'https://atlas.ripe.net/api/v2/probes/181/'},
    {'id': 259, 'url': 'https://atlas.ripe.net/api/v2/probes/259/'},
    {'id': 418, 'url': 'https://atlas.ripe.net/api/v2/probes/418/'},
    {'id': 452, 'url': 'https://atlas.ripe.net/api/v2/probes/452/'},
    {'id': 517, 'url': 'https://atlas.ripe.net/api/v2/probes/51

In [6]:
# Create base dataframe containing measurement meta data
def get_asn_af(meas_af):
    v4, v6 = 0, 0
    if meas_af == 4:
        v4 = 1
    elif meas_af == 6:
        v6 = 1
    else:
        raise ValueError(f'"af" field can be one of 4 or 6, encountered value {meas_af}.')
    
    return v4, v6
    

def get_asns_column(row):
    meas_id = row['meas_id']
    meas_probes_list = row['probes']

    # If there are no probes in the measurement
    if not meas_probes_list:
        meas_asns = []
        print(f'{meas_id}: No probes found, continuing to the next measurement.')
    else:
        # Get the address family boolean for the measurement
        meas_af = row['af']
        v4, v6 = get_asn_af(meas_af)
        # Get the ASNs for the current measurement
        meas_asns, not_probes = map_probes(meas_probes_list, v4, v6, PROBES_DF)
        # If no ASNs are found
        if len(meas_asns) == 0:
            print(f'{meas_id}: No ASNs found, continuing to the next measurement.')
        else:
            print(f'{meas_id}: Found ASNs {meas_asns}.')

    return meas_asns

def clean_asns_col(df):
    no_asn_mask = df['num_asns'] == 0
    return df[~no_asn_mask], df[no_asn_mask]


# Create df with the data from the API
meas_data_df = pd.DataFrame(response['results']).rename(columns = {'participant_count': 'num_probes', 'id': 'meas_id'})
# Keep only probes ids for each probe entry in each measurement
meas_data_df['probes'] = meas_data_df['probes'].apply(lambda l: [probe['id'] for probe in l])
# Get the ASNs for each measurement based on the measurement's probes and af
meas_data_df['asns'] = meas_data_df.apply(get_asns_column, axis = 1)
meas_data_df['num_asns'] = meas_data_df['asns'].apply(lambda l: len(l))
meas_data_df, no_asns_df = clean_asns_col(meas_data_df)
meas_data_df

1004340: Found ASNs [3265, 33915, 3333, 20115, 20001, 6830, 4771, 15598, 44489, 13030, 199422, 3292, 28855].
1007976: No ASNs found, continuing to the next measurement.
1010732: Found ASNs [12479, 8283, 8708, 3209, 3333, 35100, 3320, 25376, 39647, 41000, 2856, 8356, 206238, 13030, 3352, 6805, 12586, 8412, 20712, 21396, 209097, 62353, 31424, 39608, 33915, 1136, 3320, 8437, 52030, 8365, 25148, 21502, 29518, 25180, 3215, 196803, 680, 3320, 202945, 1741, 3215, 20712, 62313, 34119, 31078, 2119, 50629, 31708, 35540, 31549, 12963, 5089, 37002, 3209, 12392, 3333, 6871, 3215, 2856, 3320, 8897, 12579, 6830, 20825, 8897, 2856, 3215, 201116, 9105, 56478, 8897, 59469, 197529, 20712, 24961, 31027, 5089, 29695, 8881, 3320, 3320, 16086, 3320, 199298, 16316, 5410, 6848, 3215, 3216, 15626, 3308, 12322, 6855, 8334, 3209, 3320, 196922, 3308, 3320, 20712, 33890, 8767, 39815, 8473, 39545, 12322, 35007, 33920, 34762, 12897, 9143, 8744, 5610, 15844, 3215, 8708, 3212, 6830, 197301, 13030, 43408, 43599, 3333, 3

Unnamed: 0,af,meas_id,num_probes,probes,target_asn,type,asns,num_asns
0,4,1004340,15,"[5, 7, 17, 30, 32, 34, 74, 135, 181, 259, 418,...",24940,ping,"[3265, 33915, 3333, 20115, 20001, 6830, 4771, ...",13
2,4,1010732,998,"[2001, 2002, 2003, 2007, 2009, 2010, 2011, 201...",29169,ping,"[12479, 8283, 8708, 3209, 3333, 35100, 3320, 2...",819
3,6,1017005,98,"[2002, 2014, 2015, 2035, 2052, 2070, 2071, 213...",20633,traceroute,"[8283, 25376, 8365, 1741, 197529, 3320, 12322,...",71
4,6,1017820,905,"[2002, 2009, 2010, 2014, 2015, 2016, 2017, 201...",5425,traceroute,"[8283, 3333, 35100, 25376, 41000, 2856, 206238...",601
5,6,1018338,905,"[2002, 2009, 2010, 2014, 2015, 2016, 2017, 201...",7014,traceroute,"[8283, 3333, 35100, 25376, 41000, 2856, 206238...",602
6,6,1019139,905,"[2002, 2009, 2010, 2014, 2015, 2016, 2017, 201...",237,traceroute,"[8283, 3333, 35100, 25376, 41000, 2856, 206238...",601
7,6,1019222,906,"[2002, 2009, 2010, 2014, 2015, 2016, 2017, 201...",20940,traceroute,"[8283, 3333, 35100, 25376, 41000, 2856, 206238...",601
8,6,1035173,20,"[76, 277, 319, 385, 425, 436, 443, 799, 801, 8...",2128,traceroute,"[3320, 195, 8190, 3320, 3265, 30764, 9790, 433...",14
9,4,1036065,4,"[2549, 4712, 10093, 10657]",13335,traceroute,"[7922, 8400, 20845, 36351]",4


In [7]:
# Calculate bias for each measurement for all bias dimensions
def generate_null_bias_data(meas_id):
    # Generate null data for each bias dimension
    data = {col: None for col in BIAS_DIMENSIONS}
    data['meas_id'] = meas_id
    bias_df = pd.DataFrame.from_dict(data, orient='index',).T#.set_index('meas_id')
    bias_df['meas_id'] = bias_df['meas_id'].astype('int64')
    bias_df.set_index('meas_id', inplace = True)
    return bias_df


def get_bias_columns(row):
    meas_id = row['meas_id']
    meas_asns = row['asns']
    if len(meas_asns) == 0:
        print(f'{meas_id}: This measurement contains no ASNs. Bias values will be set to NaN.')
        meas_bias_df = generate_null_bias_data(meas_id)
    else:
        try:
            meas_bias_df = calc_bias(meas_asns, 'probes', 0, ASN_AGG_DF).transpose().rename(index={'bias': meas_id}).rename_axis('meas_id')
            # Keep only the bias dimensions we have defined
            meas_bias_df = meas_bias_df[BIAS_DIMENSIONS]
        except KeyError:
            # Can happen if some asn id in meas_asns does not exist in ASN_AGG_DF. Skip that measurement.
            print(f'{meas_id}: This measurement contains ASNs that do not exist in ASN_AGG_DF. Bias values for this measurement will be set to NaN.')
            meas_bias_df = generate_null_bias_data(meas_id)
    
    return meas_bias_df

def get_bias_df(meas_data_df):
    bias_dfs = []
    for idx, row in meas_data_df.iterrows():
        meas_bias_df = get_bias_columns(row)
        bias_dfs.append(meas_bias_df)

    return pd.concat(bias_dfs)

bias_df = get_bias_df(meas_data_df)
bias_df

Unnamed: 0_level_0,RIR region,Location (country),Location (continent),Customer cone (#ASNs),Customer cone (#prefixes),Customer cone (#addresses),AS hegemony,Country influence (CTI origin),Country influence (CTI top),#neighbors (total),...,#IXPs (PeeringDB),#facilities (PeeringDB),Peering policy (PeeringDB),ASDB C1L1,ASDB C1L2,Network type (PeeringDB),Traffic ratio (PeeringDB),Traffic volume (PeeringDB),Scope (PeeringDB),Personal ASN
meas_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1004340,0.171873,0.642551,0.349104,0.329489,0.329047,0.626389,0.424183,0.514542,0.517537,0.788205,...,0.096716,0.135221,0.081611,0.270562,0.320717,0.120418,0.341297,0.348701,0.421807,0.007753
1010732,0.170377,0.317079,0.175232,0.220912,0.307882,0.37382,0.343543,0.244629,0.132864,0.265794,...,0.084243,0.127557,0.101861,0.13088,0.224312,0.090878,0.069868,0.158698,0.136943,0.000416
1017005,0.221816,0.522974,0.26234,0.202721,0.276674,0.472981,0.375286,0.241696,0.3449,0.311071,...,0.112102,0.115121,0.034328,0.249947,0.319293,0.122936,0.150213,0.098219,0.260686,0.000153
1017820,0.122544,0.316134,0.135383,0.239307,0.262459,0.342891,0.310569,0.210329,0.180628,0.304376,...,0.081053,0.137483,0.06515,0.116303,0.17237,0.108181,0.11798,0.136567,0.189984,0.000393
1018338,0.122005,0.31549,0.134773,0.23836,0.262329,0.342887,0.31037,0.208892,0.180297,0.306187,...,0.080729,0.135859,0.063052,0.11643,0.169082,0.104757,0.111933,0.137109,0.182132,0.000395
1019139,0.121818,0.315176,0.134567,0.239259,0.262359,0.342685,0.310229,0.209521,0.179838,0.304242,...,0.080452,0.137396,0.063302,0.114991,0.171998,0.108159,0.114872,0.136556,0.185933,0.000393
1019222,0.122179,0.315626,0.134972,0.239259,0.262361,0.342685,0.310229,0.209515,0.179838,0.304251,...,0.081176,0.137413,0.063364,0.114991,0.171998,0.108333,0.114822,0.136604,0.185972,0.000393
1035173,0.15177,0.602739,0.321128,0.285728,0.326182,0.427216,0.349024,0.438162,0.408601,0.279508,...,0.17155,0.232481,0.088309,0.273971,0.377928,0.215736,0.391985,0.476231,0.390844,0.007753
1036065,0.318194,0.688455,0.338364,0.919315,0.950857,0.972358,0.965467,0.980047,0.554059,0.918319,...,0.302899,0.741789,0.805042,0.401279,0.399478,0.372766,0.389288,0.921727,0.767227,0.007753


In [8]:
# Extract probes_df and asns_df and remove probes and asns columns from meas_data_df
def extract_list_column(df, col_name):
    """
    Eextracts the col_name list-type column into a new df by exploding it (together with)
    the measurement id, returns it and also removes the column from the initial dataframe.
    """
    # Extract list colunmn as a new dataframe
    col_list_df = df[col_name].explode().reset_index()
    # Drop list column from initial dataframe
    df = df.drop([col_name], axis = 1)
    print(f' Removed column {col_name} to a new df since it was a list column.')
    return df, col_list_df

def extract_list_columns(df):
    df_ = df.copy().set_index('meas_id')
    df, probes_df = extract_list_column(df_, 'probes')
    df, asns_df = extract_list_column(df, 'asns')

    probes_df = probes_df.rename(columns = {'probes': 'probe_id'}).astype(int)
    asns_df = asns_df.rename(columns = {'asns': 'asn'}).astype(int)

    return df.reset_index(), probes_df, asns_df

# Explode probes and asns columns to get a df containing probe/asn id and measurement id
meas_data_df, probes_df, asns_df = extract_list_columns(meas_data_df)

 Removed column probes to a new df since it was a list column.
 Removed column asns to a new df since it was a list column.


In [9]:
# Get final df by combining meas_data_df and bias_df
def get_final_df(meas_data_df, bias_df):
    return meas_data_df.set_index('meas_id').join(bias_df)

df = get_final_df(meas_data_df, bias_df)
df

Unnamed: 0_level_0,af,num_probes,target_asn,type,num_asns,RIR region,Location (country),Location (continent),Customer cone (#ASNs),Customer cone (#prefixes),...,#IXPs (PeeringDB),#facilities (PeeringDB),Peering policy (PeeringDB),ASDB C1L1,ASDB C1L2,Network type (PeeringDB),Traffic ratio (PeeringDB),Traffic volume (PeeringDB),Scope (PeeringDB),Personal ASN
meas_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1004340,4,15,24940,ping,13,0.171873,0.642551,0.349104,0.329489,0.329047,...,0.096716,0.135221,0.081611,0.270562,0.320717,0.120418,0.341297,0.348701,0.421807,0.007753
1010732,4,998,29169,ping,819,0.170377,0.317079,0.175232,0.220912,0.307882,...,0.084243,0.127557,0.101861,0.13088,0.224312,0.090878,0.069868,0.158698,0.136943,0.000416
1017005,6,98,20633,traceroute,71,0.221816,0.522974,0.26234,0.202721,0.276674,...,0.112102,0.115121,0.034328,0.249947,0.319293,0.122936,0.150213,0.098219,0.260686,0.000153
1017820,6,905,5425,traceroute,601,0.122544,0.316134,0.135383,0.239307,0.262459,...,0.081053,0.137483,0.06515,0.116303,0.17237,0.108181,0.11798,0.136567,0.189984,0.000393
1018338,6,905,7014,traceroute,602,0.122005,0.31549,0.134773,0.23836,0.262329,...,0.080729,0.135859,0.063052,0.11643,0.169082,0.104757,0.111933,0.137109,0.182132,0.000395
1019139,6,905,237,traceroute,601,0.121818,0.315176,0.134567,0.239259,0.262359,...,0.080452,0.137396,0.063302,0.114991,0.171998,0.108159,0.114872,0.136556,0.185933,0.000393
1019222,6,906,20940,traceroute,601,0.122179,0.315626,0.134972,0.239259,0.262361,...,0.081176,0.137413,0.063364,0.114991,0.171998,0.108333,0.114822,0.136604,0.185972,0.000393
1035173,6,20,2128,traceroute,14,0.15177,0.602739,0.321128,0.285728,0.326182,...,0.17155,0.232481,0.088309,0.273971,0.377928,0.215736,0.391985,0.476231,0.390844,0.007753
1036065,4,4,13335,traceroute,4,0.318194,0.688455,0.338364,0.919315,0.950857,...,0.302899,0.741789,0.805042,0.401279,0.399478,0.372766,0.389288,0.921727,0.767227,0.007753


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, 1004340 to 1036065
Data columns (total 28 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   af                              9 non-null      int64  
 1   num_probes                      9 non-null      int64  
 2   target_asn                      9 non-null      int64  
 3   type                            9 non-null      object 
 4   num_asns                        9 non-null      int64  
 5   RIR region                      9 non-null      float64
 6   Location (country)              9 non-null      float64
 7   Location (continent)            9 non-null      float64
 8   Customer cone (#ASNs)           9 non-null      float64
 9   Customer cone (#prefixes)       9 non-null      float64
 10  Customer cone (#addresses)      9 non-null      float64
 11  AS hegemony                     9 non-null      float64
 12  Country influence (CTI origin)  9

From here on, we are ready to start creating the plots we want. This includes:

* Figuring out the best way to represent the data so that we don't have to change a lot of things for each plot.

* Getting the code for the plots we want, namely:

    1) Top most frequent ASNs and probes (separate plots)

        * Need: Probes and ASes that appear in our measurements set -> `probes_df`, `asns_df`

    2) Histogram of number of probes and asns used per measurement (separate plots)

        * Need: Probes and ASes that appear in our measurements set. -> `probes_df`, `asns_df`

    3) Find avg number of probes from all input measurements (N = `avg_num_probes`). Then Average bias per dimension for a random set of N probes.

        * Need: Number of probes per measurement -> Probes that appear in our measuremnts set.
        * Need: AI4NetMon endpoint for finding average bias of measurements with X randomly sampled probes.

    4) Avg bias per dim

        * Need: - ASNs for each measurement in `input_meas_ids`.

    5) Scatter of number of probes vs Avg bias per measurement

        * Need: Number of probes per measurement, avg bias per measurement -> Probes and ASes that appear in our measurements set

    6) CDF of bias per Bias dimension.

        * Need: Bias per dimension for each measurement in `input_meas_ids`.

    7) Bias causes heatmap.

        * AI4NetMon Bias causes endpoint
        * ASNs per measurement for each measurement in `input_meas_ids`

## 1 - Top most frequent ASNs and probes

In [11]:
from plotly.subplots import make_subplots
import plotly.colors as pc
PLOTLY_DEFAULT_COLORS = pc.DEFAULT_PLOTLY_COLORS

In [12]:
def get_bar_trace(data_df, col, normalize_flag = False):
    # Plot distributions for probes and asns

    # Get counts
    plot_data = data_df[col].value_counts(normalize = normalize_flag).reset_index()

    if normalize_flag:
        trace = go.Bar(
            x = plot_data[col],
            y = plot_data['proportion'],
            showlegend=False
        )
        title = f'{plot_data.columns[0]} proportions'
    else:
        trace = go.Bar(
            x = plot_data[col],
            y = plot_data['count'],
            showlegend=False
        )
        title = f'{plot_data.columns[0]} counts'

    trace_dict = {
        'title': title,
        'traces': trace
    }

    return trace_dict

def get_reduced_bar_trace(data_df, col, show_lines = 1000, normalize_flag = False):
    # Plot distributions for probes and asns

    # Get counts
    plot_data = data_df[col].value_counts(normalize = normalize_flag).reset_index()
    top_plot_data = plot_data.iloc[:show_lines, :]
    
    if normalize_flag:
        trace = go.Bar(
            x = top_plot_data[col],
            y = top_plot_data['proportion'],
            showlegend=False
        )
        title = f'{plot_data.columns[0]} proportions (showing top {show_lines}/{plot_data.shape[0]})'
    else:
        trace = go.Bar(
            x = top_plot_data[col],
            y = top_plot_data['count'],
            showlegend=False
        )
        title = f'{plot_data.columns[0]} counts (showing top {show_lines}/{plot_data.shape[0]})'

    trace_dict = {
        'title': title,
        'traces': trace
    }

    return trace_dict

bar_traces = []
for dfp, col in zip([probes_df, asns_df], ['probe_id', 'asn']):
    dfp[col] = dfp[col].astype(str)
    total_bar_trace = get_bar_trace(dfp, col)
    bar_traces.append(total_bar_trace)

for dfp, col in zip([probes_df, asns_df], ['probe_id', 'asn']):
    bar_trace = get_reduced_bar_trace(dfp, col)
    bar_traces.append(bar_trace)

# Create subplots with the specified grid layout
rows = 2
cols = 2
fig = make_subplots(rows = rows, cols = cols,
                    subplot_titles = [d['title'] for d in bar_traces])

fig.append_trace(bar_traces[0]['traces'], row = 1, col = 1)
fig.append_trace(bar_traces[1]['traces'], row = 1, col = 2)
fig.append_trace(bar_traces[2]['traces'], row = 2, col = 1)
fig.append_trace(bar_traces[3]['traces'], row = 2, col = 2)

fig.update_traces(marker_color = PLOTLY_DEFAULT_COLORS[2])
fig.update_xaxes(visible = False, row = 1, col = 1)
fig.update_xaxes(visible = False, row = 1, col = 2)
fig.update_xaxes(visible = False, row = 2, col = 1)
fig.update_xaxes(visible = False, row = 2, col = 2)

# Update the layout
fig.update_layout(
    width = 1200,
    height = 700
)

fig.show()

## 2 - Histogram of number of probes and asns used per measurement (separate plots)

In [13]:
# Histogram 
fig = go.Figure(
    go.Bar(
        x = df.index.astype(str),
        y = df['num_probes'],
        showlegend=False
    )
)

fig.update_layout(
    xaxis_title = 'Measurement ID',
    yaxis_title = 'Number of probes',
    title_text = 'Number of probes per measurement',
    width = 1000,
    height = 500
)

fig.show()

In [14]:
# Histogram 
fig = go.Figure(
    go.Bar(
        x = df.index.astype(str),
        y = df['num_asns'],
        showlegend=False
    )
)

fig.update_layout(
    xaxis_title = 'Measurement ID',
    yaxis_title = 'Number of ASNs',
    title_text = 'Number of ASNs per measurement',
    width = 1000,
    height = 500
)

fig.show()

## 3- Median number of probes from all input measurements vs random sample 

Here we first find the average number of probes $\hat{N}$ from all input measurements and then compare the average bias value of our input measurement with the average bias of a set of $\hat{N}$ randomly selected probes.

NOTE: Is avg better than median? The distribution of num_probes will most likely be skewed.

In [15]:
def get_ai4netmon_atlas_random_sample_uri(N):
    return f'https://ai4netmon.csd.auth.gr/api/bias/randomAtlas/{N}'

def get_rand_avg_bias_per_dim(N):
    atlas_random_sample_uri = get_ai4netmon_atlas_random_sample_uri(N)
    response = requests.get(atlas_random_sample_uri).json()['Atlas'][f'{N}']
    return pd.Series(response)

def get_sample_avg_bias_per_dim(df):
    return df[BIAS_DIMENSIONS].reset_index(drop = True).mean()

def get_avg_bias(ser, ndec = 2):
    return round(ser.mean(), ndec)

In [16]:
def get_radar_trace(data, name, color):
    r = list(data.values)
    r = [*r, r[0]]
    theta = list(data.index)
    theta = [*theta, theta[0]]

    trace = go.Scatterpolar(
            r = r,
            theta = theta,
            name = name,
            line_color = color
    )

    return trace

def get_radar_traces(radar_data_dict):
    radar_traces = []
    for name, data_dict in radar_data_dict.items():
        data = data_dict['data']
        color = data_dict['color']
        trace = get_radar_trace(data, name, color)
        radar_traces.append(trace)
    
    return radar_traces

def plot_radar(radar_data_dict):
    radar_traces = get_radar_traces(radar_data_dict)
    
    fig = go.Figure()
    for trace in radar_traces:
        fig.add_trace(trace)

    fig.update_traces(opacity = 0.6, fill='toself')

    theta = list(radar_data_dict['Input']['data'].index)
    theta_labels = [x.replace(' (', '<br>(') for x in [*theta, theta[0]]]

    fig.update_layout(
            height = 500,
            width = 700,
            title = f'Average Bias Distribution for Input and Random sample',
            font = {
                'size': 10
            },
            polar={
                'radialaxis': {
                    'visible': True,
                    'range': [0, 1]
                },
                'angularaxis': {
                    'rotation': 90,
                    'ticktext': theta_labels
                }
            },
            
        )

    fig.show()

def get_bar_trace(x_vals, y_vals, colors = None):
    if colors is None:
        trace = go.Bar(
        x = x_vals,
        y = y_vals,
        marker_color = colors
    )
    else:
        trace = go.Bar(
            x = x_vals,
            y = y_vals,
            marker_color = colors
        )

    return trace

def prepare_bar_data(bar_data_dict):
    # Extract keys (x values) and values (y values) from the dictionary
    keys = list(avg_bias_data.keys())
    values = [data_dict['data'] for data_dict in avg_bias_data.values()]

    # Specify colors for each bar
    colors = PLOTLY_DEFAULT_COLORS[:len(keys)]

    return keys, values, colors

def plot_bar(bar_trace, fig_config):

    # Create the bar plot with custom colors
    fig = go.Figure(bar_trace)

    # Customize the layout if needed
    fig.update_layout(
        title = fig_config['title'],
        xaxis_title = fig_config['xaxis_title'],
        yaxis_title = fig_config['yaxis_title'],
        height = 500,
        width = 600,
    )

    # Show the plot
    fig.show()

In [17]:
# Average number of probes in input measurements
N = int(round(df['num_probes'].median(), 0))

rand_avg_bias_per_dim = get_rand_avg_bias_per_dim(N)
sample_avg_bias_per_dim = get_sample_avg_bias_per_dim(df)

avg_bias_per_dim_data = {
    'Input': {
        'data': sample_avg_bias_per_dim,
        'color': PLOTLY_DEFAULT_COLORS[0]
    },
    'Random': {
        'data': rand_avg_bias_per_dim,
        'color': PLOTLY_DEFAULT_COLORS[1]
    }
}

sample_avg_bias = get_avg_bias(sample_avg_bias_per_dim)
rand_avg_bias = get_avg_bias(rand_avg_bias_per_dim)

avg_bias_data = {
    'Input': {
        'data': sample_avg_bias,
        'color': PLOTLY_DEFAULT_COLORS[0]
    },
    'Random': {
        'data': rand_avg_bias,
        'color': PLOTLY_DEFAULT_COLORS[1]
    }
}


bar_data = prepare_bar_data(avg_bias_data)
bar_trace = get_bar_trace(*bar_data)

In [18]:
fig_config = {
    'title': "Average Bias per sample",
    'xaxis_title': "Sample",
    'yaxis_title': "Average Bias"
}
plot_bar(bar_trace, fig_config)

In [19]:
# Radar plot
plot_radar(avg_bias_per_dim_data)

## 4 - Average Bias per Dimension

In [20]:
bar_trace = go.Bar(
    x = sample_avg_bias_per_dim.sort_values().values,
    y = sample_avg_bias_per_dim.sort_values().index,
    orientation = 'h'
)

fig_config = {
    'title': 'Avg Bias per Dimension',
    'xaxis_title': 'Avg Bias',
    'yaxis_title': 'Dimension'
}
plot_bar(bar_trace, fig_config)

## 5 - Scatterplot of Number of probes vs Avg Bias per measurement

In [21]:


scatter_data = pd.concat([df['num_probes'], df[BIAS_DIMENSIONS].mean(axis = 1)], axis = 1).rename(columns = {0: 'avg_meas_bias'}).sort_values(by = 'num_probes').reset_index()
scatter_data['hovertext'] = 'Meas ID: ' + scatter_data['meas_id'].astype(str)


scatter_trace = go.Scatter(
    x = scatter_data['num_probes'],
    y = scatter_data['avg_meas_bias'],
    text = scatter_data['hovertext']
)

fig = go.Figure(scatter_trace)

fig.update_layout(
    title_text = 'Number of probes vs Average bias for each measurement',
    xaxis_title = 'Number of probes',
    yaxis_title = 'Average bias',
    width = 800,
    height = 500
)

fig.update_xaxes(type="log")

fig.show()

## 6- CDF of Bias per Bias dimension

In [22]:
from statsmodels.distributions.empirical_distribution import ECDF

In [23]:
def get_cdf(arr, showlegend = False):
    """
    Returns cdf plot of arr
    """
    
    cdf = ECDF(arr)

    trace = go.Scatter(
        x = cdf.x,
        y = cdf.y,
        mode = 'lines',
        line = dict(color = PLOTLY_DEFAULT_COLORS[0]),
        name = 'Sample',
        showlegend=showlegend
    )

    return trace

def get_ripe_atlas_bias_num_probes_vert_line(col, showlegend):
    
    num_probes_plot = [10, 100, 1000]
    ripe_atlas_num_probes_traces = []
    
    for j in range(len(num_probes_plot)):
        # Get number of probes value
        num_probes = num_probes_plot[j]
        # Create a mask for keeping only the current number of probes
        num_probes_mask = RIPE_ATLAS_BIAS_NUMPROBES_DF['num_probes'] == num_probes
        # df with 1 line (based on previous mask) and num_probes + bias dimensions as columns
        ripe_atlas_vertical_lines = RIPE_ATLAS_BIAS_NUMPROBES_DF[num_probes_mask]
        # Keep only the bias for the column currently being calculated
        x = ripe_atlas_vertical_lines[col].tolist()[0]
        # Get vertical line trace
        ripe_atlas_num_probes_vert_trace = go.Scatter(
                x=[x, x],
                y=[0, 1],
                mode='lines',
                name=f'RIPE Atlas {num_probes} probes',
                line = dict(color = PLOTLY_DEFAULT_COLORS[2 + j]),
                showlegend=showlegend
            )
        ripe_atlas_num_probes_traces.append(ripe_atlas_num_probes_vert_trace)

    return ripe_atlas_num_probes_traces

def get_cdf_traces(data_df):
    all_traces = []
    data_cols = list(data_df.columns)
    # Get plot data
    for i in range(data_df.shape[1]):
        trace_dict = {}
        plot_data = data_df.iloc[:, i].reset_index(drop = True)
        if i == 0:
            showlegend = True
        else:
            showlegend = False

        # Get cdf trace
        cdf_trace = get_cdf(plot_data, showlegend = showlegend)

        trace_dict['title'] = data_cols[i]
        trace_dict['traces'] = cdf_trace
        all_traces.append(trace_dict)
    
    return all_traces

def create_plot_grid(all_traces, fig_title, rows = 5, cols = 5, xrange = [0,1], yrange = [0,1], one_plot_per_subplot = True):

    # Create subplots with the specified grid layout
    fig = make_subplots(rows = rows, cols = cols,
                        subplot_titles = [d['title'] for d in all_traces])

    # Iterate through each row and column index to add histograms to the subplots
    for i in range(rows):
        for j in range(cols):
            row = i + 1
            col = j + 1
            index = i * cols + j
            if index < len(all_traces):
                # Get plot data
                
                traces = all_traces[index]['traces']
                if one_plot_per_subplot:
                    fig.append_trace(traces, row = row, col = col)
                else:
                    for trace in traces:
                        fig.append_trace(trace, row = row, col = col)
                fig.update_xaxes(range = xrange, row = row, col = col)
                fig.update_yaxes(range = yrange, row = row, col = col)

    # Update the annotations (subplot titles) font size
    title_font_size = 12  # Adjust the font size as desired
    for i in range(rows * cols):
        if i < len(all_traces):
            fig.update_annotations(font_size = title_font_size, 
                                   selector = dict(text=all_traces[i]['title'])
                                )
    # Update the layout
    fig.update_layout(
        title_text = fig_title,
        width = 1200,
        height = 1200
    )

    return fig
    

In [24]:
# Keep only bias dimensions
bias_df = df[BIAS_DIMENSIONS]
# Create a 5x5 grid of histograms
cdf_traces = get_cdf_traces(bias_df)
figure_title = "Bias Distribution across each Bias Dimension"
fig = create_plot_grid(cdf_traces, figure_title, one_plot_per_subplot=True)

# Show the plot
fig.show()

## Bias Causes

How should I do this? Should I aggregate all the bias causes of the input measurement set, or create a bias causes heatmap for each measurement? Or both? 

In [31]:
def get_bias_causes(asn_list):
    """
    Input:
        - asn_list (list of <int>): List of ASNs.

    Output:
        - bias_causes (dict): Contains bias causes for the bias corresponding to the asn_list sample.

    This function uses an ai4netmon endpoint that returns the bias causes for the sample of ASNs that corresponds to
    the asn_list input. More specifically, the response is the following form:
        bias_causes = {
            'Custom list': {
                '<bias_dim_1>': {
                    '<bin_11>': 'x11%', '<bin_12>': 'x12%',...
                },
                '<bias_dim_2>': {
                    '<bin_21>': 'x21%', '<bin_22>': 'x22%',...
                },
                ...
            },
            '#ASNs not found': <int>
        }
    where:
        > <bias_dim_i>: is any bias dimension where we have bias for the current sample of ASNs.
        > <bin_ij>: is the j-th bin of the bias dimension <bias_dim_i> where there is a mismatch in the
                    distribution of values between our sample and the set of all ASes in RIPE Atlas.
        > 'xij%' <str>: Percentage difference calculated as <all ASes> - <asn_list>

    For example:
        bias_causes = {
            'Customer cone (#ASNs)': {
                '1.0-3.0': '6.164%',
                '3.0-9.0': '-3.7264%',
                '9.0-26.0': '-1.3839%'
            }
            'Customer cone (#prefixes)': {
                '1.0-4.0': '28.6553%',
                '4.0-14.0': '-17.5897%'
            }
        }

    Here, this tells us that in the 'Customer cone (#ASNs)' dimension, for the bin values 1-3 there is a 6% difference
    between all ASes and our sample (asn_list), i.e. our sample has 6% less values in that bin for that dimension
    compared to all ASes. For the bin 3-9, where the value is negative, the interpretation is similar: our sample has
    3.7% more values in that bin compared to all ASes. Therefore, is the user wants to make the measurement containing
    these ASNs less biased, they should add 6% more values in the 1-3 bin and 3% less values in the 3-9 bin for the
    'Customer cone (#ASNs)' dimension.
    """
    base_url = 'https://ai4netmon.csd.auth.gr/api/bias/cause/asn/'
    asn_list_str = '?asn=' + '&asn='.join([str(asn) for asn in asn_list])
    url = base_url + asn_list_str
    bias_causes = requests.get(url).json()
    return bias_causes

In [35]:
for meas_id in input_meas_ids:
    print(f'Processing measurement with id {meas_id}.')
    meas_asns = asns_df[asns_df['meas_id'] == meas_id].loc[:, 'asn'].tolist()
    print(meas_asns)
    print(f"{meas_id}: Getting measurement's bias causes...")
    meas_bias_causes_start = time.time()
    meas_bias_causes = get_bias_causes(meas_asns)
    meas_bias_causes_end = time.time() - meas_bias_causes_start
    print(f'{meas_id}: Got bias causes: {meas_bias_causes}')
    print(f'{meas_id}: Time to get bias causes: {:.3fmeas_bias_causes_end}')
    print(meas_bias_causes)
    print('-'*100)

SyntaxError: f-string: empty expression not allowed (867864361.py, line 10)

## Open Issues

Open issues:

1) We utilize code that is not mine, how should that be handled?

2) The `PROBES_DF` dataframe can be loaded or created, saved and then used. How should that be handled?

3) Same as (3) for `ASN_AGG_DF`.

4) For each measurement, we bring only ASNs of type that correspond to the measurement's address family (`af`). We ignore measurements that:
    * Have no probes.
    * Have probes but no ASNs.
    * Have ASNs that are not in `ASN_AGG_DF`.

5) Refactor, refactor refactor.

6) Docstrings.