In [1]:
# Enables figures loading outside of browser.
# If not run, figures will load inline.
%matplotlib

import os
import math
import pandas as pd
import numpy as np
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import matplotlib.ticker
import datetime
import collections
import itertools

from scipy import stats

# Depends on: pip install scikit-learn
from sklearn.model_selection import train_test_split

# Some matplotlib features are version dependent.
# Compare numeric components: comparing the raw strings is lexicographic and
# would rank '10.0' below '2.1.2'. Non-numeric suffixes such as 'rc1' are
# ignored, and a component without leading digits counts as 0.
_mpl_version = tuple(
    int(''.join(itertools.takewhile(lambda ch: ch.isdigit(), part)) or 0)
    for part in matplotlib.__version__.split('.')[:3])
assert _mpl_version >= (2, 1, 2), 'matplotlib >= 2.1.2 is required'

# Depends on: pip install --upgrade google-cloud-bigquery
from google.cloud import bigquery

def run_query(query, project='mlab-sandbox'):
    """Runs a BigQuery query and returns the rows as a pandas DataFrame.

    Args:
        query: str, the query text to submit.
        project: str, project passed to bigquery.Client.

    Returns:
        pandas.DataFrame with one column per row key and one entry per result
        row. Waiting for the result is bounded by a 300 second timeout.
    """
    job = bigquery.Client(project=project).query(query)

    # Accumulate values column-wise: key -> list of values, in row order.
    columns = collections.defaultdict(list)
    for record in job.result(timeout=300):
        for name in record.keys():
            columns[name].append(record.get(name))

    return pd.DataFrame(columns)

def unlog(x, pos):
    """Tick formatter that turns a log10 axis position back into its value.

    Args:
        x: number, tick position in log10 units.
        pos: tick index supplied by matplotlib's FuncFormatter; unused.

    Returns:
        str, 10**x rendered as an integer when it has no fractional part and
        with one decimal place otherwise.
    """
    value = math.pow(10, x)
    fractional, integral = math.modf(value)
    return '%.1f' % value if fractional > 0 else '%d' % integral
    
    
def hist(vals, bin_count, log=True, cdf=False):
    """Produces hist or cdf values for smooth plots.

    Args:
        vals: iterable of numbers to bin. When `log` is true every value must
            be positive, because math.log10 raises ValueError otherwise.
        bin_count: passed to numpy.histogram as its `bins` argument.
        log: bool, bin the base-10 logarithm of the values instead of the
            values themselves.
        cdf: bool, return cumulative fractions instead of the density.

    Returns:
        (tops, bins): `bins` is the array of bin edges from numpy.histogram
        (one more entry than `tops`). `tops` is the per-bin density as a numpy
        array, or, when `cdf` is true, a list of the running sum of the
        densities divided by their total.
    """
    if log:
        r = [math.log10(x) for x in vals]
    else:
        r = vals

    # `density=True` replaces the `normed=True` keyword, which current NumPy
    # releases no longer accept.
    m, bins = np.histogram(r, bin_count, density=True)
    m = m.astype(float)

    if not cdf:
        return m, bins

    running = np.cumsum(m)
    # The last running sum is the total, so the final fraction is total/total.
    total = running[-1]
    return [float(t) / total for t in running], bins


logFormatter = matplotlib.ticker.FuncFormatter(unlog)

Using matplotlib backend: TkAgg




In [34]:
def plot_scatter(
    df, xname, yname,
    fig_by='', axes_by='', group_by='',
    figsize=(6,8), axes=(1,1),
    xlabel='', ylabel='',
    xlim=(), ylim=(),
    fx=list, fy=list,
    ylog=False, suptitle='', title='', legend={}):
    """Draws scatter plots of two columns, split into figures, panels and series.

    One figure is created per distinct value of `fig_by`, one panel (axes) per
    distinct value of `axes_by` within a figure, and one scatter series per
    distinct value of `group_by` within a panel. An empty column name disables
    that level of splitting and uses the single placeholder value 'default'.

    The `title`, `xlabel` and `ylabel` templates are expanded with
    str.format(figure=..., axis=..., group=...), where `group` is the last
    group plotted in the panel; `suptitle` is expanded with
    str.format(figure=...).

    Args:
        df: pandas.DataFrame, source data.
        xname: str, name of column to use as x-axis.
        yname: str, name of column to use as y-axis.
        fig_by: str, name of column to split data into multiple figures.
        axes_by: str, name of column to split a figure into panels.
        group_by: str, name of column to split a panel into labelled series.
        figsize: (int, int), dimensions of figure.
        axes: (int, int), rows and columns of panels within a figure. Panels
            are filled in row-major order; panels beyond rows * columns are
            skipped with a printed message.
        xlabel: str, x-axis label template.
        ylabel: str, y-axis label template.
        xlim: (xmin, xmax), applied when non-empty.
        ylim: (ymin, ymax), applied when non-empty.
        fx: func, applied to the x column of each series before plotting.
        fy: func, applied to the y column of each series before plotting.
        ylog: bool, use a logarithmic y-axis.
        suptitle: str, figure title template.
        title: str, panel title template.
        legend: dict, extra keyword arguments for Axes.legend.
    """
    for f in sorted(set(['default'] if not fig_by else df[fig_by])):
        fig = plt.figure(figsize=figsize)
        ax = fig.subplots(axes[0], axes[1], squeeze=False)
        # Panel positions in row-major order.
        ax_index = list(itertools.product(range(axes[0]), range(axes[1])))

        df_fig = df if f == 'default' else df[df[fig_by] == f]
        for p, a in enumerate(sorted(set(['default'] if not axes_by else df_fig[axes_by]))):
            if p >= len(ax_index):
                # More panels than grid positions: skip instead of raising
                # IndexError, matching plot_hist. The single-argument print
                # form gives the same output on Python 2 and 3.
                print('SKIPPING %s %s %s too few axes positions' % (p, f, a))
                continue

            df_axes = df_fig if a == 'default' else df_fig[df_fig[axes_by] == a]
            i, j = ax_index[p]
            panel = ax[i][j]
            for g in sorted(set(['default'] if not group_by else df_axes[group_by])):
                df_g = df_axes if g == 'default' else df_axes[df_axes[group_by] == g]

                x = fx(df_g[xname])
                y = fy(df_g[yname])

                panel.scatter(x, y, s=1, label=g)

            # Only the bottom row of panels keeps its x tick labels.
            if i != len(ax)-1:
                panel.set_xticklabels([])

            if title:
                panel.set_title(title.format(figure=f, axis=a, group=g))
            if ylabel:
                panel.set_ylabel(ylabel.format(figure=f, axis=a, group=g))
            if xlabel:
                panel.set_xlabel(xlabel.format(figure=f, axis=a, group=g))

            if xlim:
                panel.set_xlim(xlim)
            if ylim:
                panel.set_ylim(ylim)
            panel.tick_params(axis='x', labelrotation=-90)
            panel.grid(color='#dddddd')
            panel.legend(fontsize='x-small', **legend)
            if ylog:
                panel.semilogy()

        # Use the `suptitle` template (previously the panel `title` template
        # was used here and `suptitle` was ignored).
        if suptitle:
            fig.suptitle(suptitle.format(figure=f))
        # Leave room at the top of the figure for the suptitle.
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])

In [3]:
def plot_hist(
    df, cname, bins=None,
    fig_by='', axes_by='', group_by='',
    figsize=(6,8), axes=(1,1),
    xlabel='', ylabel='',
    xlim=(), ylim=(),
    fy=list,
    xlog=False, suptitle='', title='', legend={}, cdf=False):
    """Draws histogram (or CDF) curves of one column, split into figures, panels and series.

    One figure is created per distinct value of `fig_by`, one panel (axes) per
    distinct value of `axes_by` within a figure, and one curve per distinct
    value of `group_by` within a panel. An empty column name disables that
    level of splitting and uses the single placeholder value 'default'.

    The `title`, `xlabel` and `ylabel` templates are expanded with
    str.format(figure=..., axis=..., group=...), where `group` is the last
    group plotted in the panel; `suptitle` is expanded with
    str.format(figure=...).

    Args:
        df: pandas.DataFrame, source data.
        cname: str, name of column to use as data source.
        bins: func or None, called with the values of one series and returning
            the bin count; None uses int(sqrt(number of values)).
        fig_by: str, name of column to split data into multiple figures.
        axes_by: str, name of column to split a figure into panels.
        group_by: str, name of column to split a panel into labelled curves.
        figsize: (int, int), dimensions of figure.
        axes: (int, int), rows and columns of panels within a figure. Panels
            are filled in row-major order; panels beyond rows * columns are
            skipped with a printed message.
        xlabel: str, x-axis label template.
        ylabel: str, y-axis label template.
        xlim: (xmin, xmax), applied when non-empty.
        ylim: (ymin, ymax), applied when non-empty.
        fy: func, accepted but not used by this function.
        xlog: bool, histogram the log10 of the values and label the x-axis
            ticks with the un-logged values.
        suptitle: str, figure title template.
        title: str, panel title template.
        legend: dict, extra keyword arguments for Axes.legend.
        cdf: bool, plot the cumulative distribution instead of the density.
    """
    for f in sorted(set(['default'] if not fig_by else df[fig_by])):
        fig = plt.figure(figsize=figsize)
        ax = fig.subplots(axes[0], axes[1], squeeze=False)
        # Panel positions in row-major order.
        ax_index = list(itertools.product(range(axes[0]), range(axes[1])))

        df_fig = df if f == 'default' else df[df[fig_by] == f]
        for p, a in enumerate(sorted(set(['default'] if not axes_by else df_fig[axes_by]))):

            df_axes = df_fig if a == 'default' else df_fig[df_fig[axes_by] == a]
            if p >= len(ax_index):
                # Single-argument print form: same output on Python 2 and a
                # valid call on Python 3.
                print('SKIPPING %s %s %s too few axes positions' % (p, f, a))
                continue

            i, j = ax_index[p]
            panel = ax[i][j]
            for g in sorted(set(['default'] if not group_by else df_axes[group_by])):
                df_g = df_axes if g == 'default' else df_axes[df_axes[group_by] == g]

                r = df_g[cname]
                if bins is None:
                    size = int(math.sqrt(len(r)))
                else:
                    size = bins(r)
                h_tops, h_bins = hist(r, size, log=xlog , cdf=cdf)
                # Each value is plotted at the left edge of its bin.
                panel.plot(h_bins[:-1], h_tops, label=('%s-%s-%s' % (cname, a, g)))

            # Only the bottom row of panels keeps its x tick labels.
            if i != len(ax)-1:
                panel.set_xticklabels([])

            if title:
                panel.set_title(title.format(figure=f, axis=a, group=g))
            if ylabel:
                panel.set_ylabel(ylabel.format(figure=f, axis=a, group=g))
            if xlabel:
                panel.set_xlabel(xlabel.format(figure=f, axis=a, group=g))

            if xlim:
                panel.set_xlim(xlim)
            if ylim:
                panel.set_ylim(ylim)

            panel.grid(color='#dddddd')
            panel.legend(fontsize='x-small', **legend)
            if xlog:
                panel.xaxis.set_major_formatter(logFormatter)


        if suptitle:
            fig.suptitle(suptitle.format(figure=f))
        # Leave room at the top of the figure for the suptitle.
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])
        

# UPLINK UTILIZATION OVER TIME

In [4]:
# Per-host, per-day approximate quantiles of the `switch.octets.uplink.tx`
# samples for mlab1 hosts at the dfw, lga and nuq sites. The inner query
# de-duplicates samples with a GROUP BY.
#
# The literal is a raw string so the regex `\d` is not treated as a Python
# escape sequence; the text sent to BigQuery is unchanged.
#
# NOTE(review): APPROX_QUANTILES(value, 101) returns 102 boundaries, so
# ORDINAL(50) and ORDINAL(90) appear to sit slightly below the 50th and 90th
# percentiles -- confirm whether APPROX_QUANTILES(value, 100)[OFFSET(50)]
# (and OFFSET(90)) was intended.
df_disco_pct = run_query(r"""
#standardSQL
SELECT

  REGEXP_EXTRACT(hostname, r'mlab1.([a-z]{3})[0-9]{2}.*') AS metro,
  REGEXP_EXTRACT(hostname, r'mlab1.([a-z]{3}[0-9]{2}).*') AS site,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,  
  APPROX_QUANTILES(value, 101)[ORDINAL(50)] as bytes_50th,
  APPROX_QUANTILES(value, 101)[ORDINAL(90)] as bytes_90th

FROM (
  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab1.[a-z]{3}[0-9]{2}).*') AS hostname,
    sample.timestamp AS sts,
    sample.value AS value
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
    metric LIKE 'switch.octets.uplink.tx'
    AND REGEXP_CONTAINS(hostname, r"mlab1.(dfw|lga|nuq)\d\d")
  GROUP BY
    hostname, metric, sts, value
)
WHERE
  hostname IS NOT NULL
GROUP BY
  hostname, day, ts
ORDER BY
  hostname, day, ts
""")

In [35]:
# Scatter of the per-day `bytes_50th` column: one panel per metro, one series
# per site, on a logarithmic y-axis.
plot_scatter(
    df_disco_pct,
    xname='ts',
    yname='bytes_50th',
    axes_by='metro',
    group_by='site',
    axes=(3, 1),
    # `ts` holds UNIX seconds; convert each one to a timestamp for the x-axis.
    fx=lambda seconds: [pd.to_datetime(value, unit='s') for value in seconds],
    xlim=(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01")),
    ylim=(1e4, 1e9),
    ylog=True,
    ylabel="Median Uplink {axis}",
    suptitle='Daily Median Uplink Utilization',
    legend=dict(loc=3, ncol=7, columnspacing=1))

# Packets Over Time

In [6]:
# Per-host, per-day sum of the `switch.unicast.uplink.tx` samples for mlab1
# hosts at the dfw, lga and nuq sites, after de-duplicating samples with a
# GROUP BY.
#
# The literal is a raw string so the regex `\d` is not treated as a Python
# escape sequence; the text sent to BigQuery is unchanged.
df_disco_packets = run_query(r"""
#standardSQL

WITH measurementlab_switch_dedup AS (

  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS hostname,
    sample.timestamp AS sts,
    sample.value AS value
    
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
    
  WHERE
    metric LIKE 'switch.unicast.uplink.tx'
    AND REGEXP_CONTAINS(hostname, r"mlab1.(dfw|lga|nuq)\d\d")
    
  GROUP BY
    hostname, metric, sts, value
)

SELECT
  REGEXP_EXTRACT(hostname, r'mlab1.([a-z]{3})[0-9]{2}.*') AS metro,
  REGEXP_EXTRACT(hostname, r'mlab1.([a-z]{3}[0-9]{2}).*') AS site,
  hostname,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,
  SUM(value) AS total
  
FROM
  measurementlab_switch_dedup
  
WHERE
  hostname IS NOT NULL
  
GROUP BY
  hostname, day, ts
  
ORDER BY
  hostname, day, ts
""")

In [7]:
# Scatter of the per-day `total` column: one panel per metro, one series per
# site, on a logarithmic y-axis.
plot_scatter(
    df_disco_packets,
    xname='ts',
    yname='total',
    axes_by='metro',
    group_by='site',
    axes=(3, 1),
    # `ts` holds UNIX seconds; convert each one to a timestamp for the x-axis.
    fx=lambda seconds: [pd.to_datetime(value, unit='s') for value in seconds],
    xlim=(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01")),
    ylim=(1e7, 1e10),
    ylog=True,
    ylabel="Total Packets {axis}",
    suptitle='Daily Packets',
    legend=dict(loc=3, ncol=7, columnspacing=1))

dfw
lga
nuq


# DISCARDS OVER TIME

In [8]:
# Per-host, per-day sum of the `switch.discards.uplink.tx` samples for mlab1
# hosts at the dfw, lga and nuq sites, after de-duplicating samples with a
# GROUP BY.
#
# The literal is a raw string so the regex `\d` is not treated as a Python
# escape sequence; the text sent to BigQuery is unchanged.
df_disco = run_query(r"""
#standardSQL

WITH measurementlab_switch_dedup AS (

  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS hostname,
    sample.timestamp AS sts,
    sample.value AS value
    
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
    
  WHERE
    metric LIKE 'switch.discards.uplink.tx'
    AND REGEXP_CONTAINS(hostname, r"mlab1.(dfw|lga|nuq)\d\d")
    
  GROUP BY
    hostname, metric, sts, value
)

SELECT
  REGEXP_EXTRACT(hostname, r'mlab1.([a-z]{3})[0-9]{2}.*') AS metro,
  REGEXP_EXTRACT(hostname, r'mlab1.([a-z]{3}[0-9]{2}).*') AS site,
  hostname,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,
  SUM(value) AS total_discards
  
FROM
  measurementlab_switch_dedup
  
WHERE
  hostname IS NOT NULL
  
GROUP BY
  hostname, day, ts
  
ORDER BY
  hostname, day, ts
""")

In [9]:
# Scatter of the per-day `total_discards` column: one panel per metro, one
# series per site, on a logarithmic y-axis.
plot_scatter(
    df_disco,
    xname='ts',
    yname='total_discards',
    axes_by='metro',
    group_by='site',
    axes=(3, 1),
    # `ts` holds UNIX seconds; convert each one to a timestamp for the x-axis.
    fx=lambda seconds: [pd.to_datetime(value, unit='s') for value in seconds],
    xlim=(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01")),
    ylim=(1e2, 1e6),
    ylog=True,
    ylabel="Total Discards {axis}",
    suptitle='Daily Packet Discards',
    legend=dict(loc=2))

dfw
lga
nuq


# Daily DISCO discard ratios

In [10]:
# Daily ratio of uplink discards to uplink unicast packets for each mlab1
# host at the lga, dfw and nuq sites. Samples are de-duplicated with a
# GROUP BY and summed per host and day; the HAVING clauses keep only days
# where discards < total and the resulting ratio is below 0.01.
df_disco_ratio = run_query("""
WITH measurementlab_switch_dedup AS (
  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS hostname,
    sample.timestamp AS sts,
    sample.value AS value
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
    (metric LIKE 'switch.discards.uplink.tx' OR metric LIKE 'switch.unicast.uplink.tx')
    AND (hostname LIKE '%mlab1.lga%' OR hostname LIKE '%mlab1.dfw%' OR hostname LIKE '%mlab1.nuq%')
  GROUP BY
    hostname, metric, sts, value
)

SELECT
  REGEXP_EXTRACT(hostname, r'mlab1.([a-z]{3})[0-9]{2}.*') AS metro,
  REGEXP_EXTRACT(hostname, r'mlab1.([a-z]{3}[0-9]{2}).*') AS site,
  hostname,
  day,
  ts,
  IF(total > 0, discards / total, 0) as ratio
FROM (
SELECT
  hostname,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,
  SUM(IF(metric = "switch.discards.uplink.tx", value, 0)) AS discards,
  SUM(IF(metric = "switch.unicast.uplink.tx", value, 0)) AS total
FROM
  measurementlab_switch_dedup
WHERE
  hostname IS NOT NULL
GROUP BY
  hostname, day, ts
HAVING
  discards < total
ORDER BY
  hostname, day, ts
)
GROUP BY
  hostname, day, ts, ratio
HAVING
  ratio < 0.01
ORDER BY
  hostname, day, ts
""")

In [36]:
# Scatter of the per-day discard `ratio` column: one panel per metro, one
# series per site, on a logarithmic y-axis.
plot_scatter(
    df_disco_ratio,
    xname='ts',
    yname='ratio',
    axes_by='metro',
    group_by='site',
    axes=(3, 1),
    # `ts` holds UNIX seconds; convert each one to a timestamp for the x-axis.
    fx=lambda seconds: [pd.to_datetime(value, unit='s') for value in seconds],
    xlim=(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01")),
    ylim=(1e-6, 1e-1),
    ylog=True,
    ylabel="Discard Ratio {axis}",
    suptitle='Daily Packet Loss Ratio',
    legend=dict(loc=2))

# NDT Median Download Rates

In [12]:
# Per-server, per-day approximate median of `download_mbps` for NDT tests
# against mlab1 servers at the dfw, lga and nuq sites. `download_mbps` is
# 8 * HCThruOctetsAcked divided by the summed SndLimTime* counters; five
# specific remote_ip addresses are excluded and rows are de-duplicated with a
# GROUP BY.
#
# The literal is a raw string so the regex `\d` is not treated as a Python
# escape sequence; the text sent to BigQuery is unchanged.
df_ndt_median = run_query(r"""
WITH mlab_ndt AS (
  SELECT
    connection_spec.server_hostname as server_hostname,
    log_time,
    (8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd))) AS download_mbps

  FROM
  
    `measurement-lab.base_tables.ndt*`
  WHERE

        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw|lga|nuq)\d\d")
    AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
    AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
    AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
    AND connection_spec.data_direction = 1
    AND web100_log_entry.connection_spec.remote_ip != "45.56.98.222"
    AND web100_log_entry.connection_spec.remote_ip != "2600:3c03::f03c:91ff:fe33:819"
    AND web100_log_entry.connection_spec.remote_ip != "35.225.75.192"
    AND web100_log_entry.connection_spec.remote_ip != "35.192.37.249"
    AND web100_log_entry.connection_spec.remote_ip != "35.193.254.117"
    
  
  GROUP BY
    server_hostname,
    log_time,
    web100_log_entry.connection_spec.remote_ip,
    web100_log_entry.connection_spec.local_ip,
    web100_log_entry.connection_spec.remote_port,
    web100_log_entry.connection_spec.local_port,
    download_mbps
)
    
SELECT
  REGEXP_EXTRACT(server_hostname, r'mlab1.([a-z]{3})[0-9]{2}.*') AS metro,
  REGEXP_EXTRACT(server_hostname, r'mlab1.([a-z]{3}[0-9]{2}).*') AS site,
  REGEXP_EXTRACT(server_hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS hostname,
  TIMESTAMP_TRUNC(log_time, DAY) as day,
  APPROX_QUANTILES(download_mbps, 101)[ORDINAL(50)] as download_mbps
  
FROM
  mlab_ndt

GROUP BY
  day,
  server_hostname

ORDER BY
  day
""")

In [13]:
# Scatter of the per-day `download_mbps` column: one panel per metro, one
# series per site, on a logarithmic y-axis.
plot_scatter(
    df_ndt_median,
    xname='day',
    yname='download_mbps',
    axes_by='metro',
    group_by='site',
    axes=(3, 1),
    fx=lambda days: [pd.to_datetime(value) for value in days],
    xlim=(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01")),
    ylim=(1, 100),
    ylog=True,
    ylabel="Mbps {axis}",
    suptitle='Median NDT Download Rates',
    legend=dict(loc=2))

dfw
lga
nuq


# NDT Segs Retrans

In [14]:
# NOT ENOUGH HISTORICAL NDT DATA TO GET FULL TIMELINE.
#
# This query is deliberately disabled by the `if False:` guard, so
# `df_ndt_retrans` is not assigned here. The literal is a raw string so the
# regex `\d` is not treated as a Python escape sequence; the query text is
# unchanged.

if False:
    df_ndt_retrans = run_query(r"""
#standardSQL

SELECT
  TIMESTAMP_TRUNC(log_time, DAY) AS day,
  REGEXP_EXTRACT(connection_spec.server_hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS hostname,
  SAFE_DIVIDE(SUM(web100_log_entry.snap.SegsRetrans), SUM(web100_log_entry.snap.SegsOut)) AS median_ratio
FROM
  `measurement-lab.base_tables.ndt*`
WHERE
  web100_log_entry.snap.HCThruOctetsAcked >= 1000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw|lga|nuq|ord|atl|lax)\d\d")
  AND connection_spec.data_direction = 1
  AND log_time >= TIMESTAMP("2016-06-01")
GROUP BY
  day,
  hostname
HAVING
  median_ratio is not NULL
ORDER BY
  day
""")

In [15]:
# Per-server, per-day approximate median, taken across clients, of each
# client's (remote_ip) maximum SegsRetrans / SegsOut ratio, for NDT tests
# since 2016-06-01 against servers whose hostname matches (lga|dfw|nuq)NN.
# Five specific remote_ip addresses are excluded and rows are de-duplicated
# with a GROUP BY.
#
# The literal is a raw string so the regex `\d` is not treated as a Python
# escape sequence; the text sent to BigQuery is unchanged.
df_ndt_retrans = run_query(r"""
WITH mlab_ndt AS (
  SELECT
    connection_spec.server_hostname as hostname,
    web100_log_entry.connection_spec.remote_ip as remote_ip,
    log_time,
    web100_log_entry.snap.SegsRetrans as SegsRetrans,
    web100_log_entry.snap.SegsOut as SegsOut

  FROM
    `measurement-lab.base_tables.ndt*`

  WHERE
        REGEXP_CONTAINS(connection_spec.server_hostname, r"(lga|dfw|nuq)\d\d")
    AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
    AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
    AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
    AND connection_spec.data_direction = 1
    AND web100_log_entry.connection_spec.remote_ip != "45.56.98.222"
    AND web100_log_entry.connection_spec.remote_ip != "2600:3c03::f03c:91ff:fe33:819"
    AND web100_log_entry.connection_spec.remote_ip != "35.225.75.192"
    AND web100_log_entry.connection_spec.remote_ip != "35.192.37.249"
    AND web100_log_entry.connection_spec.remote_ip != "35.193.254.117"
    AND log_time >= TIMESTAMP("2016-06-01")
  
  GROUP BY
    connection_spec.server_hostname,
    log_time,
    web100_log_entry.connection_spec.remote_ip,
    web100_log_entry.connection_spec.local_ip,
    web100_log_entry.connection_spec.remote_port,
    web100_log_entry.connection_spec.local_port,
    SegsRetrans,
    SegsOut
)
    
SELECT
  REGEXP_EXTRACT(hostname, r"([a-z]{3})[0-9]{2}") as metro,
  REGEXP_EXTRACT(hostname, r"([a-z]{3}[0-9]{2})") as site,
  day,
  APPROX_QUANTILES(ratio, 101)[ORDINAL(50)] AS median_ratio,
  count(*) as count
FROM
(
  SELECT
    hostname,
    TIMESTAMP_TRUNC(log_time, DAY) as day,
    MAX(SAFE_DIVIDE(SegsRetrans, SegsOut)) as ratio

  FROM
    mlab_ndt

  GROUP BY
    hostname,
    day,
    remote_ip
)

GROUP BY
  hostname, day

ORDER BY
  day
""")

In [16]:
# Scatter of the per-day `median_ratio` column: one panel per metro, one
# series per site, on a logarithmic y-axis.
plot_scatter(
    df_ndt_retrans,
    xname='day',
    yname='median_ratio',
    axes_by='metro',
    group_by='site',
    axes=(3, 1),
    fx=lambda days: [pd.to_datetime(value) for value in days],
    xlim=(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01")),
    ylim=(1e-6, 1e-1),
    ylog=True,
    ylabel="Retransmission {axis}",
    suptitle='Median NDT SegsRetran/SegsOut Ratio Rates',
    legend=dict(loc=2))

dfw
lga
nuq


In [17]:
def box(x, y, text):
    """Writes `text` at (x, y) with plt.text, framed by a rounded box.

    The box edge and fill colours are RGBA tuples with an alpha of 0.25.
    """
    frame = {
        'boxstyle': "round",
        'ec': (.5, 0.5, 1., 0.25),
        'fc': (.5, 0.8, 1., 0.25),
    }
    plt.text(x, y, text, bbox=frame)

In [18]:
## COMBINED SegsRetrans & Switch Discards

# Grid of metro names: one row of panels per metro, a single column.
sites = [
    ['dfw'],
    ['lga'],
    ['nuq'],
]

# Axes handles with the same shape as `sites`; filled in while plotting.
axes = [[None] * len(site_row) for site_row in sites]

# Single-argument print form: same output on Python 2, valid on Python 3.
print(len(df_ndt_retrans))

prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

fig = plt.figure(figsize=(6, 8))

# First pass: create each panel and plot the NDT retransmission ratios.
for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        axes[i][j] = plt.subplot2grid((len(sites), len(site_row)), (i, j))
        axes[i][j].set_ylabel('Ratio ' + site.upper())
        # Only the bottom row keeps its x tick labels.
        if i != len(sites)-1:
            axes[i][j].set_xticklabels([])

        c = 0
        for s in sorted(set(df_ndt_retrans['site'])):
            if site in s:
                ds = df_ndt_retrans[ (df_ndt_retrans['site'] == s) ]
                d = [pd.to_datetime(t) for t in ds['day']]
                # Wrap around the colour cycle rather than indexing past it.
                axes[i][j].scatter(d, ds['median_ratio'], s=1, label=s, c=colors[c % len(colors)])
                c += 1

        axes[i][j].set_ylim(1e-6, 1e-1)
        axes[i][j].set_xlim(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01"))
        axes[i][j].tick_params(axis='x', labelrotation=-90)
        axes[i][j].grid(color='#dddddd')
        # The legend is drawn before the discard series are added, so it
        # lists each site once.
        axes[i][j].legend(loc=2, fontsize='x-small')
        axes[i][j].semilogy()

# Second pass: overlay the switch discard ratios on the same panels.
for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):

        if i != len(sites)-1:
            axes[i][j].set_xticklabels([])
        c = 0
        # Sorted so that hosts take colours in the same order as the sorted
        # sites in the first pass; an unsorted set gives an arbitrary order.
        for h in sorted(set(df_disco_ratio['hostname'])):
            if ('mlab1.' + site) in h:
                ds = df_disco_ratio[ (df_disco_ratio['hostname'] == h) ]
                d = [pd.to_datetime(t, unit='s') for t in ds['ts']]
                axes[i][j].scatter(d, ds['ratio'], s=1, label=h[6:11], c=colors[c % len(colors)])
                c += 1

# box() draws with plt.text, i.e. on the current axes.
box(pd.to_datetime("2016-10-30"), 5e-3, u"Segs Retransmit ↘")
box(pd.to_datetime("2016-10-30"), 9e-6, u"Switch Discards ↗")

fig.suptitle('Retrans & Switch Discard Rates')
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

11762


# Two Week Performance Distributions -- Before & After

In [19]:
# Individual `switch.octets.uplink.tx` samples for mlab1.lga03 and
# mlab1.dfw02, limited to two date windows that the query labels 'before-2w'
# and 'after-2w'. `bps` is computed as 8 * value / 10.
df_disco_dist = run_query("""
SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
    CASE
        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25") THEN 'before-2w'
        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-03-11") AND TIMESTAMP("2018-03-25") THEN 'after-2w'
        ELSE 'what'
    END as period,
    TIMESTAMP_TRUNC(sample.timestamp, DAY) AS day,
    UNIX_SECONDS(sample.timestamp) AS sts,
    8 * sample.value / 10 AS bps
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
       metric LIKE 'switch.octets.uplink.tx'
    AND (
        hostname LIKE '%mlab1.lga03%' OR hostname LIKE '%mlab1.dfw02%'
    )
    AND ( -- One week - Sunday to Saturday.
          TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-02-11") and TIMESTAMP("2018-02-25")
       OR TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-03-11") and TIMESTAMP("2018-03-25")
    )
  GROUP BY
    hostname, metric, sample.timestamp, bps
  ORDER BY
    hostname, sts
""")

# Single-argument print form: same output on Python 2, valid on Python 3.
print(df_disco_dist.keys())
print(len(df_disco_dist))


# One figure per host, overlaying a density histogram of log10(bps) for each
# period. Hosts are sorted so the figures appear in a reproducible order.
for h in sorted(set(df_disco_dist['name'])):
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
    for day in sorted(set(df_disco_dist['period'])):
        ds = df_disco_dist[ (df_disco_dist['name'] == h) & (df_disco_dist['period'] == day) ]
        if not len(ds):
            # No samples for this host in this period: the bin count below
            # would be zero, which the histogram call rejects.
            continue
        r = [math.log10(x) for x in ds['bps']]

        axes.hist(r, # len(ds['bps']),
                      int(1.5 * math.sqrt(len(ds['bps']))),
                      histtype='step',
                      density=1, # cumulative=True,
                      label='cdf-'+h[6:11] + '-' + str(day), ls='-')

    # h[6:11] is the five characters after a six-character prefix such as 'mlab1.'.
    axes.set_title(h[6:11])
    #axes.tick_params(axis='x', labelrotation=90)
    axes.grid(color='#dddddd')
    axes.legend(loc=2, fontsize='x-small')
    axes.xaxis.set_major_formatter(logFormatter)
    fig.subplots_adjust(hspace=0.3, wspace=0.4)
    fig.suptitle('Uplink Utilization Distribution')

In [20]:
# Density curves of `bps` on a log10 x-axis: one figure per host name, one
# curve per period.
plot_hist(
    df_disco_dist,
    cname='bps',
    fig_by='name',
    group_by='period',
    figsize=(12, 6),
    xlog=True,
    cdf=False,
    title='{figure}',
    suptitle='Uplink Utilization Distribution')

# One Day Performance Distributions -- before & after

With single-day samples it is harder to notice changes between before and after.

In [21]:
# Individual `switch.octets.uplink.tx` samples for mlab1.lga03 and
# mlab1.dfw02 on four single days (2018-02-18, 2018-02-25, 2018-03-04 and
# 2018-03-11). `bps` is computed as 8 * value / 10.
df_disco_dist4 = run_query("""
SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
    TIMESTAMP_TRUNC(sample.timestamp, DAY) AS day,
    UNIX_SECONDS(sample.timestamp) AS sts,
    8 * sample.value / 10 AS bps
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
       metric LIKE 'switch.octets.uplink.tx'
    AND (
        hostname LIKE '%mlab1.lga03%' OR hostname LIKE '%mlab1.dfw02%'
    )
    AND (
        TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-02-18")
     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-02-25")
     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-04")
     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-11")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-04")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-05")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-10")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-01")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-01")
    )
  GROUP BY
    hostname, metric, sample.timestamp, bps
  ORDER BY
    hostname, sts
""")

# Single-argument print form: same output on Python 2, valid on Python 3.
print(df_disco_dist4.keys())
print(len(df_disco_dist4))

# One figure per host, overlaying a density histogram of log10(bps) for each
# day. Hosts are sorted so the figures appear in a reproducible order.
for h in sorted(set(df_disco_dist4['name'])):
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
    for day in sorted(set(df_disco_dist4['day'])):
        ds = df_disco_dist4[ (df_disco_dist4['name'] == h) & (df_disco_dist4['day'] == day) ]
        if not len(ds):
            # No samples for this host on this day: the bin count below
            # would be zero, which the histogram call rejects.
            continue
        r = [math.log10(x) for x in ds['bps']]

        axes.hist(r, # len(ds['bps']),
                      int(1.5 * math.sqrt(len(ds['bps']))),
                      histtype='step',
                      density=1, # cumulative=True,
                      label='cdf-'+h[6:11] + '-' + str(day), ls='-')

    # h[6:11] is the five characters after a six-character prefix such as 'mlab1.'.
    axes.set_title(h[6:11])
    #axes.tick_params(axis='x', labelrotation=90)
    axes.grid(color='#dddddd')
    axes.legend(loc=2, fontsize='x-small')
    axes.xaxis.set_major_formatter(logFormatter)
    fig.subplots_adjust(hspace=0.3, wspace=0.4)
    fig.suptitle('Uplink Utilization Distribution')

In [22]:
# Density curves of `bps` on a log10 x-axis: one figure per host name, one
# curve per day, using 1.5 * sqrt(n) bins for a series of n values.
plot_hist(
    df_disco_dist4,
    cname='bps',
    bins=lambda values: int(1.5 * math.sqrt(len(values))),
    fig_by='name',
    group_by='day',
    figsize=(12, 6),
    xlog=True,
    cdf=False,
    title='{figure}',
    suptitle='Uplink Utilization Distribution')

# NDT test distributions - Before & After

In [23]:
# Maximum `download_mbps` per (server, period, client remote_ip) for NDT
# tests against mlab1 servers at the dfw, lga and nuq sites, restricted to
# clients with more than 10 rows in each of the two date windows that the
# query labels 'before-2w' and 'after-2w'.
#
# The literal is a raw string so the regex `\d` is not treated as a Python
# escape sequence; the text sent to BigQuery is unchanged.
df_ndt_dist = run_query(r"""
WITH mlab_ndt AS (
  SELECT
    connection_spec.server_hostname as server_hostname,
    log_time,
    web100_log_entry.connection_spec.remote_ip AS remote_ip,
    (8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd))) AS download_mbps
  FROM
    `measurement-lab.base_tables.ndt*`
  WHERE

  (    TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25")
    OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") AND TIMESTAMP("2018-03-25"))
  AND REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw|lga|nuq)\d\d")
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND connection_spec.data_direction = 1
  
  GROUP BY
    server_hostname,
    log_time,
    web100_log_entry.connection_spec.remote_ip,
    web100_log_entry.connection_spec.local_ip,
    web100_log_entry.connection_spec.remote_port,
    web100_log_entry.connection_spec.local_port,
    download_mbps)
    
SELECT
  REGEXP_EXTRACT(server_hostname, r'mlab1.([a-z]{3})[0-9]{2}.*') AS metro,
  REGEXP_EXTRACT(server_hostname, r'mlab1.([a-z]{3}[0-9]{2}).*') AS site,
  REGEXP_EXTRACT(server_hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
  CASE
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25") THEN 'before-2w'
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") AND TIMESTAMP("2018-03-25") THEN 'after-2w'
    ELSE 'what'
  END AS period,
  --web100_log_entry.connection_spec.remote_ip AS remote_ip,
  --(8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd))) AS download_mbps
  MAX(download_mbps) AS download_mbps
  --APPROX_QUANTILES(download_mbps, 101)[ORDINAL(50)] as download_mbps

FROM
  mlab_ndt
WHERE
  remote_ip IN(
    SELECT
      remote_ip
    FROM (
      SELECT
        remote_ip, count(*) as c1
      FROM
        mlab_ndt
      WHERE
        TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25")
      GROUP BY
        remote_ip
      HAVING c1 > 10
    ) INNER JOIN (
      SELECT
        remote_ip AS remote_ip, count(*) as c2
      FROM
        mlab_ndt
      WHERE
        TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") AND TIMESTAMP("2018-03-25")
      GROUP BY
        remote_ip
      HAVING c2 > 10
    ) USING (remote_ip)) 
GROUP BY
  server_hostname,
  period,
  remote_ip
  --download_mbps
""")

# Single-argument print form: same output on Python 2, valid on Python 3.
print(df_ndt_dist.keys())
print(len(df_ndt_dist))

def hist(vals, bin_count, log=True, cdf=False):
    """Produces hist or cdf values for smooth plots.

    NOTE: this redefines the helper of the same name from the first cell of
    the notebook; whichever definition ran last is the one in effect.

    Args:
        vals: iterable of numbers to bin. With log=True every value must be
            positive, because math.log10 raises ValueError otherwise.
        bin_count: passed to np.histogram as its `bins` argument.
        log: when True, bin log10(value) instead of the raw value.
        cdf: when True, return the running sum of the bin densities divided
            by its final value, so the last entry is 1.0.

    Returns:
        (tops, bins): `tops` holds one value per bin (an ndarray of densities,
        or a list of cumulative fractions when cdf=True); `bins` holds the
        len(tops) + 1 bin edges returned by np.histogram.
    """
    samples = [math.log10(x) for x in vals] if log else vals

    # `density=True` replaces the deprecated `normed=True` keyword, which
    # newer NumPy releases no longer accept. For the equal-width bins produced
    # by an integer bin count the two give the same values.
    heights, bins = np.histogram(samples, bin_count, density=True)
    heights = heights.astype(float)

    if not cdf:
        return heights, bins

    running = np.cumsum(heights)
    # Dividing by the final running total pins the last CDF point to 1.0.
    tops = (running / running[-1]).tolist()
    return tops, bins

# Grid positions for up to six hosts per figure, filled row by row.
seq = [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)]

# One figure per metro (characters 6:9 of names such as 'mlab1.lga02'),
# one axes per host, and one CDF line per period.
for site in set([v[6:9] for v in set(df_ndt_dist['name'])]):
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 6))
    for p, h in enumerate(sorted([h for h in set(df_ndt_dist['name']) if site in h])):
        # Bin count shared by both periods of this host so the two curves use
        # the same resolution; fixed by the first period that has data.
        before = None

        for day in ['before-2w', 'after-2w']:
            ds = df_ndt_dist[(df_ndt_dist['name'] == h) & (df_ndt_dist['period'] == day)]
            r = ds['download_mbps']
            if not len(r):
                continue

            # Only len(seq) axes exist per figure; extra hosts are reported
            # and left out before any histogram work is done.
            if p > len(seq) - 1:
                print('skipping %s' % h)
                continue

            # When 'before-2w' had no rows, 'after-2w' falls back to its own
            # sqrt(n) bin count instead of passing None to hist().
            if before is None:
                before = int(math.sqrt(len(r)))
            size = before

            tops, bins = hist(r, size, log=True, cdf=True)

            i, j = seq[p]
            ax = axes[i, j]
            # bins has one more edge than tops; plot against the left edges.
            ax.plot(bins[:-1], tops, label='cdf-' + h[6:11] + '-' + str(day))
            ax.set_title(h[6:11])
            # hist() binned log10 of the values, so limits are in log10 units
            # and logFormatter converts tick positions back for display.
            ax.set_xlim(math.log10(.1), math.log10(1000))
            ax.grid(color='#dddddd')
            ax.legend(loc=2, fontsize='x-small')
            ax.xaxis.set_major_formatter(logFormatter)
    fig.subplots_adjust(hspace=0.3, wspace=0.4)
    fig.suptitle('NDT Download Distributions')

In [24]:
# Plot download_mbps distributions through plot_hist with cdf=True and
# xlog=True, split by metro (fig_by), name (axes_by) and period (group_by).
# The lambda maps a sample set to int(sqrt(len)); presumably plot_hist uses it
# as the bin count -- TODO confirm against plot_hist.
plot_hist(
    df_ndt_dist,
    'download_mbps',
    lambda r: int(math.sqrt(len(r))),
    fig_by='metro',
    axes_by='name',
    group_by='period',
    axes=(3, 2),
    figsize=(12, 6),
    xlog=True,
    cdf=True,
    xlim=(math.log10(.1), math.log10(1000)),
    title='{axis}',
    suptitle='NDT Download Distributions',
)

In [25]:
# Per-client spread of NDT download rates in the two comparison windows.
#
# `mlab_ndt` CTE: rows from the NDT base tables for hostnames matching
# mlab1.(dfw|lga|nuq)NN whose day falls in 2018-02-11..2018-02-25 or
# 2018-03-11..2018-03-25, with data_direction = 1 (presumably download tests --
# TODO confirm), HCThruOctetsAcked >= 1000000 and a summed SndLimTime{Rwin,Cwnd,Snd}
# in [9000000, 600000000). download_mbps is 8 * HCThruOctetsAcked divided by
# that summed time. The GROUP BY has no aggregates, so it yields one row per
# distinct (hostname, log_time, remote/local ip, remote/local port, rate).
#
# Outer query: keeps only remote_ips with more than 5 rows in *each* window
# (inner join of the two per-window counts) and returns STDDEV(download_mbps)
# per (server_hostname, period, remote_ip), dropping groups whose stddev is NULL.
# The CASE's 'what' branch cannot match rows that passed the CTE's date filter.
#
# NOTE(review): BETWEEN is inclusive on both ends, so each window labelled
# '2w' spans 15 calendar days -- confirm this is intended.
# NOTE(review): the CTE admits only mlab1 hosts while the outer regexes accept
# mlab[1-4]; the wider pattern has no effect here.
df_ndt_variance = run_query("""
WITH mlab_ndt AS (
  SELECT
    connection_spec.server_hostname as server_hostname,
    log_time,
    web100_log_entry.connection_spec.remote_ip AS remote_ip,
    (8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd))) AS download_mbps
  FROM
    `measurement-lab.base_tables.ndt*`
  WHERE

  (    TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25")
    OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") AND TIMESTAMP("2018-03-25"))
  AND REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw|lga|nuq)\d\d")
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND connection_spec.data_direction = 1
  
  GROUP BY
    server_hostname,
    log_time,
    web100_log_entry.connection_spec.remote_ip,
    web100_log_entry.connection_spec.local_ip,
    web100_log_entry.connection_spec.remote_port,
    web100_log_entry.connection_spec.local_port,
    download_mbps)


SELECT
  REGEXP_EXTRACT(server_hostname, r'mlab[1-4].([a-z]{3})[0-9]{2}.*') AS metro,
  REGEXP_EXTRACT(server_hostname, r'mlab[1-4].([a-z]{3}[0-9]{2}).*') AS site,
  REGEXP_EXTRACT(server_hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS hostname,
  CASE
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25") THEN 'before-2w'
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") AND TIMESTAMP("2018-03-25") THEN 'after-2w'
    ELSE 'what'
  END AS period,
  remote_ip,
  STDDEV(download_mbps) AS download_stddev

FROM
  mlab_ndt
WHERE
  remote_ip IN(
    SELECT
      remote_ip
    FROM (
      SELECT
        remote_ip, count(*) as c1
      FROM
        mlab_ndt
      WHERE
        TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25")
      GROUP BY
        remote_ip
      HAVING c1 > 5
    ) INNER JOIN (
      SELECT
        remote_ip AS remote_ip, count(*) as c2
      FROM
        mlab_ndt
      WHERE
        TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") AND TIMESTAMP("2018-03-25")
      GROUP BY
        remote_ip
      HAVING c2 > 5
    ) USING (remote_ip)) 
GROUP BY
  server_hostname,
  period,
  remote_ip
  --download_mbps

HAVING download_stddev is not NULL
""")

In [26]:
# Row count of the per-client stddev result. The parenthesized form prints the
# same under the Python 2 print statement and the Python 3 print function.
print(len(df_ndt_variance))

63423


In [27]:
# Histogram (cdf=False) of download_stddev on a log10 x-axis, split by metro
# (fig_by), site (axes_by) and period (group_by).
# NOTE: the bin size is not shared across group_by groups; it is recomputed
# for every plotted line.
plot_hist(
    df_ndt_variance,
    'download_stddev',
    lambda r: int(math.sqrt(len(r))),
    fig_by='metro',
    axes_by='site',
    group_by='period',
    axes=(3, 2),
    figsize=(12, 6),
    xlog=True,
    cdf=False,
    xlim=(math.log10(.01), math.log10(1000)),
    title='{axis}',
    suptitle='Distribution of Stddev of NDT Downloads per remote_ip',
)

# Grid positions for up to six hosts per figure, filled row by row.
seq = [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)]

# Host that is reported but never plotted.
skip = 'mlab1.lga02'

# One figure per metro (characters 6:9 of the hostname), one axes per host,
# and one histogram line per period.
for site in set([v[6:9] for v in set(df_ndt_variance['hostname'])]):
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 6))
    for p, h in enumerate(sorted([h for h in set(df_ndt_variance['hostname']) if site in h])):
        # Bin count shared by both periods of this host; fixed by the first
        # period that has data.
        before = None
        for day in ['before-2w', 'after-2w']:
            ds = df_ndt_variance[(df_ndt_variance['hostname'] == h) & (df_ndt_variance['period'] == day)]
            r = ds['download_stddev']
            # Sample size per host and period, printed for skipped hosts too.
            print('%s %d' % (h, len(r)))
            if not len(r) or h == skip:
                continue

            # Only len(seq) axes exist per figure; extra hosts are reported
            # and left out before any histogram work is done.
            if p > len(seq) - 1:
                print('skipping %s' % h)
                continue

            # When 'before-2w' had no rows, 'after-2w' falls back to its own
            # sqrt(n) bin count instead of passing None to hist().
            if before is None:
                before = int(math.sqrt(len(r)))
            size = before

            tops, bins = hist(r, size, log=True, cdf=False)

            i, j = seq[p]
            ax = axes[i, j]
            # cdf=False above, so the curve is a histogram and is labelled as
            # one. bins has one more edge than tops; plot against left edges.
            ax.plot(bins[:-1], tops, label='hist-' + h[6:11] + '-' + str(day))
            ax.set_title(h[6:11])
            # hist() binned log10 of the values, so limits are in log10 units
            # and logFormatter converts tick positions back for display.
            ax.set_xlim(math.log10(.01), math.log10(1000))
            ax.grid(color='#dddddd')
            ax.legend(loc=2, fontsize='x-small')
            ax.xaxis.set_major_formatter(logFormatter)
    fig.subplots_adjust(hspace=0.3, wspace=0.4)
    fig.suptitle('Distribution of Stddev of NDT Downloads per remote_ip')

In [28]:
# Daily per-site summary of NDT download rates for the dfw, lga and nuq metros.
#
# `mlab_ndt` CTE: rows from the NDT base tables whose server hostname contains
# (lga|dfw|nuq) followed by two digits. Unlike a date-windowed query, there is
# no log_time restriction and no mlab1-only filter here. Rows must have
# data_direction = 1 (presumably download tests -- TODO confirm),
# HCThruOctetsAcked >= 1000000 and a summed SndLimTime{Rwin,Cwnd,Snd} in
# [9000000, 600000000); five specific remote_ips are excluded, as are rows
# where anomalies.no_meta is true. download_mbps is 8 * HCThruOctetsAcked
# divided by the summed time. The GROUP BY has no aggregates, so it yields one
# row per distinct combination of the grouped fields.
#
# Inner SELECT: MAX(download_mbps) per (metro, site, day, remote_ip), i.e. one
# value per client per site per day.
# Outer SELECT: APPROX_QUANTILES(download_mbps, 101)[ORDINAL(50)] and the row
# count per (metro, site, day), ordered by day.
#
# NOTE(review): APPROX_QUANTILES(x, 101) appears to return 102 boundaries and
# ORDINAL is 1-based, so element 50 would be the 49/101 quantile rather than
# the exact median -- confirm against the BigQuery documentation.
df_ndt_all = run_query("""
WITH mlab_ndt AS (
  SELECT
    REGEXP_EXTRACT(connection_spec.server_hostname, r"([a-z]{3})[0-9]{2}") as metro,
    REGEXP_EXTRACT(connection_spec.server_hostname, r"([a-z]{3}[0-9]{2})") as site,
    web100_log_entry.connection_spec.remote_ip as remote_ip,
    log_time,
    (8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd))) AS download_mbps

  FROM
  
    `measurement-lab.base_tables.ndt*`
  WHERE

        REGEXP_CONTAINS(connection_spec.server_hostname, r"(lga|dfw|nuq)\d\d")
    AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
    AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
    AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
    AND connection_spec.data_direction = 1
    AND web100_log_entry.connection_spec.remote_ip != "45.56.98.222"
    AND web100_log_entry.connection_spec.remote_ip != "2600:3c03::f03c:91ff:fe33:819"
    AND web100_log_entry.connection_spec.remote_ip != "35.225.75.192"
    AND web100_log_entry.connection_spec.remote_ip != "35.192.37.249"
    AND web100_log_entry.connection_spec.remote_ip != "35.193.254.117"
    AND anomalies.no_meta is not true
    
  
  GROUP BY
    connection_spec.server_hostname,
    log_time,
    web100_log_entry.connection_spec.remote_ip,
    web100_log_entry.connection_spec.local_ip,
    web100_log_entry.connection_spec.remote_port,
    web100_log_entry.connection_spec.local_port,
    download_mbps
)
    
SELECT
  metro,
  site,
  day,
 --  AVG(download_mbps) as download_mbps,
  APPROX_QUANTILES(download_mbps, 101)[ORDINAL(50)] as download_mbps,
  count(*) as count
FROM
(
  SELECT
    metro,
    site,
    TIMESTAMP_TRUNC(log_time, DAY) as day,
    -- APPROX_QUANTILES(download_mbps, 101)[ORDINAL(50)] as download_mbps
    MAX(download_mbps) as download_mbps
  FROM
    mlab_ndt

  GROUP BY
    metro, site, day, remote_ip
)

GROUP BY
  metro, site, day

ORDER BY
  day
""")

In [29]:
# Scatter of the per-day download_mbps values against day: one axes per metro
# (axes_by) stacked in a 3x1 grid, one series per site (group_by). fx converts
# the x values with pd.to_datetime before plotting.
plot_scatter(
    df_ndt_all,
    'day',
    'download_mbps',
    axes_by='metro',
    group_by='site',
    axes=(3, 1),
    fx=lambda l: [pd.to_datetime(t) for t in l],
    xlim=(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01")),
    ylim=(0, 50),
    ylabel="Mbps {axis}",
    suptitle='Median NDT Download Rates',
    legend={'loc': 2},
)

dfw
lga
nuq
