In [131]:
# Enables figures loading outside of browser.
# If not run, figures will load inline.
%matplotlib

Using matplotlib backend: MacOSX


In [132]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import matplotlib.ticker

# Some matplotlib features are version dependent.
def _version_tuple(version):
    """Return the leading numeric (major, minor, patch) parts of a version string.

    Only the first three dot-separated components are considered. Within each
    component, anything after the leading digits (e.g. 'rc1') is ignored, and
    missing components are padded with zeros, so '3.0' -> (3, 0, 0).
    """
    numbers = []
    for piece in version.split('.')[:3]:
        digits = ''
        for ch in piece:
            if not ch.isdigit():
                break
            digits += ch
        numbers.append(int(digits) if digits else 0)
    while len(numbers) < 3:
        numbers.append(0)
    return tuple(numbers)

# Compare numerically: a plain string comparison is lexicographic and would
# order '10.0.0' before '2.1.2'.
assert _version_tuple(matplotlib.__version__) >= (2, 1, 2), (
    'matplotlib >= 2.1.2 is required, found %s' % matplotlib.__version__)

# Depends on: pip install --upgrade google-cloud-bigquery
import query

In [134]:
def unlog(x, pos):
    """Format a log10-scaled tick value as its linear value with two decimals.

    Has the (value, position) signature expected of a matplotlib FuncFormatter
    callback; `pos` is accepted but not used.

    Args:
        x: tick location expressed as a base-10 exponent.
        pos: tick position index (ignored).

    Returns:
        The string form of 10**x with two digits after the decimal point.
    """
    linear_value = math.pow(10, x)
    return format(linear_value, '.2f')

# Tick formatter that relabels log10-valued tick positions in linear units by
# passing each tick value through unlog().
customFormatter = matplotlib.ticker.FuncFormatter(unlog)

In [160]:
# Daily per-host switch uplink statistics, loaded into df_disco.
# For every (hostname, day) the query returns:
#   day            - the day formatted as YYYY-MM-DD
#   ts             - the same day as UNIX seconds (used for plotting)
#   total_discards - sum of 'switch.discards.uplink.tx' sample values
#   total_packets  - sum of 'switch.unicast.uplink.tx' sample values
#   pct_discards   - number of discard samples with value > 0, divided by 8640;
#                    despite the name this is a fraction, not a percentage.
# NOTE(review): 8640 is presumably the number of 10-second samples per day
# (86400 / 10) -- confirm against the collector's sampling interval.
# The inner GROUP BY has no aggregates, so it collapses identical
# (hostname, metric, timestamp, value) rows before the daily sums are taken.
# Rows whose hostname does not match the mlabN.xxxNN pattern are dropped by the
# `name IS NOT NULL` filter.
# NOTE(review): the '.' after mlab[1-4] in the regex is unescaped and matches
# any character; r'\.' would be stricter.
result = query.sync_query("""
#standardSQL
SELECT
  name AS hostname,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,
  SUM(IF(metric = 'switch.discards.uplink.tx', value, 0)) AS total_discards,
  SUM(IF(metric = 'switch.unicast.uplink.tx', value, 0)) AS total_packets,
  COUNTIF(metric = 'switch.discards.uplink.tx' AND value > 0) / 8640 AS pct_discards

FROM (
  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
    sample.timestamp AS sts,
    sample.value AS value
  FROM
    `mlab-sandbox.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
    metric LIKE 'switch.discards.uplink.tx' OR metric LIKE 'switch.unicast.uplink.tx'
  GROUP BY
    hostname, metric, sts, value
)
WHERE
  name IS NOT NULL
GROUP BY
  hostname, day, ts
ORDER BY
  hostname, day, ts
""")

# One row per (hostname, day); reused by every switch plot below.
df_disco = pd.DataFrame(result)

# Discards over time

In [163]:
# Daily total uplink discards for the mlab1 hosts at each site: one subplot per
# site on a 3x3 grid, log-scaled y axis.
sites = [
    ['dfw', 'lga', 'iad'],
    ['sea', 'atl', 'den'],
    ['mia', 'nuq', 'ord'],
]

# Only days whose discard total lies strictly inside these bounds are drawn;
# the same bounds are used as the y-axis limits.
min_discards = 100
max_discards = 1000000

# Computed once instead of once per subplot, and sorted so that line colors
# and legend order are identical on every run (set() order is arbitrary).
all_hostnames = sorted(df_disco['hostname'].unique())
in_range = ((df_disco['total_discards'] > min_discards) &
            (df_disco['total_discards'] < max_discards))

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 10))
for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i, j]
        for h in all_hostnames:
            if ('mlab1.' + site) in h:
                ds = df_disco[(df_disco['hostname'] == h) & in_range]
                # h[6:11] is the site part of 'mlab1.xxxNN', e.g. 'dfw02'.
                ax.plot_date(dates.epoch2num(ds['ts']), ds['total_discards'],
                             ls='-', ms=0, label=h[6:11])

        ax.set_title(site)
        ax.set_ylim(min_discards, max_discards)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=4, fontsize='x-small')
        ax.semilogy()

        # Keep tick labels only on the left column and the bottom row.
        # tick_params is used rather than set_*ticklabels([]) because changing
        # the axis scale (semilogy) installs a new tick formatter, which would
        # bring labels cleared that way back.
        if j != 0:
            ax.tick_params(axis='y', labelleft=False)
        if i != len(sites) - 1:
            ax.tick_params(axis='x', labelbottom=False)

fig.subplots_adjust(hspace=0.3, wspace=0.4)
fig.suptitle('Discards over time')

Text(0.5,0.98,u'Discards over time')

# Percent of Timebins with Discards 

In [164]:
# One panel per host showing df_disco['pct_discards'] by day. That column is
# computed in the query as COUNTIF(...) / 8640, i.e. a fraction of the day's
# timebins, which is why the y axis spans roughly 0 .. 0.4.
title = 'Daily percentage of timebins with any discards'
sites = [
    ['mlab1.dfw02', 'mlab1.dfw03', 'mlab1.dfw04'],
]

column_count = len(sites[0])
fig, axes = plt.subplots(nrows=1, ncols=column_count)
for i, hosts in enumerate(sites):
    for j, host in enumerate(hosts):
        # With nrows=1 `axes` is one-dimensional, so only the column index is used.
        ax = axes[j]

        host_rows = df_disco['hostname'] == host
        ds = df_disco[host_rows]
        day_numbers = dates.epoch2num(ds['ts'])
        ax.plot_date(day_numbers, ds['pct_discards'], ls='-', ms=0, label=host)

        ax.set_title(host)
        ax.set_ylim(-0.01, .4)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=4, fontsize='x-small')

fig.subplots_adjust(hspace=0.3, wspace=0.4)
fig.suptitle(title)

Text(0.5,0.98,u'Daily percentage of timebins with any discards')

# Total Packet Discard Ratios (Switch Loss Rate)

In [165]:
# Daily switch loss rate (discards as a percentage of unicast packets sent on
# the uplink) for the mlab1 hosts at each site, on a 3x3 grid with a
# log-scaled y axis.
sites = [
    ['dfw', 'lga', 'iad'],
    ['sea', 'atl', 'den'],
    ['mia', 'nuq', 'ord'],
]

# Only days whose discard total lies strictly inside these bounds are drawn.
min_discards = 100
max_discards = 1000000

# Computed once instead of once per subplot, and sorted so that line colors
# and legend order are identical on every run (set() order is arbitrary).
all_hostnames = sorted(df_disco['hostname'].unique())
in_range = ((df_disco['total_discards'] > min_discards) &
            (df_disco['total_discards'] < max_discards))

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 10))
for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i, j]
        ax.set_title(site)
        if j == 0:
            ax.set_ylabel('Percent Loss')

        for h in all_hostnames:
            if 'mlab1.' + site in h:
                ds = df_disco[(df_disco['hostname'] == h) & in_range]
                # Percentage of transmitted unicast packets that were discarded.
                ratio = 100 * ds['total_discards'] / ds['total_packets']
                ax.plot_date(dates.epoch2num(ds['ts']), ratio,
                             ls='-', ms=0, label=h[:11])

        ax.set_ylim(10**-4, 10**-1)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=4, fontsize='x-small')
        ax.semilogy()

        # Keep tick labels only on the left column and the bottom row.
        # tick_params is used rather than set_*ticklabels([]) because changing
        # the axis scale (semilogy) installs a new tick formatter, which would
        # bring labels cleared that way back.
        if j != 0:
            ax.tick_params(axis='y', labelleft=False)
        if i != len(sites) - 1:
            ax.tick_params(axis='x', labelbottom=False)

fig.subplots_adjust(hspace=0.3, wspace=0.4)
fig.suptitle('Switch Packet Loss Rate')

Text(0.5,0.98,u'Switch Packet Loss Rate')

# Flow-Control Trial (measurement-lab.public)

In [208]:
# Sidestream download rates for the mlab1.dfwNN hosts on seven one-day windows
# spaced a week apart (2018-01-26 .. 2018-03-09), labelled '5w' .. '+1w'.
# The trailing GROUP BY has no aggregates; it only removes duplicate rows.
#
# Alternative offline workflow, kept for reference:
#   cat sidestream.sql | bq query --format=csv --max_rows=1000000 --nouse_legacy_sql > sidestream-trial-6w.csv
#   df = pd.read_csv('sidestream-trial-6w.csv')
#
# The query text is a raw string so that the regex escapes (\d, \.) reach
# BigQuery unchanged instead of being parsed as Python escape sequences.
result = query.sync_query(r"""
#standardSQL
    -- Only works for mlab1 addresses. May not work on all machines.
CREATE TEMPORARY FUNCTION sliceFromIP(ipaddr STRING)
    AS ( MOD(CAST(REGEXP_EXTRACT(ipaddr, r'[:.]([0-9]+)$') AS INT64), 64) - 10 );

CREATE TEMPORARY FUNCTION betweenTimes(ts INT64, starttime STRING, endtime STRING)
    AS ( TIMESTAMP_SECONDS(ts) >= TIMESTAMP(starttime) AND TIMESTAMP_SECONDS(ts) <= TIMESTAMP(endtime) );

SELECT
    CASE
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 1 THEN 'ndt'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7 THEN 'samknows'
        -- FIXME: a 'neubot' branch used to follow here, but it repeated index 7
        -- and therefore could never match. Restore it once the correct neubot
        -- slice index is known; until then neubot rows fall into 'other'.
        ELSE 'other'
    END AS slice,
    CASE
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-01-26 00:00:00", "2018-01-27 00:00:00") THEN '5w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-02 00:00:00", "2018-02-03 00:00:00") THEN '4w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-09 00:00:00", "2018-02-10 00:00:00") THEN '3w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-16 00:00:00", "2018-02-17 00:00:00") THEN '2w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-23 00:00:00", "2018-02-24 00:00:00") THEN '1w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-03-02 00:00:00", "2018-03-03 00:00:00") THEN '0w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-03-09 00:00:00", "2018-03-10 00:00:00") THEN '+1w'
    ELSE 'unknown'
    END AS period,
    REGEXP_EXTRACT(test_id, r"\d\d\d\d/\d\d/\d\d/(mlab[1-4]\.[a-z]{3}[0-9]{2})") AS hostname,
    web100_log_entry.snap.StartTimeStamp AS ts,
    -- 8 * octets / time; this is Mbit/s provided SndLimTime* are in microseconds.
    8 * (web100_log_entry.snap.HCThruOctetsAcked /
      (web100_log_entry.snap.SndLimTimeRwin +
       web100_log_entry.snap.SndLimTimeCwnd +
       web100_log_entry.snap.SndLimTimeSnd)) as rate_mbps
FROM
    `measurement-lab.public.sidestream`

WHERE
        (  betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-01-26 00:00:00", "2018-01-27 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-02 00:00:00", "2018-02-03 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-09 00:00:00", "2018-02-10 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-16 00:00:00", "2018-02-17 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-23 00:00:00", "2018-02-24 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-03-02 00:00:00", "2018-03-03 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-03-09 00:00:00", "2018-03-10 00:00:00")
        )
  AND REGEXP_CONTAINS(test_id, r"mlab1\.(dfw\d\d)")
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000 -- 819200
  AND (web100_log_entry.snap.SndLimTimeRwin +
    web100_log_entry.snap.SndLimTimeCwnd +
    web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin +
    web100_log_entry.snap.SndLimTimeCwnd +
    web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND (web100_log_entry.snap.State = 1 OR
    (web100_log_entry.snap.State >= 5 AND
    web100_log_entry.snap.State <= 11))

GROUP BY
  hostname, slice, period, ts, rate_mbps
""")
# Columns: slice, period, hostname, ts, rate_mbps.
df_ss = pd.DataFrame(result)

In [204]:
# Per-host PDFs of log10(download rate) for the 'ndt' slice, one curve per
# trial period. Written so the cell runs on Python 2 and 3; under Python 2 the
# print calls produce the same text as the former `print a, b, c` statements.
print('%s %s %s' % (len(df_ss), max(df_ss['rate_mbps']),
                    int(math.sqrt(len(df_ss['rate_mbps'])))))

# NOTE(review): the query that builds df_ss in this notebook only selects
# mlab1.dfwNN hosts, so these lga hosts have data only if df_ss was loaded
# with a different filter -- confirm which hosts are intended here.
hosts = [
    ['mlab1.lga03', 'mlab1.lga04'],
    ['mlab1.lga05', 'mlab1.lga06'],
]
# '4w' and '5w' are also present in df_ss but are not drawn.
periods = ['+1w', '0w', '1w', '2w', '3w']

fig, axes = plt.subplots(nrows=2, ncols=2)

# Defined up front so the summary print below works even when no histogram
# was drawn (previously that case raised NameError on a fresh kernel).
bins = []

for i, host_row in enumerate(hosts):
    for j, host in enumerate(host_row):
        ax = axes[i, j]
        for period in periods:
            ds = df_ss[(df_ss['period'] == period) &
                       (df_ss['hostname'] == host) &
                       (df_ss['slice'] == 'ndt')]
            if len(ds) == 0:
                continue
            label = 'pdf-%s (%d)' % (period, len(ds['rate_mbps']))
            # Histogram the exponents so bins are evenly spaced in log space;
            # customFormatter converts the tick labels back to linear units.
            r = [math.log10(x) for x in ds['rate_mbps']]
            # Bin count follows the square-root rule. `density` replaces the
            # `normed` argument, deprecated since matplotlib 2.1.
            n, bins, patches = ax.hist(r, int(math.sqrt(len(ds['rate_mbps']))),
                                       histtype='step', density=True, label=label, ls='-')

        ax.set_xlim(math.log10(0.1), math.log10(1000))
        ax.set_axisbelow(True)
        ax.legend(loc=2)
        ax.grid(color='#dddddd')
        ax.set_title(host)
        ax.xaxis.set_major_formatter(customFormatter)

fig.suptitle('Sidestream Download Rate PDFs over three week period (0w is trial)')
plt.show()
# Number of bin edges of the last histogram drawn (0 if none was drawn).
print(len(bins))



730671 1303.9458692 854
94


# Historical (mlab-sandbox.batch) - Sidestream by Period & Slice

In [214]:
# Historical sidestream download rates for the mlab1.dfw hosts over five
# multi-day windows between 2017-07 and 2018-02, labelled by date range.
# The trailing GROUP BY has no aggregates; it only removes duplicate rows.
# Note that this replaces the df_ss produced by the flow-control trial query.
#
# The query text is a raw string so that the regex escapes (\d, \.) reach
# BigQuery unchanged instead of being parsed as Python escape sequences.
result = query.sync_query(
    r"""#standardSQL
    -- Only works for mlab1 addresses. May not work on all machines.
CREATE TEMPORARY FUNCTION sliceFromIP(ipaddr STRING)
    AS ( MOD(CAST(REGEXP_EXTRACT(ipaddr, r'[:.]([0-9]+)$') AS INT64), 64) - 10 );

CREATE TEMPORARY FUNCTION betweenTimes(ts INT64, starttime STRING, endtime STRING)
    AS ( TIMESTAMP_SECONDS(ts) >= TIMESTAMP(starttime) AND TIMESTAMP_SECONDS(ts) <= TIMESTAMP(endtime) );

SELECT
    CASE
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 1 THEN 'ndt'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7 THEN 'samknows'
        -- FIXME: a 'neubot' branch used to follow here, but it repeated index 7
        -- and therefore could never match. Restore it once the correct neubot
        -- slice index is known; until then neubot rows fall into 'other'.
        ELSE 'other'
    END AS slice,
    CASE
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-07-26 00:00:00", "2017-07-30 00:00:00") THEN '07-26 to 29'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-08-12 00:00:00", "2017-08-16 00:00:00") THEN '08-12 to 16'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-10-30 00:00:00", "2017-11-02 00:00:00") THEN '10-30 to 11-02'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-11-29 00:00:00", "2017-12-03 00:00:00") THEN '11-29 to 12-03'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-21 00:00:00", "2018-02-25 00:00:00") THEN '02-21 to 25'
    ELSE 'bad'
    END AS period,
    REGEXP_EXTRACT(test_id, r"\d\d\d\d/\d\d/\d\d/(mlab[1-4]\.[a-z]{3}[0-9]{2})") AS hostname,
    web100_log_entry.snap.StartTimeStamp AS ts,
    -- 8 * octets / time; this is Mbit/s provided SndLimTime* are in microseconds.
    8 * (web100_log_entry.snap.HCThruOctetsAcked /
      (web100_log_entry.snap.SndLimTimeRwin +
       web100_log_entry.snap.SndLimTimeCwnd +
       web100_log_entry.snap.SndLimTimeSnd)) as rate_mbps
FROM
    `mlab-sandbox.batch.sidestream*`
WHERE
        (  betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-07-26 00:00:00", "2017-07-30 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-08-12 00:00:00", "2017-08-16 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-11-29 00:00:00", "2017-12-03 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-21 00:00:00", "2018-02-25 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-10-30 00:00:00", "2017-11-02 00:00:00") )
  AND (test_id LIKE '%mlab1.dfw%')
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000 -- 819200
  AND (web100_log_entry.snap.SndLimTimeRwin +
    web100_log_entry.snap.SndLimTimeCwnd +
    web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin +
    web100_log_entry.snap.SndLimTimeCwnd +
    web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND (web100_log_entry.snap.State = 1 OR
    (web100_log_entry.snap.State >= 5 AND
    web100_log_entry.snap.State <= 11))

GROUP BY
  hostname, slice, period, ts, rate_mbps
    """)
# Columns: slice, period, hostname, ts, rate_mbps.
df_ss = pd.DataFrame(result)

## Sidestream CDFs

In [215]:
# Cumulative distributions of sidestream download rate: one row of panels per
# slice, one column per host, one curve per period. Written so the cell runs
# on Python 2 and 3; under Python 2 the print calls produce the same text as
# the former print statements.
title = 'CDF - per-slice sidestream Download Rate CDFs'
print('%s %s' % (len(df_ss), max(df_ss['rate_mbps'])))

hosts = [
    ['mlab1.dfw02', 'mlab1.dfw03', 'mlab1.dfw04'],
]
periods = ['08-12 to 16', '11-29 to 12-03', '02-21 to 25']
slicenames = ['ndt', 'samknows']

fig, axes = plt.subplots(nrows=len(slicenames), ncols=len(hosts[0]), figsize=(13, 10))

for host_row in hosts:
    for j, host in enumerate(host_row):
        if len(df_ss[df_ss['hostname'] == host]) == 0:
            print('skipping %s' % host)
            continue
        for period in periods:
            ds = df_ss[(df_ss['period'] == period) & (df_ss['hostname'] == host)]
            for k, slicename in enumerate(slicenames):
                d = ds[ds['slice'] == slicename]
                if len(d) == 0:
                    continue
                ax = axes[k, j]
                # The CDF is drawn on the linear rates (no log10 transform);
                # the bin count follows the square-root rule. `density`
                # replaces `normed`, deprecated since matplotlib 2.1.
                n, bins, patches = ax.hist(d['rate_mbps'], int(math.sqrt(len(d['rate_mbps']))),
                                           histtype='step', density=True, cumulative=True,
                                           label=('cdf-' + period + '-' + slicename),
                                           ls='-')

                ax.set_xlim(1, 200)
                ax.set_axisbelow(True)
                ax.legend(loc=4, fontsize='x-small')
                ax.grid(color='#dddddd')
                ax.set_title(host)

fig.suptitle(title)

plt.show()

755497 923.370047096


## Sidestream PDFs

In [216]:
# Probability densities of log10(sidestream download rate): one row of panels
# per slice, one column per host, one curve per period. Written so the cell
# runs on Python 2 and 3; under Python 2 the print calls produce the same text
# as the former print statements.
title = 'PDF - per-slice sidestream Download Rate PDFs'
print('%s %s' % (len(df_ss), max(df_ss['rate_mbps'])))

hosts = [
    ['mlab1.dfw02', 'mlab1.dfw03', 'mlab1.dfw04'],
]
periods = ['08-12 to 16', '11-29 to 12-03', '02-21 to 25']
slicenames = ['ndt', 'samknows']

fig, axes = plt.subplots(nrows=len(slicenames), ncols=len(hosts[0]), figsize=(13, 10))
for host_row in hosts:
    for j, host in enumerate(host_row):
        if len(df_ss[df_ss['hostname'] == host]) == 0:
            print('skipping %s' % host)
            continue
        for period in periods:
            ds = df_ss[(df_ss['period'] == period) & (df_ss['hostname'] == host)]
            for k, slicename in enumerate(slicenames):
                d = ds[ds['slice'] == slicename]
                if len(d) == 0:
                    continue
                ax = axes[k, j]

                label = 'pdf-%s-%s (%d)' % (period, slicename, len(d['rate_mbps']))
                # Histogram the exponents so bins are evenly spaced in log
                # space; customFormatter converts tick labels back to linear
                # units.
                r = [math.log10(x) for x in d['rate_mbps']]
                # Bin count follows the square-root rule. `density` replaces
                # `normed`, deprecated since matplotlib 2.1.
                n, bins, patches = ax.hist(r, int(math.sqrt(len(d['rate_mbps']))),
                                           histtype='step', density=True, label=label,
                                           ls='-')

                ax.set_ylim(0, 1.4)
                ax.set_axisbelow(True)
                ax.legend(loc=2, fontsize='x-small')
                ax.grid(color='#dddddd')
                ax.set_title(host)
                ax.xaxis.set_major_formatter(customFormatter)

fig.suptitle(title)
plt.show()


755497 923.370047096


## PDF, CDF, & Switch - by Site and Slice

In [218]:
# One figure per slice with three rows of panels (one column per host):
#   row 0 - PDF of log10(download rate), one curve per period
#   row 1 - CDF of the linear download rate, one curve per period
#   row 2 - daily fraction of timebins with switch discards (from df_disco)
# Written so the cell runs on Python 2 and 3; under Python 2 the print calls
# produce the same text as the former print statements.
title = 'PDF, CDF, Switch - per-slice sidestream Download Rates'
print('%s %s' % (len(df_ss), max(df_ss['rate_mbps'])))

hosts = [
    ['mlab1.dfw02', 'mlab1.dfw03', 'mlab1.dfw04', 'mlab1.dfw05'],
]
periods = ['08-12 to 16', '11-29 to 12-03', '02-21 to 25']
slices = ['samknows', 'ndt']

for slicename in slices:
    fig, axes = plt.subplots(nrows=3, ncols=len(hosts[0]), figsize=(13, 10))
    for host_row in hosts:
        for j, host in enumerate(host_row):
            if len(df_ss[df_ss['hostname'] == host]) == 0:
                print('skipping %s' % host)
                continue
            for period in periods:
                d = df_ss[(df_ss['period'] == period) &
                          (df_ss['hostname'] == host) &
                          (df_ss['slice'] == slicename)]
                if len(d) == 0:
                    continue
                rates = d['rate_mbps']
                # Square-root rule for the number of histogram bins.
                bin_count = int(math.sqrt(len(rates)))

                # Row 0: PDF over log10(rate); customFormatter converts the
                # tick labels back to linear units. The legend entry is
                # labelled 'pdf-' (it previously reused the 'cdf-' label).
                ax = axes[0, j]
                pdf_label = 'pdf-%s-%s (%d)' % (period, slicename, len(rates))
                r = [math.log10(x) for x in rates]
                # `density` replaces `normed`, deprecated since matplotlib 2.1.
                n, bins, patches = ax.hist(r, bin_count, histtype='step',
                                           density=True, label=pdf_label, ls='-')
                ax.set_axisbelow(True)
                ax.legend(loc=2, fontsize='x-small')
                ax.grid(color='#dddddd')
                ax.set_title(host)
                ax.xaxis.set_major_formatter(customFormatter)

                # Row 1: CDF over the linear rates.
                ax = axes[1, j]
                cdf_label = 'cdf-%s-%s (%d)' % (period, slicename, len(rates))
                n, bins, patches = ax.hist(rates, bin_count, histtype='step',
                                           density=True, cumulative=True,
                                           label=cdf_label, ls='-')
                ax.set_xlim(1, 200)
                ax.set_axisbelow(True)
                ax.legend(loc=4, fontsize='x-small')
                ax.grid(color='#dddddd')
                ax.set_title(host)

    # Row 2: switch discards for the same hosts, independent of the slice.
    for host_row in hosts:
        for j, host in enumerate(host_row):
            ax = axes[2, j]

            ds = df_disco[df_disco['hostname'] == host]
            ax.plot_date(dates.epoch2num(ds['ts']), ds['pct_discards'], ls='-', ms=0, label=host)

            ax.set_title(host)
            ax.set_ylim(-0.01, .4)
            ax.tick_params(axis='x', labelrotation=90)
            ax.grid(color='#dddddd')
            ax.legend(loc=4, fontsize='x-small')

    fig.suptitle(title)
    plt.show()

755497 923.370047096
