In [1]:
# Enables figures loading outside of browser.
# If not run, figures will load inline.
%matplotlib

Using matplotlib backend: MacOSX


In [226]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import matplotlib.ticker
import datetime
import collections

# Some matplotlib features are version dependent.
# Compare versions numerically: a plain string comparison is lexicographic,
# so e.g. '10.0.0' would sort before '2.1.2' and wrongly trip the assert.
def _version_tuple(version):
    """Return the leading numeric parts of a dotted version string as ints.

    Only the first three dot-separated components are used; any non-digit
    suffix on a component (e.g. 'rc1') is ignored.
    """
    parts = []
    for piece in version.split('.')[:3]:
        digits = ''
        for ch in piece:
            if not ch.isdigit():
                break
            digits += ch
        parts.append(int(digits) if digits else 0)
    return tuple(parts)

assert _version_tuple(matplotlib.__version__) >= (2, 1, 2), (
    'matplotlib >= 2.1.2 is required, found %s' % matplotlib.__version__)

# Depends on: pip install --upgrade google-cloud-bigquery
import query

In [21]:
def unlog(x, pos):
    """Format a log10-scaled tick position as the linear value it represents.

    Args:
        x: tick location in log10 units.
        pos: tick index passed by matplotlib's FuncFormatter; unused.

    Returns:
        str: 10**x rendered as an integer when it is whole, with one decimal
        place when it is above 1, and with one significant digit when it is
        below 1 (so 0.01 is shown as '0.01' instead of rounding to '0.0').
    """
    v = math.pow(10, x)
    frac, whole = math.modf(v)
    if frac == 0:
        return '%d' % whole
    if v < 1:
        # '%.1f' would collapse anything under 0.05 to '0.0'.
        return '%.1g' % v
    return '%.1f' % v

# Tick formatter that labels positions on a log10-scaled axis via unlog().
logFormatter = matplotlib.ticker.FuncFormatter(unlog)

In [158]:
# Daily per-host uplink switch counters from `mlab-sandbox.base_tables.switch*`.
# For each (hostname, day) the query sums tx discards, unicast packets and
# octets, and computes `pct_discards` as the number of samples with a non-zero
# discard value divided by 8640.
# The inner GROUP BY collapses identical (hostname, metric, timestamp, value)
# rows, and hostnames that do not match the mlabN.xxxNN pattern are dropped.
# NOTE(review): 8640 presumably is the count of 10-second samples per day,
# which would make pct_discards a 0-1 fraction rather than a percent -- confirm.
result = query.sync_query("""
#standardSQL
SELECT
  name AS hostname,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,
  SUM(IF(metric = 'switch.discards.uplink.tx', value, 0)) AS total_discards,
  SUM(IF(metric = 'switch.unicast.uplink.tx', value, 0)) AS total_packets,
  SUM(IF(metric = 'switch.octets.uplink.tx', value, 0)) AS total_bytes,
  COUNTIF(metric = 'switch.discards.uplink.tx' AND value > 0) / 8640 AS pct_discards

FROM (
  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
    sample.timestamp AS sts,
    sample.value AS value
  FROM
    `mlab-sandbox.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
       metric LIKE 'switch.discards.uplink.tx'
    OR metric LIKE 'switch.unicast.uplink.tx'
    OR metric LIKE 'switch.octets.uplink.tx'
  GROUP BY
    hostname, metric, sts, value
)
WHERE
  name IS NOT NULL
GROUP BY
  hostname, day, ts
ORDER BY
  hostname, day, ts
""")

# `ts` is the start of the day in Unix seconds; the plotting cells use it as the x value.
df_disco = pd.DataFrame(result)

In [159]:
# DISCO RATES: DAILY PERCENTILES OF UPLINK TX OCTETS
#
# Per (hostname, day): the 50th/90th/98th/99th percentile and the maximum of
# the 'switch.octets.uplink.tx' sample values.
#
# APPROX_QUANTILES(x, n) returns n + 1 boundaries (approximate minimum, the
# interior cut points, approximate maximum). With n = 100 and the zero-based
# OFFSET(p), element p is the p-th percentile. The `(value, 101)[ORDINAL(p)]`
# form selects roughly the (p - 1) / 101 quantile instead, e.g. about the 96th
# percentile for the column named bytes_98th.
result = query.sync_query("""
#standardSQL
SELECT
  name AS hostname,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,

  APPROX_QUANTILES(value, 100)[OFFSET(50)] as bytes_50th,
  APPROX_QUANTILES(value, 100)[OFFSET(90)] as bytes_90th,
  APPROX_QUANTILES(value, 100)[OFFSET(98)] as bytes_98th,
  APPROX_QUANTILES(value, 100)[OFFSET(99)] as bytes_99th,
  MAX(value) as bytes_max

FROM (
  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
    sample.timestamp AS sts,
    sample.value AS value
  FROM
    `mlab-sandbox.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
    metric LIKE 'switch.octets.uplink.tx'
  GROUP BY
    hostname, metric, sts, value
)
WHERE
  name IS NOT NULL
GROUP BY
  hostname, day, ts
ORDER BY
  hostname, day, ts
""")

# `ts` is the start of the day in Unix seconds; the plotting cells use it as the x value.
df_disco_max = pd.DataFrame(result)

# Discards over time

In [5]:
# 3x3 grid, one panel per site, one line per mlab1 host: daily total discards.
sites = [
    ['dfw', 'lga', 'iad'],
    ['sea', 'atl', 'den'],
    ['mia', 'nuq', 'ord'],
]

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 10))

bottom_row = len(sites) - 1
all_hosts = set(df_disco['hostname'])
# Daily totals outside (100, 1000000) are dropped; the same bounds are used
# for the y-limits below.
within_limits = (df_disco['total_discards'] > 100) & (df_disco['total_discards'] < 1000000)

for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i, j]
        if j > 0:
            ax.set_yticklabels([])
        if i < bottom_row:
            ax.set_xticklabels([])

        wanted = 'mlab1.' + site
        for h in all_hosts:
            if wanted not in h:
                continue
            ds = df_disco[(df_disco['hostname'] == h) & within_limits]
            # Legend label is characters 6..10 of the hostname.
            ax.plot_date(dates.epoch2num(ds['ts']), ds['total_discards'],
                         ls='-', ms=0, label=h[6:11])

        ax.set_title(site)
        ax.set_ylim(100, 1000000)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=4, fontsize='x-small')
        ax.semilogy()

fig.subplots_adjust(hspace=0.3, wspace=0.4)
fig.suptitle('Discards over time')

Text(0.5,0.98,'Discards over time')

# Avg Daily Rate over time

In [46]:
# 3x3 grid, one panel per site, one line per mlab1 host: each day's total_bytes
# divided by 1000000 and by 86400 (seconds per day).
sites = [
    ['dfw', 'lga', 'iad'],
    ['sea', 'atl', 'den'],
    ['mia', 'nuq', 'ord'],
]

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 10))

bottom_row = len(sites) - 1
all_hosts = set(df_disco['hostname'])

for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i, j]
        if j > 0:
            ax.set_yticklabels([])
        if i < bottom_row:
            ax.set_xticklabels([])

        wanted = 'mlab1.' + site
        for h in all_hosts:
            if wanted not in h:
                continue
            # Unlike the discards plot, no range filter is applied here.
            ds = df_disco[df_disco['hostname'] == h]
            ax.plot_date(dates.epoch2num(ds['ts']), ds['total_bytes'] / 1000000 / 86400,
                         ls='-', ms=0, label=h[6:11])

        ax.set_title(site)
        ax.set_ylim(1, 1000)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=2, fontsize='x-small', ncol=3)
        ax.semilogy()

fig.subplots_adjust(hspace=0.3, wspace=0.4)
fig.suptitle('Daily Avg Rate over time')

Text(0.5,0.98,'Daily Avg Rate over time')

# 98th Percentile Over time

In [108]:
# Two rows of sites. A previous 3x3 layout also had 'sea', 'den' and 'ord';
# MIA is low utilization, and 'den' / 'sea' are low enough to leave out.
sites = [
    ['dfw', 'lga', 'iad'],
    ['lax', 'atl',  'nuq'],
]

cols = len(sites[0])
fig = plt.figure(figsize=(4 * cols, 6))
axes = [
    [None] * cols,
    [None] * cols,
]

# Percentile column of df_disco_max to plot ('bytes_' + rate).
rate = '98th'

for r, siter in enumerate(sites):
    for c, site in enumerate(siter):
        ax = plt.subplot2grid((2, cols), (r, c))
        axes[r][c] = ax

        if c == 0:
            ax.set_ylabel('Mbps')
        else:
            ax.set_yticklabels([])

        if r != 1:
            ax.set_xticklabels([])

        prefix = 'mlab1.' + site
        ds_sites = df_disco_max[df_disco_max['hostname'].str.contains(prefix)]
        for h in sorted(set(ds_sites['hostname'])):
            ds = ds_sites[ds_sites['hostname'].str.contains(h)]
            ax.plot_date(dates.epoch2num(ds['ts']), ds['bytes_' + rate] * 8 / 10000000,
                         ls='-', ms=0, label=h[6:11] + '-' + rate)

        ax.set_title(site)
        ax.set_ylim(100, 1000)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=2, fontsize='x-small', ncol=2)

fig.suptitle('Daily Percentile Rates')
#fig.tight_layout()
#fig.subplots_adjust(hspace=0.2, wspace=0.2)

plt.show()

## SS COUNTS

In [222]:
# [ 'lga', nuq'], #  'ord', # MIA is low utilization. 'den', 'sea' low enough.

# One column per site. Bottom row: sidestream connection counts (df_ss_count).
# Top row: the chosen percentile of switch traffic (df_disco_max).
sites = [
    ['dfw', 'iad', 'lax', 'atl', 'lga'],
]

cols = len(sites[0])
fig = plt.figure(figsize=(4 * cols, 6))
axes = [
    [None] * cols,
    [None] * cols,
]

# Percentile column of df_disco_max used for the top row ('bytes_' + rate).
rate = '98th'

for siter in sites:
    # Bottom row is drawn first: connection counts per host.
    r = 1
    for c, site in enumerate(siter):
        ax = plt.subplot2grid((2, cols), (r, c))
        axes[r][c] = ax
        if c == 0:
            ax.set_ylabel('Connection Counts')

        prefix = 'mlab1.' + site
        ds_sites = df_ss_count[df_ss_count['hostname'].str.contains(prefix)]
        for h in sorted(set(ds_sites['hostname'])):
            ds = ds_sites[ds_sites['hostname'].str.contains(h)]
            ax.plot_date(dates.epoch2num(ds['ts']), ds['count'],
                         ls='-', ms=0, label=h[6:11])

        ax.set_title(site)
        ax.set_ylim(0, 25000)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=2, fontsize='x-small', ncol=2)

    # Top row: switch traffic for the same sites.
    r = 0
    for c, site in enumerate(siter):
        ax = plt.subplot2grid((2, cols), (r, c))
        axes[r][c] = ax
        if c == 0:
            ax.set_ylabel('Mbps')
        else:
            ax.set_yticklabels([])
        ax.set_xticklabels([])

        prefix = 'mlab1.' + site
        ds_sites = df_disco_max[df_disco_max['hostname'].str.contains(prefix)]
        for h in sorted(set(ds_sites['hostname'])):
            ds = ds_sites[ds_sites['hostname'].str.contains(h)]
            ax.plot_date(dates.epoch2num(ds['ts']), ds['bytes_' + rate] * 8 / 10000000,
                         ls='-', ms=0, label=h[6:11] + '-' + rate)

        ax.set_title(site)
        ax.set_ylim(100, 1000)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=2, fontsize='x-small', ncol=2)

fig.suptitle('Daily 98th Percentile Switch Traffic & TCP Connection Counts Per Metro')
#fig.tight_layout()
#fig.subplots_adjust(hspace=0.2, wspace=0.2)

plt.show()

In [89]:
# Distinct hostnames in df_disco_max that match 'mlab1.dfw'.
# Parenthesized so the statement is valid on both Python 2 and Python 3.
print(set(df_disco_max[df_disco_max['hostname'].str.contains('mlab1.dfw')]['hostname']))

set([u'mlab1.dfw06', u'mlab1.dfw05', u'mlab1.dfw04', u'mlab1.dfw03', u'mlab1.dfw02', u'mlab1.dfw01'])


# Percent of Timebins with Discards 

In [6]:
# One panel per host: the daily `pct_discards` column of df_disco.
title = 'Daily percentage of timebins with any discards'
sites = [
    ['mlab1.dfw02', 'mlab1.dfw03', 'mlab1.dfw04'],
]

# A single row of panels, so `axes` is indexed by column only.
fig, axes = plt.subplots(nrows=1, ncols=len(sites[0]))
for hosts in sites:
    for ax, host in zip(axes, hosts):
        ds = df_disco[df_disco['hostname'] == host]
        ax.plot_date(dates.epoch2num(ds['ts']), ds['pct_discards'],
                     ls='-', ms=0, label=host)

        ax.set_title(host)
        ax.set_ylim(-0.01, .4)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=4, fontsize='x-small')

fig.subplots_adjust(hspace=0.3, wspace=0.4)
fig.suptitle(title)

Text(0.5,0.98,u'Daily percentage of timebins with any discards')

# Total Packets

In [58]:
# One panel per host: the daily `total_packets` column of df_disco.
# The title names the plotted column (it is not the discard-percentage plot).
title = 'Daily total packets'
sites = [
    ['mlab1.dfw02', 'mlab1.dfw03', 'mlab1.dfw04'],
]

# A single row of panels, so `axes` is indexed by column only.
fig, axes = plt.subplots(nrows=1, ncols=len(sites[0]))
for i, hosts in enumerate(sites):
    for j, host in enumerate(hosts):
        ax = axes[j]

        ds = df_disco[df_disco['hostname'] == host]
        ax.plot_date(dates.epoch2num(ds['ts']), ds['total_packets'], ls='-', ms=0, label=host)

        ax.set_title(host)
        # y-limits are left to matplotlib's autoscaling.
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=4, fontsize='x-small')


fig.subplots_adjust(hspace=0.3, wspace=0.4)
fig.suptitle(title)

Text(0.5,0.98,u'Daily percentage of timebins with any discards')

# Total Packet Discard Ratios (Switch Loss Rate)

In [6]:
# 3x3 grid, one panel per site, one line per mlab1 host:
# daily total_discards / total_packets.
sites = [
    ['dfw', 'lga', 'iad'],
    ['sea', 'atl', 'den'],
    ['mia', 'nuq', 'ord'],
]

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 10))

bottom_row = len(sites) - 1
all_hosts = set(df_disco['hostname'])
# Only days whose discard total lies strictly between 100 and 1000000 are plotted.
within_limits = (df_disco['total_discards'] > 100) & (df_disco['total_discards'] < 1000000)

for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i, j]
        ax.set_title(site)
        if j > 0:
            ax.set_yticklabels([])
        if i < bottom_row:
            ax.set_xticklabels([])
        if j == 0:
            ax.set_ylabel('Daily Loss Ratio')

        wanted = 'mlab1.' + site
        for h in all_hosts:
            if wanted not in h:
                continue
            ds = df_disco[(df_disco['hostname'] == h) & within_limits]
            ratio = ds['total_discards'] / ds['total_packets']
            ax.plot_date(dates.epoch2num(ds['ts']), ratio, ls='-', ms=0, label=h[:11])

        ax.set_ylim(10**-6, 10**-3)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=4, fontsize='x-small')
        ax.semilogy()

fig.subplots_adjust(hspace=0.3, wspace=0.4)
fig.suptitle('Switch Packet Loss Rate')

Text(0.5,0.98,u'Switch Packet Loss Rate')

# Flow-Control Trial (measurement-lab.public)

In [51]:
# cat sidestream.sql | bq query --format=csv --max_rows=1000000 --nouse_legacy_sql > sidestream-trial-6w.csv
#df = pd.read_csv('sidestream-trial-6w.csv')

# Sidestream rates for mlab1.dfw* hosts on seven one-day windows spaced a week
# apart. Each row carries the slice (derived from the last component of the
# local IP), a period label ('5w' ... '0w', '+1w'), the hostname parsed from
# test_id, the connection start timestamp and the rate
# (8 * HCThruOctetsAcked / summed SndLimTime* counters).
# The trailing GROUP BY over every selected column removes duplicate rows.
#
# The query is a raw string so the regex escapes (\d) reach BigQuery unchanged
# without relying on Python passing unknown escape sequences through.
result = query.sync_query(r"""
#standardSQL
    -- Only works for mlab1 addresses. May not work on all machines.
CREATE TEMPORARY FUNCTION sliceFromIP(ipaddr STRING)
    AS ( MOD(CAST(REGEXP_EXTRACT(ipaddr, r'[:.]([0-9]+)$') AS INT64), 64) - 10 );

CREATE TEMPORARY FUNCTION betweenTimes(ts INT64, starttime STRING, endtime STRING)
    AS ( TIMESTAMP_SECONDS(ts) >= TIMESTAMP(starttime) AND TIMESTAMP_SECONDS(ts) <= TIMESTAMP(endtime) );

SELECT
    CASE
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 1 THEN 'ndt'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7 THEN 'samknows'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 9 THEN 'neubot'
        ELSE 'other'
    END AS slice,
    CASE
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-01-26 00:00:00", "2018-01-27 00:00:00") THEN '5w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-02 00:00:00", "2018-02-03 00:00:00") THEN '4w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-09 00:00:00", "2018-02-10 00:00:00") THEN '3w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-16 00:00:00", "2018-02-17 00:00:00") THEN '2w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-23 00:00:00", "2018-02-24 00:00:00") THEN '1w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-03-02 00:00:00", "2018-03-03 00:00:00") THEN '0w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-03-09 00:00:00", "2018-03-10 00:00:00") THEN '+1w'
    ELSE 'unknown'
    END AS period,
    REGEXP_EXTRACT(test_id, r"\d\d\d\d/\d\d/\d\d/(mlab[1-4].[a-z]{3}[0-9]{2})") AS hostname,
    web100_log_entry.snap.StartTimeStamp AS ts,
    8 * (web100_log_entry.snap.HCThruOctetsAcked /
      (web100_log_entry.snap.SndLimTimeRwin +
       web100_log_entry.snap.SndLimTimeCwnd +
       web100_log_entry.snap.SndLimTimeSnd)) as rate_mbps
FROM
   -- `measurement-lab.public.sidestream`
   -- `mlab-sandbox.batch.sidestream*`
    `mlab-sandbox.gfr.sidestream_*`
WHERE
        (  betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-01-26 00:00:00", "2018-01-27 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-02 00:00:00", "2018-02-03 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-09 00:00:00", "2018-02-10 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-16 00:00:00", "2018-02-17 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-23 00:00:00", "2018-02-24 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-03-02 00:00:00", "2018-03-03 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-03-09 00:00:00", "2018-03-10 00:00:00")
        )
  AND REGEXP_CONTAINS(test_id, r"mlab1.(dfw\d\d)")
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000 -- 819200
  AND (web100_log_entry.snap.SndLimTimeRwin +
    web100_log_entry.snap.SndLimTimeCwnd +
    web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin +
    web100_log_entry.snap.SndLimTimeCwnd +
    web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND (web100_log_entry.snap.State = 1 OR
    (web100_log_entry.snap.State >= 5 AND
    web100_log_entry.snap.State <= 11))

GROUP BY
  hostname, slice, period, ts, rate_mbps
""")
df_ss_trial = pd.DataFrame(result)

In [167]:
# Percentile curves of sidestream rate for mlab1.dfw02 / mlab1.lga03 on three
# one-day windows (labels '<site>-2w', '<site>-1w', '<site>-0w (flow)').
#
# APPROX_QUANTILES(x, n) returns n + 1 boundaries (approximate minimum ...
# approximate maximum). With n = 100 and the zero-based OFFSET(p), element p
# is the p-th percentile, so column qNN really is the NN-th percentile and
# q100 is the maximum. The `(rate_mbps, 101)[ORDINAL(p)]` form selects roughly
# the (p - 1) / 101 quantile instead.
#
# The query is a raw string so the regex escapes (\d) reach BigQuery
# unchanged, and it carries the same #standardSQL prefix as the other queries
# in this notebook.
result = query.sync_query(r"""
#standardSQL
CREATE TEMPORARY FUNCTION betweenTimes(ts INT64, starttime STRING, endtime STRING)
    AS ( TIMESTAMP_SECONDS(ts) >= TIMESTAMP(starttime) AND TIMESTAMP_SECONDS(ts) <= TIMESTAMP(endtime) );

SELECT
CASE
WHEN betweenTimes(StartTimeStamp, "2018-02-16 00:00:00", "2018-02-17 00:00:00") THEN CONCAT(sitename, '-2w')
WHEN betweenTimes(StartTimeStamp, "2018-02-23 00:00:00", "2018-02-24 00:00:00") THEN CONCAT(sitename, '-1w')
WHEN betweenTimes(StartTimeStamp, "2018-03-02 00:00:00", "2018-03-03 00:00:00") THEN CONCAT(sitename, '-0w (flow)')
ELSE 'unknown'
END AS test_period,

round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(10)], 2) as q10,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(12)], 2) as q12,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(15)], 2) as q15,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(18)], 2) as q18,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(20)], 2) as q20,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(22)], 2) as q22,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(25)], 2) as q25,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(28)], 2) as q28,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(30)], 2) as q30,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(32)], 2) as q32,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(35)], 2) as q35,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(38)], 2) as q38,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(40)], 2) as q40,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(42)], 2) as q42,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(45)], 2) as q45,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(48)], 2) as q48,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(50)], 2) as q50,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(52)], 2) as q52,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(55)], 2) as q55,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(58)], 2) as q58,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(60)], 2) as q60,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(62)], 2) as q62,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(65)], 2) as q65,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(68)], 2) as q68,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(70)], 2) as q70,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(72)], 2) as q72,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(75)], 2) as q75,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(78)], 2) as q78,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(80)], 2) as q80,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(82)], 2) as q82,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(85)], 2) as q85,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(88)], 2) as q88,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(90)], 2) as q90,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(92)], 2) as q92,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(95)], 2) as q95,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(98)], 2) as q98,
round(APPROX_QUANTILES(rate_mbps, 100) [OFFSET(100)], 2) as q100,
COUNT(*) as sample_count

FROM
(
SELECT
    UNIX_SECONDS(TIMESTAMP_TRUNC(log_time, DAY)) as StartTimeStamp,
    --  web100_log_entry.snap.StartTimeStamp as StartTimeStamp,
    REGEXP_EXTRACT(test_id, r"\d\d\d\d/\d\d/[0-9]+/mlab1.(dfw02|lga03)/.*") AS sitename,
    8 * (
        web100_log_entry.snap.HCThruOctetsAcked / (
        web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd)
    ) AS rate_mbps
FROM
    -- `mlab-sandbox.batch.sidestream*`
     `mlab-sandbox.gfr.sidestream_*`
WHERE

    REGEXP_CONTAINS(test_id, r"\d\d\d\d/\d\d/[0-9]+/mlab1.(dfw02|lga03)/.*")
    AND (
             betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-16 00:00:00", "2018-02-17 00:00:00")
          OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-23 00:00:00", "2018-02-24 00:00:00")
          OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-03-02 00:00:00", "2018-03-03 00:00:00"))
    AND web100_log_entry.snap.HCThruOctetsAcked >= 819200
    AND ( web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
    AND ( web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
    AND ( web100_log_entry.snap.State = 1 OR ( web100_log_entry.snap.State >= 5 AND web100_log_entry.snap.State <= 11))
)
GROUP BY
    sitename, test_period
ORDER BY
    sitename, test_period
""")
df_ss_trial_pct = pd.DataFrame(result)




In [199]:

# One curve per test period: rate (Mbps) against percentile, built from the
# qNN columns of df_ss_trial_pct.
cols = df_ss_trial_pct['test_period']
n = df_ss_trial_pct.drop(['q100', 'sample_count', 'test_period'], axis=1)

# Transpose so each test period becomes a column holding its percentile values.
t = n.transpose()
t.columns = cols

lines = [(t[row], row) for row in cols]

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(11,6))

# x values are the column names with the leading 'q' removed ('q10' -> '10').
x = [v[1:] for v in list(n.keys())]
for l, label in lines:
    axes.plot(x, l, label=label)

#axes.set_xticklabels(list(n.keys()))
axes.legend(loc=2)
axes.set_ylabel('Mbps')
axes.set_xlabel('Percentiles')
axes.grid(color='#dddddd')
fig.suptitle('Sidestream comparing Flow-control trial to earlier periods')
plt.show()


In [204]:
# Summary of df_ss: row count, largest rate, and sqrt(row count).
# Formatted explicitly so the statement is valid on both Python 2 and Python 3.
print('%d %s %d' % (len(df_ss), max(df_ss['rate_mbps']), int(math.sqrt(len(df_ss['rate_mbps'])))))

# NOTE(review): the period labels used below ('+1w', '0w', ...) are the ones
# produced by the df_ss_trial query; the df_ss query in this notebook emits
# date-range labels and filters to dfw hosts -- confirm which frame was meant.
hosts = [
    ['mlab1.lga03', 'mlab1.lga04'],
    ['mlab1.lga05', 'mlab1.lga06'],
]

fig, axes = plt.subplots(nrows=2, ncols=2)

# Bin edges of the most recent histogram; stays empty if nothing is drawn.
bins = []

for i, host_row in enumerate(hosts):
    for j, host in enumerate(host_row):
        ax = axes[i, j]
        for period in ['+1w', '0w', '1w', '2w', '3w']: #, '4w', '5w']:
            ds = df_ss[ (df_ss['period'] == period) & (df_ss['hostname'] == host) & (df_ss['slice'] == 'ndt') ]
            if len(ds) == 0:
                continue
            label = 'pdf-%s (%d)' % (period, len(ds['rate_mbps']))
            # Histogram of log10(rate); the x axis is relabelled via logFormatter.
            log_rates = [math.log10(x) for x in ds['rate_mbps']]
            # `density` replaces the deprecated `normed` keyword; the bin
            # count is the square root of the sample count.
            # Variants tried earlier: an un-normalized histogram, and a
            # cumulative one (CDF) with one bin per sample.
            counts, bins, patches = ax.hist(log_rates, int(math.sqrt(len(log_rates))),
                                            histtype='step', density=True, label=label, ls='-')

        ax.set_xlim(math.log10(0.1), math.log10(1000))
        ax.set_axisbelow(True)
        ax.legend(loc=2)
        ax.grid(color='#dddddd')
        ax.set_title(host)
        ax.xaxis.set_major_formatter(logFormatter)

fig.suptitle('Sidestream Download Rate PDFs over three week period (0w is trial)')
plt.show()
print(len(bins))



730671 1303.9458692 854
94


# Historical (mlab-sandbox.batch) - Sidestream by Period & Slice

In [257]:
# Variations, for each period:
# * all sidestream connections from each period.
# * all sidestream connections from each period and slice
# * all sidestream connections from each period and slice and from same cohort.
# * some sidestream connections from each period and slice and from same cohort, grouped by ts & remote_ip.
# * some sidestream connections from each period and slice and from same cohort, grouped by only by remote_ip.

# Sidestream rates for mlab1.dfw* hosts in five multi-day windows, restricted
# to remote IPs that made more than 10 connections to slice 7 ('samknows') in
# both the 2017-08-12..16 and 2017-11-26..30 windows (the cohort subquery).
#
# The 'neubot' branch tests slice 9, matching the other queries in this
# notebook; testing 7 there would make the branch unreachable because the
# 'samknows' branch above it already claims 7.
# The query is a raw string so the regex escapes (\d) reach BigQuery unchanged.
# NOTE(review): the label '11-29 to 12-03' is attached to the window
# 2017-11-26 .. 2017-12-01; it is left as-is in case later cells match on
# the label text -- confirm and rename if nothing depends on it.
result = query.sync_query(
    r"""#standardSQL
    -- Only works for mlab1 addresses. May not work on all machines.
CREATE TEMPORARY FUNCTION sliceFromIP(ipaddr STRING)
    AS ( MOD(CAST(REGEXP_EXTRACT(ipaddr, r'[:.]([0-9]+)$') AS INT64), 64) - 10 );

CREATE TEMPORARY FUNCTION betweenTimes(ts INT64, starttime STRING, endtime STRING)
    AS ( TIMESTAMP_SECONDS(ts) >= TIMESTAMP(starttime) AND TIMESTAMP_SECONDS(ts) <= TIMESTAMP(endtime) );

SELECT
    CASE
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 1 THEN 'ndt'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7 THEN 'samknows'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 9 THEN 'neubot'
        ELSE 'other'
    END AS slice,
    CASE
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-07-26 00:00:00", "2017-07-30 00:00:00") THEN '07-26 to 29'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-08-12 00:00:00", "2017-08-16 00:00:00") THEN '08-12 to 16'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-10-30 00:00:00", "2017-11-02 00:00:00") THEN '10-30 to 11-02'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-11-26 00:00:00", "2017-12-01 00:00:00") THEN '11-29 to 12-03'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-21 00:00:00", "2018-02-25 00:00:00") THEN '02-21 to 25'
    ELSE 'bad'
    END AS period,
    REGEXP_EXTRACT(test_id, r"\d\d\d\d/\d\d/\d\d/(mlab[1-4].[a-z]{3}[0-9]{2})") AS hostname,
    web100_log_entry.snap.StartTimeStamp AS ts,
    8 * (web100_log_entry.snap.HCThruOctetsAcked /
      (web100_log_entry.snap.SndLimTimeRwin +
       web100_log_entry.snap.SndLimTimeCwnd +
       web100_log_entry.snap.SndLimTimeSnd)) as rate_mbps
FROM
    -- `mlab-sandbox.batch.sidestream*`
    `mlab-sandbox.gfr.sidestream_*`
WHERE
        (  betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-07-26 00:00:00", "2017-07-30 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-08-12 00:00:00", "2017-08-16 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-11-26 00:00:00", "2017-12-01 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-21 00:00:00", "2018-02-25 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-10-30 00:00:00", "2017-11-02 00:00:00") )
  AND (test_id LIKE '%mlab1.dfw%')
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000 -- 819200
  AND (web100_log_entry.snap.SndLimTimeRwin +
    web100_log_entry.snap.SndLimTimeCwnd +
    web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin +
    web100_log_entry.snap.SndLimTimeCwnd +
    web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND (web100_log_entry.snap.State = 1 OR
    (web100_log_entry.snap.State >= 5 AND
    web100_log_entry.snap.State <= 11))
  AND web100_log_entry.connection_spec.remote_ip IN(
    (SELECT
     remote_ip
    FROM (
      SELECT
         web100_log_entry.connection_spec.remote_ip as remote_ip,
         count(*) as c1
      FROM
         -- `mlab-sandbox.batch.sidestream*`
          `mlab-sandbox.gfr.sidestream_*`
      WHERE
          (test_id LIKE '%mlab1.dfw0%')
        AND TIMESTAMP_SECONDS(web100_log_entry.log_time) >= TIMESTAMP("2017-08-12 00:00:00")
        AND TIMESTAMP_SECONDS(web100_log_entry.log_time) < TIMESTAMP("2017-08-16 00:00:00")
        AND sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7
      GROUP BY
        remote_ip
      HAVING c1 > 10
    ) INNER JOIN (
      SELECT
         web100_log_entry.connection_spec.remote_ip as remote_ip,
         count(*) as c2
      FROM
         -- `mlab-sandbox.batch.sidestream*`
          `mlab-sandbox.gfr.sidestream_*`
      WHERE
          (test_id LIKE '%mlab1.dfw0%')
        AND TIMESTAMP_SECONDS(web100_log_entry.log_time) >= TIMESTAMP("2017-11-26 00:00:00")
        AND TIMESTAMP_SECONDS(web100_log_entry.log_time) < TIMESTAMP("2017-11-30 00:00:00")
        AND sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7
      GROUP BY
        remote_ip
      HAVING c2 > 10
    ) USING (remote_ip))
  )


GROUP BY
  hostname, slice, period, ts, rate_mbps
    """)
df_ss = pd.DataFrame(result)

In [220]:
#df_ss_count_raw = df_ss_count

# Daily count of distinct sidestream connections per host. The inner query
# keeps one row per (hostname, day, remote ip, remote port, local port,
# local ip) for slice 7 on the listed mlab1 sites; the outer query counts
# those rows per (hostname, day). `ts` is the day start in Unix seconds.
#
# The query is a raw string so the regex escapes (\d) reach BigQuery unchanged
# without relying on Python passing unknown escape sequences through.
result = query.sync_query(
    r"""#standardSQL
CREATE TEMPORARY FUNCTION sliceFromIP(ipaddr STRING)
    AS ( MOD(CAST(REGEXP_EXTRACT(ipaddr, r'[:.]([0-9]+)$') AS INT64), 64) - 10 );


SELECT
   hostname, ts, count(*) as count
FROM (
    SELECT
        REGEXP_EXTRACT(test_id, r"\d\d\d\d/\d\d/\d\d/(mlab[1-4].[a-z]{3}[0-9]{2})") AS hostname,
        UNIX_SECONDS(TIMESTAMP_TRUNC(log_time, DAY)) AS ts
    FROM
        -- `mlab-sandbox.batch.sidestream*`
         `mlab-sandbox.gfr.sidestream_*`
    WHERE
      REGEXP_CONTAINS(test_id, r"mlab1.(dfw|lga|iad|lax|atl|nuq)[0-9]{2}.*")
      AND sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7
      AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000 -- 819200
      AND (web100_log_entry.snap.SndLimTimeRwin +
        web100_log_entry.snap.SndLimTimeCwnd +
        web100_log_entry.snap.SndLimTimeSnd) >= 9000000
      AND (web100_log_entry.snap.SndLimTimeRwin +
        web100_log_entry.snap.SndLimTimeCwnd +
        web100_log_entry.snap.SndLimTimeSnd) < 600000000
      AND (web100_log_entry.snap.State = 1 OR
        (web100_log_entry.snap.State >= 5 AND
        web100_log_entry.snap.State <= 11))

    GROUP BY
      hostname, ts, web100_log_entry.connection_spec.remote_ip, web100_log_entry.connection_spec.remote_port, web100_log_entry.connection_spec.local_port, web100_log_entry.connection_spec.local_ip
)

GROUP BY
  hostname, ts
ORDER BY
  hostname, ts
    """)
df_ss_count = pd.DataFrame(result)



In [223]:
# Host groups to analyse; only the first group (hosts[0]) is iterated by the
# cohort query loop. Other groupings that have been used:
#   ['mlab1.dfw02', 'mlab1.dfw03', 'mlab1.dfw04', 'mlab1.dfw05']
#   ['mlab1.dfw02', 'mlab1.dfw05', 'mlab1.lga02', 'mlab1.lga03', 'mlab1.lga04', 'mlab1.lga05', 'mlab1.lga06']
#   ['mlab1.dfw02', 'mlab1.dfw05', 'mlab1.atl02', 'mlab1.atl03', 'mlab1.atl04', 'mlab1.atl05']
#   ['mlab1.lax02', 'mlab1.lax03', 'mlab1.lax04', 'mlab1.lax05']
#   ['mlab1.dfw02', 'mlab1.dfw05', 'mlab1.iad01', 'mlab1.iad02', 'mlab1.iad03', 'mlab1.iad04', 'mlab1.iad05', 'mlab1.lax02', 'mlab1.lax03', 'mlab1.lax04', 'mlab1.lax05']
#   ['mlab1.lga02', 'mlab1.lga03', 'mlab1.lga04', 'mlab1.lga05', 'mlab1.lga06']
#   ['mlab1.atl02', 'mlab1.atl03', 'mlab1.atl04', 'mlab1.atl05']
hosts = [
    ['mlab1.dfw02', 'mlab1.dfw05'],
]

# Each period runs from one boundary date to the next, so consecutive
# (start, end) pairs share a date. Other pairs that have been used:
#   (2017-08-16, 2017-08-23), (2017-10-14, 2017-11-22), (2017-11-22, 2018-01-07),
#   (2017-11-22, 2018-02-21), (2018-02-21, 2018-03-07)
period_boundaries = [
    datetime.datetime(2017, 8, 23),
    datetime.datetime(2017, 8, 28),
    datetime.datetime(2017, 10, 14),
    datetime.datetime(2017, 12, 7),
    datetime.datetime(2018, 1, 12),
    datetime.datetime(2018, 1, 21),
    datetime.datetime(2018, 2, 7),
    datetime.datetime(2018, 3, 10),
]
periods_list = list(zip(period_boundaries[:-1], period_boundaries[1:]))

In [224]:
# STREAMS WITH MATCHING COHORTS
def start_and_end(d, days=4):
    """Format the sampling window that begins at `d`.

    Args:
      d: datetime.datetime, the first instant of the window.
      days: int, length of the window in days. Defaults to 4, the length
        every existing caller relies on.

    Returns:
      A (start, end) tuple of strings formatted as "%Y-%m-%d %H:%M:%S",
      where end is `days` days after start.
    """
    fmt = "%Y-%m-%d %H:%M:%S"
    window_end = d + datetime.timedelta(days=days)
    return d.strftime(fmt), window_end.strftime(fmt)

#df_hosts = []
for i, periods in enumerate(periods_list):
    a_s, a_e = start_and_end(periods[0])
    b_s, b_e = start_and_end(periods[1])
    #df_hosts.append({})
    #if i not in [2]:
    #    print 'skipping', periods_list[i]
    #    continue
    for host in hosts[0]:
        result = query.sync_query("""
#standardSQL                                                                    
    -- Only works for mlab1 addresses. May not work on all machines.
CREATE TEMPORARY FUNCTION sliceFromIP(ipaddr STRING)
    AS ( MOD(CAST(REGEXP_EXTRACT(ipaddr, r'[:.]([0-9]+)$') AS INT64), 64) - 10 );

CREATE TEMPORARY FUNCTION betweenTimes(ts INT64, starttime STRING, endtime STRING)
    AS ( TIMESTAMP_SECONDS(ts) >= TIMESTAMP(starttime) AND TIMESTAMP_SECONDS(ts) <= TIMESTAMP(endtime) );

SELECT
    slice,
    period,
    hostname,
    remote_ip,
    AVG(sum_rate_mbps) as sum_rate_mbps

FROM (

SELECT
   slice,
   period,
   hostname,
   remote_ip,
   --AVG(rate_mbps) as rate_mbps,
   --APPROX_QUANTILES(rate_mbps, 101)[ORDINAL(50)] as med_rate_mbps,
   --MAX(rate_mbps) as max_rate_mbps,
   SUM(rate_mbps) as sum_rate_mbps
    
FROM (

SELECT
    web100_log_entry.connection_spec.remote_ip as remote_ip,
    CASE 
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 1 THEN 'ndt'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7 THEN 'samknows'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 9 THEN 'neubot'
        ELSE 'other' 
    END AS slice,
    CASE                                                                          
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+a_s+"""', '"""+a_e+"""')
            THEN '"""+a_s+"""'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+b_s+"""', '"""+b_e+"""')
            THEN '"""+b_s+"""'
    ELSE 'bad'                                                                    
    END AS period,
    REGEXP_EXTRACT(test_id, r"\d\d\d\d/\d\d/\d\d/(mlab[1-4].[a-z]{3}[0-9]{2})") AS hostname,
    web100_log_entry.snap.StartTimeStamp AS ts,                                   
    8 * (web100_log_entry.snap.HCThruOctetsAcked /                                
      (web100_log_entry.snap.SndLimTimeRwin +                                     
       web100_log_entry.snap.SndLimTimeCwnd +                                      
       web100_log_entry.snap.SndLimTimeSnd)) as rate_mbps   
FROM
    `mlab-sandbox.batch.sidestream*`                                              
WHERE
        (  betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+a_s+"""', '"""+a_e+"""')
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+b_s+"""', '"""+b_e+"""')
        )
  AND (test_id LIKE '%"""+host+"""%')            
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000 -- 819200                          
  AND (web100_log_entry.snap.SndLimTimeRwin +                                   
    web100_log_entry.snap.SndLimTimeCwnd +                                      
    web100_log_entry.snap.SndLimTimeSnd) >= 9000000                             
  AND (web100_log_entry.snap.SndLimTimeRwin +                                   
    web100_log_entry.snap.SndLimTimeCwnd +                                      
    web100_log_entry.snap.SndLimTimeSnd) < 600000000                            
  AND (web100_log_entry.snap.State = 1 OR                                       
    (web100_log_entry.snap.State >= 5 AND                                       
    web100_log_entry.snap.State <= 11))
  AND web100_log_entry.connection_spec.remote_ip IN(
    (SELECT
     remote_ip
    FROM (
      SELECT
         web100_log_entry.connection_spec.remote_ip as remote_ip,
         count(*) as c1
      FROM
         `mlab-sandbox.batch.sidestream*`
      WHERE
          (test_id LIKE '%"""+host+"""%')
        AND betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+a_s+"""', '"""+a_e+"""')
        AND sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7
        AND web100_log_entry.snap.HCThruOctetsAcked >= 819200                          
        AND (web100_log_entry.snap.SndLimTimeRwin +                                   
            web100_log_entry.snap.SndLimTimeCwnd +                                      
            web100_log_entry.snap.SndLimTimeSnd) >= 9000000                             
        AND (web100_log_entry.snap.SndLimTimeRwin +                                   
            web100_log_entry.snap.SndLimTimeCwnd +                                      
            web100_log_entry.snap.SndLimTimeSnd) < 600000000                            
        AND (web100_log_entry.snap.State = 1 OR                                       
            (web100_log_entry.snap.State >= 5 AND                                       
            web100_log_entry.snap.State <= 11))
      GROUP BY
        remote_ip
      HAVING c1 > 10
    ) INNER JOIN (
      SELECT
         web100_log_entry.connection_spec.remote_ip as remote_ip,
         count(*) as c2
      FROM
         `mlab-sandbox.batch.sidestream*`
      WHERE
          (test_id LIKE '%"""+host+"""%')
        AND betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+b_s+"""', '"""+b_e+"""')
        AND sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7
                AND web100_log_entry.snap.HCThruOctetsAcked >=  819200                          
        AND (web100_log_entry.snap.SndLimTimeRwin +                                   
            web100_log_entry.snap.SndLimTimeCwnd +                                      
            web100_log_entry.snap.SndLimTimeSnd) >= 9000000                             
        AND (web100_log_entry.snap.SndLimTimeRwin +                                   
            web100_log_entry.snap.SndLimTimeCwnd +                                      
            web100_log_entry.snap.SndLimTimeSnd) < 600000000                            
        AND (web100_log_entry.snap.State = 1 OR                                       
            (web100_log_entry.snap.State >= 5 AND                                       
            web100_log_entry.snap.State <= 11))
      GROUP BY
        remote_ip
      HAVING c2 > 10
    ) USING (remote_ip))
  )
    
GROUP BY
  hostname, slice, period, ts, web100_log_entry.connection_spec.remote_ip, rate_mbps
)

GROUP BY
  hostname, slice, period, ts,  remote_ip
)

GROUP BY
  hostname, slice, period, remote_ip
""")
        df_hosts[i][host] = pd.DataFrame(result)
        print 'saved', periods_list[i], host, len(df_hosts[i][host])
#df_ss = pd.DataFrame(result)



saved (datetime.datetime(2017, 8, 23, 0, 0), datetime.datetime(2017, 8, 28, 0, 0)) mlab1.dfw02 190




saved (datetime.datetime(2017, 8, 23, 0, 0), datetime.datetime(2017, 8, 28, 0, 0)) mlab1.dfw05 318




saved (datetime.datetime(2017, 8, 28, 0, 0), datetime.datetime(2017, 10, 14, 0, 0)) mlab1.dfw02 104




saved (datetime.datetime(2017, 8, 28, 0, 0), datetime.datetime(2017, 10, 14, 0, 0)) mlab1.dfw05 233




saved (datetime.datetime(2017, 10, 14, 0, 0), datetime.datetime(2017, 12, 7, 0, 0)) mlab1.dfw02 128




saved (datetime.datetime(2017, 10, 14, 0, 0), datetime.datetime(2017, 12, 7, 0, 0)) mlab1.dfw05 237




saved (datetime.datetime(2017, 12, 7, 0, 0), datetime.datetime(2018, 1, 12, 0, 0)) mlab1.dfw02 192




saved (datetime.datetime(2017, 12, 7, 0, 0), datetime.datetime(2018, 1, 12, 0, 0)) mlab1.dfw05 323




saved (datetime.datetime(2018, 1, 12, 0, 0), datetime.datetime(2018, 1, 21, 0, 0)) mlab1.dfw02 383




saved (datetime.datetime(2018, 1, 12, 0, 0), datetime.datetime(2018, 1, 21, 0, 0)) mlab1.dfw05 569




saved (datetime.datetime(2018, 1, 21, 0, 0), datetime.datetime(2018, 2, 7, 0, 0)) mlab1.dfw02 250




saved (datetime.datetime(2018, 1, 21, 0, 0), datetime.datetime(2018, 2, 7, 0, 0)) mlab1.dfw05 367




saved (datetime.datetime(2018, 2, 7, 0, 0), datetime.datetime(2018, 3, 10, 0, 0)) mlab1.dfw02 162




saved (datetime.datetime(2018, 2, 7, 0, 0), datetime.datetime(2018, 3, 10, 0, 0)) mlab1.dfw05 214


In [210]:
print df_hosts[6].keys()

['mlab1.dfw05', 'mlab1.dfw02', 'mlab1.iad04', 'mlab1.iad05', 'mlab1.iad01', 'mlab1.iad02', 'mlab1.iad03', 'mlab1.lax03', 'mlab1.lax02', 'mlab1.lax05', 'mlab1.lax04']


In [160]:
#ds = df_hosts[0]['mlab1.dfw02']
#a = ds[ (ds['slice'] == 'samknows') & (ds['period'] == '2017-08-23 00:00:00')]
##b = ds[ (ds['slice'] == 'samknows') & (ds['period'] == '2017-08-28 00:00:00')]
#a, b

#pd.merge(a, b,  how='left', left_on=['hostname', 'remote_ip', 'slice'], right_on = ['hostname', 'remote_ip', 'slice'])

#a.join(b.set_index('remote_ip'), on='remote_ip', lsuffix='_a', rsuffix='_b')

In [10]:
# ALL STREAMS per PERIOD

def start_and_end(d, days=4):
    """Format the sampling window that begins at `d`.

    Args:
      d: datetime.datetime, the first instant of the window.
      days: int, length of the window in days. Defaults to 4, the length
        every existing caller relies on.

    Returns:
      A (start, end) tuple of strings formatted as "%Y-%m-%d %H:%M:%S",
      where end is `days` days after start.
    """
    fmt = "%Y-%m-%d %H:%M:%S"
    window_end = d + datetime.timedelta(days=days)
    return d.strftime(fmt), window_end.strftime(fmt)

df_hosts = []
for i, periods in enumerate(periods_list):
    a_s, a_e = start_and_end(periods[0])
    b_s, b_e = start_and_end(periods[1])
    df_hosts.append({})
    #if i not in [2]:
    #    print 'skipping', periods_list[i]
    #    continue
    for host in hosts[0]:
        result = query.sync_query("""
#standardSQL                                                                    
    -- Only works for mlab1 addresses. May not work on all machines.
CREATE TEMPORARY FUNCTION sliceFromIP(ipaddr STRING)
    AS ( MOD(CAST(REGEXP_EXTRACT(ipaddr, r'[:.]([0-9]+)$') AS INT64), 64) - 10 );

CREATE TEMPORARY FUNCTION betweenTimes(ts INT64, starttime STRING, endtime STRING)
    AS ( TIMESTAMP_SECONDS(ts) >= TIMESTAMP(starttime) AND TIMESTAMP_SECONDS(ts) <= TIMESTAMP(endtime) );

SELECT
   slice,
   period,
   hostname,
   AVG(rate_mbps) as rate_mbps,
   APPROX_QUANTILES(rate_mbps, 101)[ORDINAL(50)] as med_rate_mbps,
   MAX(rate_mbps) as max_rate_mbps,
   SUM(rate_mbps) as sum_rate_mbps
    
FROM (

SELECT
    web100_log_entry.connection_spec.remote_ip as remote_ip,
    CASE 
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 1 THEN 'ndt'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7 THEN 'samknows'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7 THEN 'neubot'
        ELSE 'other' 
    END AS slice,
    CASE                                                                          
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+a_s+"""', '"""+a_e+"""')
            THEN '"""+a_s+"""'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+b_s+"""', '"""+b_e+"""')
            THEN '"""+b_s+"""'
    ELSE 'bad'                                                                    
    END AS period,
    REGEXP_EXTRACT(test_id, r"\d\d\d\d/\d\d/\d\d/(mlab[1-4].[a-z]{3}[0-9]{2})") AS hostname,
    web100_log_entry.snap.StartTimeStamp AS ts,                                   
    8 * (web100_log_entry.snap.HCThruOctetsAcked /                                
      (web100_log_entry.snap.SndLimTimeRwin +                                     
       web100_log_entry.snap.SndLimTimeCwnd +                                      
       web100_log_entry.snap.SndLimTimeSnd)) as rate_mbps   
FROM
    `mlab-sandbox.batch.sidestream*`                                              
WHERE
        (  betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+a_s+"""', '"""+a_e+"""')
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+b_s+"""', '"""+b_e+"""')
        )
  AND (test_id LIKE '%"""+host+"""%')            
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000 -- 819200                          
  AND (web100_log_entry.snap.SndLimTimeRwin +                                   
    web100_log_entry.snap.SndLimTimeCwnd +                                      
    web100_log_entry.snap.SndLimTimeSnd) >= 9000000                             
  AND (web100_log_entry.snap.SndLimTimeRwin +                                   
    web100_log_entry.snap.SndLimTimeCwnd +                                      
    web100_log_entry.snap.SndLimTimeSnd) < 600000000                            
  AND (web100_log_entry.snap.State = 1 OR                                       
    (web100_log_entry.snap.State >= 5 AND                                       
    web100_log_entry.snap.State <= 11))
    
GROUP BY
  hostname, slice, period, ts, web100_log_entry.connection_spec.remote_ip, rate_mbps
)

GROUP BY
  hostname, slice, period,   remote_ip -- CAST(ts/4 AS INT64),
""")
        df_hosts[i][host] = pd.DataFrame(result)
        print 'saved', periods_list[i], host, len(df_hosts[i][host])
#df_ss = pd.DataFrame(result)

saved (datetime.datetime(2017, 8, 16, 0, 0), datetime.datetime(2017, 8, 23, 0, 0)) mlab1.dfw02 31008
saved (datetime.datetime(2017, 8, 16, 0, 0), datetime.datetime(2017, 8, 23, 0, 0)) mlab1.dfw03 29927
saved (datetime.datetime(2017, 8, 16, 0, 0), datetime.datetime(2017, 8, 23, 0, 0)) mlab1.dfw04 29941
saved (datetime.datetime(2017, 8, 16, 0, 0), datetime.datetime(2017, 8, 23, 0, 0)) mlab1.dfw05 31220
saved (datetime.datetime(2017, 8, 23, 0, 0), datetime.datetime(2017, 8, 28, 0, 0)) mlab1.dfw02 31113
saved (datetime.datetime(2017, 8, 23, 0, 0), datetime.datetime(2017, 8, 28, 0, 0)) mlab1.dfw03 30294
saved (datetime.datetime(2017, 8, 23, 0, 0), datetime.datetime(2017, 8, 28, 0, 0)) mlab1.dfw04 29945
saved (datetime.datetime(2017, 8, 23, 0, 0), datetime.datetime(2017, 8, 28, 0, 0)) mlab1.dfw05 31137
saved (datetime.datetime(2017, 8, 28, 0, 0), datetime.datetime(2017, 11, 22, 0, 0)) mlab1.dfw02 31078
saved (datetime.datetime(2017, 8, 28, 0, 0), datetime.datetime(2017, 11, 22, 0, 0)) mlab1.

## Sidestream CDFs

In [13]:
title = 'CDF - per-slice sidestream Download Rate CDFs'
print len(df_ss), max(df_ss['rate_mbps']) # 'sqrt', int(math.sqrt(len(df['rate_mbps'])))

hosts = [
    ['mlab1.dfw02', 'mlab1.dfw03', 'mlab1.dfw04'],
]

fig, axes = plt.subplots(nrows=2, ncols=len(hosts[0]), figsize=(13, 10))

for i, host_row in enumerate(hosts):
    for j, host in enumerate(host_row):
        if len(df_ss[ df_ss['hostname'] == host ]) == 0:
            print 'skipping', host
            continue
        for period in ['08-12 to 16', '11-29 to 12-03', '02-21 to 25']: #set(df['period']):
            ds = df_ss[ (df_ss['period'] == period) & (df_ss['hostname'] == host) ]
            for k, slicename in enumerate(['ndt', 'samknows']): # set(ds['slice']):
                #print 'sqrt', int(math.sqrt(len(ds['rate_mbps'])))
                d = ds[ ds['slice'] == slicename ]
                if len(d) == 0:
                    continue
                ax = axes[k, j]
                #n, bins, patches = ax.hist(ds['rate_mbps'], len(ds['rate_mbps']),
                #                   histtype='step', normed=1, cumulative=True, label='cdf-' + period,
                #                   ls='-')
                r = [math.log10(x) for x in d['rate_mbps']]
                n, bins, patches = ax.hist(d['rate_mbps'], int(math.sqrt(len(d['rate_mbps']))),
                                   histtype='step', normed=1, cumulative=True, label=('cdf-' + period + '-' + slicename), 
                                   ls='-')

                ax.set_xlim(1, 200)       
                ax.set_axisbelow(True)
        #ax.semilogx()
                ax.legend(loc=4, fontsize='x-small')

                ax.grid(color='#dddddd')
                ax.set_title(host)
        #labels = ['%.2f' % math.pow(10, float(l)) for l in ax.get_xticks()]
        #ax.xaxis.set_major_formatter(logFormatter)
        

fig.suptitle(title)

plt.show()
#print n, len(bins)

10079 507.5426899233347


## Sidestream PDFs

In [14]:
title = 'PDF - per-slice sidestream Download Rate PDFs'
print len(df_ss), max(df_ss['rate_mbps']) # 'sqrt', int(math.sqrt(len(df['rate_mbps'])))

hosts = [
    ['mlab1.dfw02', 'mlab1.dfw03', 'mlab1.dfw04'],
]

fig, axes = plt.subplots(nrows=2, ncols=len(hosts[0]), figsize=(13, 10))
for i, host_row in enumerate(hosts):
    for j, host in enumerate(host_row):
        if len(df_ss[ df_ss['hostname'] == host ]) == 0:
            print 'skipping', host
            continue
        for period in ['08-12 to 16', '02-21 to 25']: # '11-29 to 12-03',  #set(df['period']):
            ds = df_ss[ (df_ss['period'] == period) & (df_ss['hostname'] == host) ]
            for k, slicename in enumerate(['ndt', 'samknows']): # set(ds['slice']):
                #print 'sqrt', int(math.sqrt(len(ds['rate_mbps'])))
                d = ds[ ds['slice'] == slicename ]
                if len(d) == 0:
                    continue
                ax = axes[k, j]

                label = 'pdf-%s-%s (%d)' % (period, slicename, len(d['rate_mbps']))
                r = [math.log10(x) for x in d['rate_mbps']]
                n, bins, patches = ax.hist(r, int(math.sqrt(len(d['rate_mbps']))),
                                   histtype='step', normed=1, label=label, 
                                   ls='-')

                #ax.set_xlim(1, 200)       
                #ax.semilogx()
                #ax.set_ylim(0, 1.4)
                ax.set_axisbelow(True)
                ax.legend(loc=2, fontsize='x-small')

                ax.grid(color='#dddddd')
                ax.set_title(host)
                ax.xaxis.set_major_formatter(logFormatter)
        
fig.suptitle(title)
plt.show()


10079 507.5426899233347


## PDF, CDF, & Switch - by Site and Slice

In [242]:
# extract metros
metros = set([h[6:9] for h in df_hosts[0].keys()])
print metros
hosts = []
for i, metro in enumerate(metros):
    m = []
    for h in df_hosts[0].keys():
        if metro in h:
            m.append(h)
    hosts.append(sorted(m))
print hosts

set(['iad', 'dfw', 'lga', 'lax', 'atl'])
[['mlab1.iad01', 'mlab1.iad02', 'mlab1.iad03', 'mlab1.iad04', 'mlab1.iad05'], ['mlab1.dfw02', 'mlab1.dfw05'], ['mlab1.lga02', 'mlab1.lga03', 'mlab1.lga04', 'mlab1.lga05', 'mlab1.lga06'], ['mlab1.lax02', 'mlab1.lax03', 'mlab1.lax04', 'mlab1.lax05'], ['mlab1.atl02', 'mlab1.atl03', 'mlab1.atl04', 'mlab1.atl05']]


In [211]:
print df_hosts[0]['mlab1.dfw02']

        hostname               period  \
0    mlab1.dfw02  2017-08-23 00:00:00   
1    mlab1.dfw02  2017-08-23 00:00:00   
2    mlab1.dfw02  2017-08-23 00:00:00   
3    mlab1.dfw02  2017-08-23 00:00:00   
4    mlab1.dfw02  2017-08-23 00:00:00   
5    mlab1.dfw02  2017-08-23 00:00:00   
6    mlab1.dfw02  2017-08-23 00:00:00   
7    mlab1.dfw02  2017-08-23 00:00:00   
8    mlab1.dfw02  2017-08-23 00:00:00   
9    mlab1.dfw02  2017-08-23 00:00:00   
10   mlab1.dfw02  2017-08-23 00:00:00   
11   mlab1.dfw02  2017-08-23 00:00:00   
12   mlab1.dfw02  2017-08-23 00:00:00   
13   mlab1.dfw02  2017-08-23 00:00:00   
14   mlab1.dfw02  2017-08-23 00:00:00   
15   mlab1.dfw02  2017-08-23 00:00:00   
16   mlab1.dfw02  2017-08-23 00:00:00   
17   mlab1.dfw02  2017-08-23 00:00:00   
18   mlab1.dfw02  2017-08-23 00:00:00   
19   mlab1.dfw02  2017-08-23 00:00:00   
20   mlab1.dfw02  2017-08-23 00:00:00   
21   mlab1.dfw02  2017-08-23 00:00:00   
22   mlab1.dfw02  2017-08-23 00:00:00   
23   mlab1.dfw02

In [225]:
title = 'PDF, CDF, Switch - slice sidestream Download Rates'
#print len(df_ss), max(df_ss['rate_mbps']) # 'sqrt', int(math.sqrt(len(df['rate_mbps'])))

d = None
df=None
label2date = {}
#slices = ['samknows', 'ndt']
slices = ['samknows']
colors = plt.cm.Dark2.colors
colors = plt.cm.tab10.colors
p2c = {}
c=0


for i, host_row in enumerate(hosts):
    for j, host in enumerate(host_row):

        rows = 4
        cols = len(periods_list)
        fig = plt.figure(figsize=(4 * cols, 13))
        axes = [
            [None] * cols,
            [None] * cols,
            [None] * cols,
            None,
        ]

        for p, times in enumerate(periods_list):
            axes[0][p] = plt.subplot2grid((rows, cols), (0, p))
            axes[1][p] = plt.subplot2grid((rows, cols), (1, p))
            axes[2][p] = plt.subplot2grid((rows, cols), (2, p))

            for k, slicename in enumerate(slices):
                
                df_ss = df_hosts[p][host]
                if len(df_ss) == 0:
                    print 'skipping', host, 'no data'
                    continue
                if len(df_ss[ df_ss['hostname'] == host ]) == 0:
                    print 'skipping', host
                    continue
                
                t_a, t_b = times
                p_a, p_b = t_a.strftime("%Y-%m-%d %H:%M:%S"), t_b.strftime("%Y-%m-%d %H:%M:%S")
                
                a = df_ss[ (df_ss['slice'] == slicename) & (df_ss['period'] == p_a) ]
                b = df_ss[ (df_ss['slice'] == slicename) & (df_ss['period'] == p_b) ]
                #print 'len ab', len(a), len(b)

                columns = ['hostname', 'remote_ip', 'slice']
                ds = pd.merge(a, b,  how='left', left_on=columns, right_on=columns)
                #print ds   
                for period_str in [p_a, p_b]:
                    if period_str not in p2c:
                        p2c[period_str] = colors[c]
                        c += 1
                
                if True:
                    #ds = df_ss[ (df_ss['period'] == period_str) &
                    #            (df_ss['hostname'] == host) &
                    #            (df_ss['slice'] == slicename) ]
                    if len(ds['sum_rate_mbps_x'].dropna()) == 0 or len(ds['sum_rate_mbps_y'].dropna()) == 0:
                        continue

                # Top
                ax = axes[0][p]
                for period, l in [(t_a, ds['sum_rate_mbps_x']), (t_b, ds['sum_rate_mbps_y'])]:
                    vals = [math.log10(x) for x in l.dropna()]
                    period_str = period.strftime("%Y-%m-%d %H:%M:%S")
                    label = 'pdf-%s-%s (%d)' % (period_str, slicename, len(vals))
                    label2date[label] = period
                    #color = p2c[period_str]
                    
                    #print len(l), len(l.dropna()), int(math.sqrt(len(vals)))
                    #print label
                    #print color
                    
                    sqrt_bins = int(math.sqrt(len(vals)))
                    #print 'bins', sqrt_bins
                    #print period_str, label, color
                    n, bins, patches = ax.hist(
                            vals, sqrt_bins,
                            histtype='step', normed=1, label=label, ls='-', color=p2c[period_str])

                ax.set_axisbelow(True)
                ax.legend(fontsize='x-small', loc='upper center', bbox_to_anchor=(0.5, 1.3))
                ax.grid(color='#dddddd')
                ax.set_title(host)
                ax.xaxis.set_major_formatter(logFormatter)

                # Middle
                ax = axes[1][p]
                for period, l in [(t_a, ds['sum_rate_mbps_x']), (t_b, ds['sum_rate_mbps_y'])]:
                    vals = [math.log10(x) for x in  l.dropna()]
                    period_str = period.strftime("%Y-%m-%d %H:%M:%S")
                    label = 'cdf-%s-%s (%d)' % (period_str, slicename, len(vals))

                    n, bins, patches = ax.hist(vals, len(vals),
                                               histtype='step', normed=1, cumulative=True, label=label, ls='-',
                                               color=p2c[period_str])

                    ax.xaxis.set_major_formatter(logFormatter)
                    ax.set_axisbelow(True)
                    #ax.legend(fontsize='x-small', loc='upper center', bbox_to_anchor=(0.5, 1.3))

                    ax.grid(color='#dddddd')
                    ax.set_title(host)
                    if p != 0:
                        ax.set_yticklabels([])

                if True:
                    # Bottom
                    #t_a, t_b = times
                    #p_a, p_b = t_a.strftime("%Y-%m-%d %H:%M:%S"), t_b.strftime("%Y-%m-%d %H:%M:%S")
                    #a = df_ss[ (df_ss['slice'] == slicename) & (df_ss['period'] == p_a) ]
                    #b = df_ss[ (df_ss['slice'] == slicename) & (df_ss['period'] == p_b) ]

                    #columns = ['hostname', 'remote_ip', 'slice']
                    #d = pd.merge(a, b,  how='left', left_on=columns, right_on=columns)

                    ax = axes[2][p]
                    
                    label = 'scatter-%s (%d)/(%d)' % (slicename, len(ds['sum_rate_mbps_x']), len(ds['sum_rate_mbps_y']))
                    
                    ax.plot([0.1, 1000], [0.1, 1000], color='r', alpha=0.1)
                    ax.add_patch(
                        matplotlib.patches.Polygon(
                            [[.1, .1], [1000, .1], [1000, 1000], [.1, .1]], closed=True,
                            fill=True, color=p2c[p_b], alpha=0.1))
                    ax.add_patch(
                        matplotlib.patches.Polygon(
                            [[.1, .1], [.1, 1000], [1000, 1000], [.1, .1]], closed=True,
                            fill=True, color=p2c[p_a], alpha=0.1))
                    ax.scatter(ds['sum_rate_mbps_y'], ds['sum_rate_mbps_x'], s=2, alpha=0.3, label=label)
                    
                    #ax.scatter([100], [200])
                    ax.set_xlim(.1, 1000)
                    ax.set_ylim(.1, 1000)
                    
                    #ax.set_xlabel('slow')
                    #ax.set_ylabel('fast')
                    ax.set_xlabel(p_b)
                    ax.set_ylabel(p_a)

                    
                    #ax.xaxis.set_major_formatter(logFormatter)

                    #ax.set_axisbelow(True)
                    ax.grid(color='#dddddd')
                    ax.semilogx()
                    ax.semilogy()
                    ax.legend(fontsize='x-small')

                    #ax.set_title(host)
                    #if p != 0:
                    #    ax.set_yticklabels([])
                        
            axes[0][p].set_xlim(math.log10(.1), math.log10(1100))
            axes[1][p].set_xlim(math.log10(.1), math.log10(1100))

        if True:
            print 'last'
            axes[3] = plt.subplot2grid((rows, cols), (3, 0), colspan=cols)
            ax = axes[3]
        
            ds = df_disco[ df_disco['hostname'] == host ]
            ax.plot_date(dates.epoch2num(ds['ts']), ds['pct_discards'], ls='-', ms=0, label='switch', color='mediumpurple')
        
            ax.set_title(host)
            ax.set_ylim(-0.01, 1)
            ax.tick_params(axis='x', labelrotation=90)
            ax.grid(color='#dddddd')
            #ax.legend(loc=4, fontsize='x-small') 
 
            # Color switch regions with the PDF periods based on legend colors.
            for p in range(0, len(periods_list)):
                h, l = axes[0][p].get_legend_handles_labels()
                for k, line in enumerate(h):
                    s = label2date[l[k]]
                    e = s + datetime.timedelta(days=4)
                    color = h[k].get_edgecolor()
                    ax.axvspan(dates.date2num(s), dates.date2num(e), alpha=0.5, color=color)

            ax.set_ylabel('% discard timebins')                
            ax2 = ax.twinx()
        
            ds = df_ss_count[ df_ss_count['hostname'] == host ]
            ax2.plot_date(dates.epoch2num(ds['ts']), ds['count'], ls='-', ms=0, label='sidestream')

            if p != 4:
                ax2.set_yticklabels([])
            else:
                ax2.set_ylabel('Sidestream Flow Count')

            ax2.grid(color='#dddddd')
            ax.legend(loc=3, fontsize='small') 
            ax2.legend(loc=1, fontsize='small') 

     
        axes[0][0].set_ylabel('PDF')
        axes[1][0].set_ylabel('CDF')
        #axes[2, 0].set_ylabel('% discard timebins')    
        #axes[2, 0].set_ylabel('Sidestream Flow Count')    

        fig.suptitle(title) # + ('\n%s' % [period.strftime("%Y-%m-%d %H:%M:%S") for period in times]))
        fig.subplots_adjust(hspace=0.3, wspace=0.4)
        plt.show()

#plt.hist2d(
#    df_ss[ (df_ss['period'] == '08-12 to 16') & (df_ss['hostname'] == 'mlab1.dfw02') & (df_ss['slice'] == 'samknows') ],
#    df_ss[ (df_ss['period'] == '02-21 to 25') & (df_ss['hostname'] == 'mlab1.dfw02') & (df_ss['slice'] == 'samknows') ],
#    bins=40,
#)
#print n, len(bins)

last
last


In [263]:
#print ds