In [1]:
# Select matplotlib's default GUI backend so that figures open in their own
# windows outside of the browser.
# If this cell is not run, figures will load inline in the notebook.
%matplotlib

Using matplotlib backend: MacOSX


In [2]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import matplotlib.ticker
import datetime

# Some matplotlib features are version dependent.
# Compare the version numerically: comparing the raw strings would order
# '10.0.0' before '2.1.2'. Non-numeric components (e.g. 'rc1') are ignored.
assert tuple(int(part) for part in matplotlib.__version__.split('.')[:3]
             if part.isdigit()) >= (2, 1, 2)

# Depends on: pip install --upgrade google-cloud-bigquery
import query

In [3]:
def unlog(x, pos):
    """Render a log10-scaled tick value as its linear value.

    Args:
        x: tick position expressed in log10 units.
        pos: tick index passed by matplotlib's FuncFormatter; not used.

    Returns:
        10**x formatted as a string with two decimal places.
    """
    linear_value = math.pow(10, x)
    return '{:.2f}'.format(linear_value)

# Tick formatter that passes each tick value through unlog(), so axes that were
# fed log10 values are labeled with the corresponding linear values.
customFormatter = matplotlib.ticker.FuncFormatter(unlog)

In [4]:
# Daily per-host switch uplink counters, loaded into df_disco.
# Columns: hostname, day (YYYY-MM-DD), ts (unix seconds at the start of the
# day), total_discards, total_packets and pct_discards.
# pct_discards is the number of samples with a non-zero discard count divided
# by 8640 -- presumably the number of 10-second timebins in a day (TODO:
# confirm the sample interval). It is a fraction of the day, not a percentage.
# The '.' in the hostname pattern is escaped so it only matches a literal dot.
result = query.sync_query("""
#standardSQL
SELECT
  name AS hostname,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,
  SUM(IF(metric = 'switch.discards.uplink.tx', value, 0)) AS total_discards,
  SUM(IF(metric = 'switch.unicast.uplink.tx', value, 0)) AS total_packets,
  COUNTIF(metric = 'switch.discards.uplink.tx' AND value > 0) / 8640 AS pct_discards

FROM (
  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4]\.[a-z]{3}[0-9]{2}).*') AS name,
    sample.timestamp AS sts,
    sample.value AS value
  FROM
    `mlab-sandbox.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
    metric LIKE 'switch.discards.uplink.tx' OR metric LIKE 'switch.unicast.uplink.tx'
  GROUP BY
    hostname, metric, sts, value
)
WHERE
  name IS NOT NULL
GROUP BY
  hostname, day, ts
ORDER BY
  hostname, day, ts
""")

df_disco = pd.DataFrame(result)

# Discards over time

In [5]:
# 3x3 grid of site codes; each subplot shows the mlab1 hosts of one site.
sites = [
    ['dfw', 'lga', 'iad'],
    ['sea', 'atl', 'den'],
    ['mia', 'nuq', 'ord'],
]

# Only daily totals strictly between 100 and 1000000 are drawn; the same bounds
# are used as the y-axis limits below. The filter does not depend on the site
# or host, so it is computed once.
discards_in_range = df_disco[(df_disco['total_discards'] > 100) &
                             (df_disco['total_discards'] < 1000000)]
# Sorted so that line colors and legend order are stable from run to run
# (iterating a bare set gives an arbitrary order).
hostnames = sorted(set(df_disco['hostname']))

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 10))
for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i, j]
        # Tick labels are only kept on the left column and the bottom row.
        if j != 0:
            ax.set_yticklabels([])
        if i != len(sites) - 1:
            ax.set_xticklabels([])
        for h in hostnames:
            if ('mlab1.' + site) in h:
                ds = discards_in_range[discards_in_range['hostname'] == h]
                # h[6:11] is the site+number part of 'mlab1.xxxNN'.
                ax.plot_date(dates.epoch2num(ds['ts']), ds['total_discards'],
                             ls='-', ms=0, label=h[6:11])

        ax.set_title(site)
        ax.set_ylim(100, 1000000)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=4, fontsize='x-small')
        ax.semilogy()

fig.subplots_adjust(hspace=0.3, wspace=0.4)
fig.suptitle('Discards over time')

Text(0.5,0.98,u'Discards over time')

# Percent of Timebins with Discards 

In [6]:
title = 'Daily percentage of timebins with any discards'
# Grid of hostnames; one subplot per host, laid out like this nested list.
sites = [
    ['mlab1.dfw02', 'mlab1.dfw03', 'mlab1.dfw04'],
]

# squeeze=False keeps `axes` two-dimensional whatever the grid shape, so the
# same axes[i, j] indexing works for one row, one column or a full grid.
fig, axes = plt.subplots(nrows=len(sites), ncols=len(sites[0]), squeeze=False)
for i, host_row in enumerate(sites):
    for j, host in enumerate(host_row):
        ax = axes[i, j]

        ds = df_disco[df_disco['hostname'] == host]
        ax.plot_date(dates.epoch2num(ds['ts']), ds['pct_discards'],
                     ls='-', ms=0, label=host)

        ax.set_title(host)
        ax.set_ylim(-0.01, .4)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=4, fontsize='x-small')

fig.subplots_adjust(hspace=0.3, wspace=0.4)
fig.suptitle(title)

Text(0.5,0.98,u'Daily percentage of timebins with any discards')

# Total Packet Discard Ratios (Switch Loss Rate)

In [8]:
# 3x3 grid of site codes; each subplot shows the mlab1 hosts of one site.
sites = [
    ['dfw', 'lga', 'iad'],
    ['sea', 'atl', 'den'],
    ['mia', 'nuq', 'ord'],
]

# Only days whose discard total is strictly between 100 and 1000000 are used.
# The filter does not depend on the site or host, so it is computed once.
discards_in_range = df_disco[(df_disco['total_discards'] > 100) &
                             (df_disco['total_discards'] < 1000000)]
# Sorted so that line colors and legend order are stable from run to run
# (iterating a bare set gives an arbitrary order).
hostnames = sorted(set(df_disco['hostname']))

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 10))
for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i, j]
        ax.set_title(site)
        # Tick labels are only kept on the left column and the bottom row.
        if j != 0:
            ax.set_yticklabels([])
        if i != len(sites) - 1:
            ax.set_xticklabels([])
        if j == 0:
            ax.set_ylabel('Percent Loss')

        for h in hostnames:
            if 'mlab1.' + site in h:
                ds = discards_in_range[discards_in_range['hostname'] == h]
                # Discards as a percentage of unicast packets for the day.
                ratio = 100 * ds['total_discards'] / ds['total_packets']
                ax.plot_date(dates.epoch2num(ds['ts']), ratio,
                             ls='-', ms=0, label=h[:11])
        ax.set_ylim(10**-4, 10**-1)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=4, fontsize='x-small')
        ax.semilogy()

fig.subplots_adjust(hspace=0.3, wspace=0.4)
fig.suptitle('Switch Packet Loss Rate')

Text(0.5,0.98,'Switch Packet Loss Rate')

# Flow-Control Trial (measurement-lab.public)

In [208]:
# cat sidestream.sql | bq query --format=csv --max_rows=1000000 --nouse_legacy_sql > sidestream-trial-6w.csv
#df = pd.read_csv('sidestream-trial-6w.csv')

# Sidestream download rates for mlab1.dfwNN hosts on seven single-day windows,
# one week apart, labeled '5w' ... '0w', '+1w'. Each row carries the slice
# (derived from the last component of the local IP), the period label, the
# hostname parsed from test_id, the start timestamp and rate_mbps.
# The '.' characters in the hostname patterns are escaped so that they only
# match a literal dot.
result = query.sync_query("""
#standardSQL                                                                    
    -- Only works for mlab1 addresses. May not work on all machines.
CREATE TEMPORARY FUNCTION sliceFromIP(ipaddr STRING)
    AS ( MOD(CAST(REGEXP_EXTRACT(ipaddr, r'[:.]([0-9]+)$') AS INT64), 64) - 10 );

CREATE TEMPORARY FUNCTION betweenTimes(ts INT64, starttime STRING, endtime STRING)
    AS ( TIMESTAMP_SECONDS(ts) >= TIMESTAMP(starttime) AND TIMESTAMP_SECONDS(ts) <= TIMESTAMP(endtime) );

SELECT
    CASE 
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 1 THEN 'ndt'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7 THEN 'samknows'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 9 THEN 'neubot'
        ELSE 'other' 
    END AS slice,
    CASE
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-01-26 00:00:00", "2018-01-27 00:00:00") THEN '5w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-02 00:00:00", "2018-02-03 00:00:00") THEN '4w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-09 00:00:00", "2018-02-10 00:00:00") THEN '3w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-16 00:00:00", "2018-02-17 00:00:00") THEN '2w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-23 00:00:00", "2018-02-24 00:00:00") THEN '1w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-03-02 00:00:00", "2018-03-03 00:00:00") THEN '0w'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-03-09 00:00:00", "2018-03-10 00:00:00") THEN '+1w'
    ELSE 'unknown'                                                                    
    END AS period,
    REGEXP_EXTRACT(test_id, r"\d\d\d\d/\d\d/\d\d/(mlab[1-4]\.[a-z]{3}[0-9]{2})") AS hostname,
    web100_log_entry.snap.StartTimeStamp AS ts,                                   
    8 * (web100_log_entry.snap.HCThruOctetsAcked /                                
      (web100_log_entry.snap.SndLimTimeRwin +                                     
       web100_log_entry.snap.SndLimTimeCwnd +                                      
       web100_log_entry.snap.SndLimTimeSnd)) as rate_mbps   
FROM
    `measurement-lab.public.sidestream`

WHERE
        (  betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-01-26 00:00:00", "2018-01-27 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-02 00:00:00", "2018-02-03 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-09 00:00:00", "2018-02-10 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-16 00:00:00", "2018-02-17 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-23 00:00:00", "2018-02-24 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-03-02 00:00:00", "2018-03-03 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-03-09 00:00:00", "2018-03-10 00:00:00")
        )
  AND REGEXP_CONTAINS(test_id, r"mlab1\.(dfw\d\d)")
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000 -- 819200                          
  AND (web100_log_entry.snap.SndLimTimeRwin +                                   
    web100_log_entry.snap.SndLimTimeCwnd +                                      
    web100_log_entry.snap.SndLimTimeSnd) >= 9000000                             
  AND (web100_log_entry.snap.SndLimTimeRwin +                                   
    web100_log_entry.snap.SndLimTimeCwnd +                                      
    web100_log_entry.snap.SndLimTimeSnd) < 600000000                            
  AND (web100_log_entry.snap.State = 1 OR                                       
    (web100_log_entry.snap.State >= 5 AND                                       
    web100_log_entry.snap.State <= 11))
    
GROUP BY
  hostname, slice, period, ts, rate_mbps
""")
df_ss = pd.DataFrame(result)

In [204]:
# Summary of df_ss: row count, maximum rate and sqrt(row count).
print('%s %s %s' % (len(df_ss), max(df_ss['rate_mbps']),
                    int(math.sqrt(len(df_ss['rate_mbps'])))))

# NOTE(review): these are mlab1.lga hosts, while the trial query in this
# notebook only selects mlab1.dfw hosts; with that df_ss every subset below is
# empty. Confirm which df_ss this cell is meant to be run against.
hosts = [
    ['mlab1.lga03', 'mlab1.lga04'],
    ['mlab1.lga05', 'mlab1.lga06'],
]

fig, axes = plt.subplots(nrows=2, ncols=2)

# Bin edges of the most recently drawn histogram; stays None when no subset
# had data, so the final report cannot fail on an undefined name.
bins = None
for i, host_row in enumerate(hosts):
    for j, host in enumerate(host_row):
        ax = axes[i, j]
        for period in ['+1w', '0w', '1w', '2w', '3w']:  # '4w' and '5w' are not plotted
            ds = df_ss[(df_ss['period'] == period) &
                       (df_ss['hostname'] == host) &
                       (df_ss['slice'] == 'ndt')]
            if len(ds) == 0:
                continue
            label = 'pdf-%s (%d)' % (period, len(ds['rate_mbps']))
            # Histogram of log10(rate); customFormatter relabels the ticks
            # with linear values. The bin count is sqrt(sample count).
            r = [math.log10(x) for x in ds['rate_mbps']]
            n, bins, patches = ax.hist(r, int(math.sqrt(len(ds['rate_mbps']))),
                                       histtype='step', density=True, label=label, ls='-')

        ax.set_xlim(math.log10(0.1), math.log10(1000))
        ax.set_axisbelow(True)
        ax.legend(loc=2)
        ax.grid(color='#dddddd')
        ax.set_title(host)
        ax.xaxis.set_major_formatter(customFormatter)

fig.suptitle('Sidestream Download Rate PDFs over three week period (0w is trial)')
plt.show()
if bins is not None:
    print(len(bins))



730671 1303.9458692 854
94


# Historical (mlab-sandbox.batch) - Sidestream by Period & Slice

In [257]:
# Variations, for each period:
# * all sidestream connections from each period.
# * all sidestream connections from each period and slice
# * all sidestream connections from each period and slice and from same cohort.
# * some sidestream connections from each period and slice and from same cohort, grouped by ts & remote_ip.
# * some sidestream connections from each period and slice and from same cohort, grouped only by remote_ip.
#
# This query selects mlab1.dfw connections in five multi-day windows and keeps
# only remote IPs that had more than 10 slice-7 connections in both the
# 2017-08-12..16 and the 2017-11-26..30 windows (the "cohort").
# Fixes relative to the previous version:
# * 'neubot' is matched on slice id 9, as in the trial query; the second
#   `= 7` branch could never be reached after the 'samknows' branch.
# * the '.' in the hostname pattern is escaped so it only matches a dot.
# NOTE(review): the '11-29 to 12-03' label is attached to the window
# 2017-11-26 .. 2017-12-01. The label is kept because the plotting cells
# select rows by this exact string; confirm which dates are intended.
result = query.sync_query(
    """#standardSQL                                                                    
    -- Only works for mlab1 addresses. May not work on all machines.
CREATE TEMPORARY FUNCTION sliceFromIP(ipaddr STRING)
    AS ( MOD(CAST(REGEXP_EXTRACT(ipaddr, r'[:.]([0-9]+)$') AS INT64), 64) - 10 );

CREATE TEMPORARY FUNCTION betweenTimes(ts INT64, starttime STRING, endtime STRING)
    AS ( TIMESTAMP_SECONDS(ts) >= TIMESTAMP(starttime) AND TIMESTAMP_SECONDS(ts) <= TIMESTAMP(endtime) );

SELECT
    CASE 
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 1 THEN 'ndt'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7 THEN 'samknows'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 9 THEN 'neubot'
        ELSE 'other' 
    END AS slice,
    CASE                                                                          
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-07-26 00:00:00", "2017-07-30 00:00:00") THEN '07-26 to 29'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-08-12 00:00:00", "2017-08-16 00:00:00") THEN '08-12 to 16'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-10-30 00:00:00", "2017-11-02 00:00:00") THEN '10-30 to 11-02'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-11-26 00:00:00", "2017-12-01 00:00:00") THEN '11-29 to 12-03'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-21 00:00:00", "2018-02-25 00:00:00") THEN '02-21 to 25'
    ELSE 'bad'                                                                    
    END AS period,
    REGEXP_EXTRACT(test_id, r"\d\d\d\d/\d\d/\d\d/(mlab[1-4]\.[a-z]{3}[0-9]{2})") AS hostname,
    web100_log_entry.snap.StartTimeStamp AS ts,                                   
    8 * (web100_log_entry.snap.HCThruOctetsAcked /                                
      (web100_log_entry.snap.SndLimTimeRwin +                                     
       web100_log_entry.snap.SndLimTimeCwnd +                                      
       web100_log_entry.snap.SndLimTimeSnd)) as rate_mbps   
FROM
    `mlab-sandbox.batch.sidestream*`                                              
WHERE
        (  betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-07-26 00:00:00", "2017-07-30 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-08-12 00:00:00", "2017-08-16 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-11-26 00:00:00", "2017-12-01 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2018-02-21 00:00:00", "2018-02-25 00:00:00")
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, "2017-10-30 00:00:00", "2017-11-02 00:00:00") )
  AND (test_id LIKE '%mlab1.dfw%')            
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000 -- 819200                          
  AND (web100_log_entry.snap.SndLimTimeRwin +                                   
    web100_log_entry.snap.SndLimTimeCwnd +                                      
    web100_log_entry.snap.SndLimTimeSnd) >= 9000000                             
  AND (web100_log_entry.snap.SndLimTimeRwin +                                   
    web100_log_entry.snap.SndLimTimeCwnd +                                      
    web100_log_entry.snap.SndLimTimeSnd) < 600000000                            
  AND (web100_log_entry.snap.State = 1 OR                                       
    (web100_log_entry.snap.State >= 5 AND                                       
    web100_log_entry.snap.State <= 11))
  AND web100_log_entry.connection_spec.remote_ip IN(
    (SELECT
     remote_ip
    FROM (
      SELECT
         web100_log_entry.connection_spec.remote_ip as remote_ip,
         count(*) as c1
      FROM
         `mlab-sandbox.batch.sidestream*`
      WHERE
          (test_id LIKE '%mlab1.dfw0%')
        AND TIMESTAMP_SECONDS(web100_log_entry.log_time) >= TIMESTAMP("2017-08-12 00:00:00")
        AND TIMESTAMP_SECONDS(web100_log_entry.log_time) < TIMESTAMP("2017-08-16 00:00:00")
        AND sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7
      GROUP BY
        remote_ip
      HAVING c1 > 10
    ) INNER JOIN (
      SELECT
         web100_log_entry.connection_spec.remote_ip as remote_ip,
         count(*) as c2
      FROM
         `mlab-sandbox.batch.sidestream*`
      WHERE
          (test_id LIKE '%mlab1.dfw0%')
        AND TIMESTAMP_SECONDS(web100_log_entry.log_time) >= TIMESTAMP("2017-11-26 00:00:00")
        AND TIMESTAMP_SECONDS(web100_log_entry.log_time) < TIMESTAMP("2017-11-30 00:00:00")
        AND sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7
      GROUP BY
        remote_ip
      HAVING c2 > 10
    ) USING (remote_ip))
  )
    
GROUP BY
  hostname, slice, period, ts, rate_mbps
    """)
df_ss = pd.DataFrame(result)

In [13]:
# Daily count of sidestream flows per mlab1.dfw host, loaded into df_ss_count
# (columns: hostname, ts = unix seconds at the start of the day, count).
# The inner query collapses rows that share host, day, remote ip/port, local
# address family and local ip; the outer query counts what is left per day.
# The '.' in the hostname pattern is escaped so it only matches a literal dot.
result = query.sync_query(
    """#standardSQL                                                                    

SELECT
   hostname, ts, count(*) as count
FROM (
    SELECT
        REGEXP_EXTRACT(test_id, r"\d\d\d\d/\d\d/\d\d/(mlab[1-4]\.[a-z]{3}[0-9]{2})") AS hostname,
        UNIX_SECONDS(TIMESTAMP_TRUNC(log_time, DAY)) AS ts                            
    FROM
        `mlab-sandbox.batch.sidestream*`                                              
    WHERE
     (test_id LIKE '%mlab1.dfw%')            
      AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000 -- 819200                          
      AND (web100_log_entry.snap.SndLimTimeRwin +                                   
        web100_log_entry.snap.SndLimTimeCwnd +                                      
        web100_log_entry.snap.SndLimTimeSnd) >= 9000000                             
      AND (web100_log_entry.snap.SndLimTimeRwin +                                   
        web100_log_entry.snap.SndLimTimeCwnd +                                      
        web100_log_entry.snap.SndLimTimeSnd) < 600000000                            
      AND (web100_log_entry.snap.State = 1 OR                                       
        (web100_log_entry.snap.State >= 5 AND                                       
        web100_log_entry.snap.State <= 11))

    GROUP BY
      hostname, ts, web100_log_entry.connection_spec.remote_ip, web100_log_entry.connection_spec.remote_port, web100_log_entry.connection_spec.local_af, web100_log_entry.connection_spec.local_ip
)

GROUP BY
  hostname, ts
ORDER BY
  hostname, ts
    """)
df_ss_count = pd.DataFrame(result)

In [7]:
# A single row of hosts: mlab1.dfw02 through mlab1.dfw05.
hosts = [['mlab1.dfw%02d' % number for number in (2, 3, 4, 5)]]

# Each entry of periods_list is a (first, second) pair of datetimes; the pairs
# are the consecutive neighbours in this list of boundary dates.
_period_boundaries = [
    datetime.datetime(2017, 8, 16),
    datetime.datetime(2017, 8, 23),
    datetime.datetime(2017, 8, 28),
    datetime.datetime(2017, 11, 22),
    datetime.datetime(2018, 2, 21),
    datetime.datetime(2018, 3, 7),
]
periods_list = list(zip(_period_boundaries[:-1], _period_boundaries[1:]))

In [8]:
def start_and_end(d):
    """Return the bounds of the 4-day window that begins at `d`.

    Args:
        d: a datetime.datetime marking the start of the window.

    Returns:
        A (start, end) tuple of 'YYYY-MM-DD HH:MM:SS' strings, where end is
        exactly four days after start.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    window_end = d + datetime.timedelta(days=4)
    return d.strftime(timestamp_format), window_end.strftime(timestamp_format)

# Runs one BigQuery query per (period pair, host) and stores the result in
# df_hosts[i][host], where i is the index of the pair in periods_list.
#
# For a pair (a, b), start_and_end() turns each datetime into a 4-day window.
# The query keeps connections to `host` that started in either window and
# whose remote IP had more than 10 slice-7 connections in BOTH windows, then
# aggregates the rates per (hostname, slice, period, remote_ip). The `period`
# column holds the start string of the window the connection fell into.
#
# Fixes relative to the previous version:
# * 'neubot' is matched on slice id 9, as in the trial query; the second
#   `= 7` branch could never be reached after the 'samknows' branch.
# * med_rate_mbps uses APPROX_QUANTILES(x, 100)[OFFSET(50)], the true median
#   position; (x, 101)[ORDINAL(50)] sat slightly below the 50th percentile.
# * the '.' in the hostname pattern is escaped so it only matches a dot.
#
# The query text is assembled by string concatenation; `host` and the window
# strings come only from the constants defined in this notebook.
df_hosts = []
for i, periods in enumerate(periods_list):
    a_s, a_e = start_and_end(periods[0])
    b_s, b_e = start_and_end(periods[1])
    df_hosts.append({})
    for host in hosts[0]:
        result = query.sync_query("""
#standardSQL                                                                    
    -- Only works for mlab1 addresses. May not work on all machines.
CREATE TEMPORARY FUNCTION sliceFromIP(ipaddr STRING)
    AS ( MOD(CAST(REGEXP_EXTRACT(ipaddr, r'[:.]([0-9]+)$') AS INT64), 64) - 10 );

CREATE TEMPORARY FUNCTION betweenTimes(ts INT64, starttime STRING, endtime STRING)
    AS ( TIMESTAMP_SECONDS(ts) >= TIMESTAMP(starttime) AND TIMESTAMP_SECONDS(ts) <= TIMESTAMP(endtime) );

SELECT
   slice,
   period,
   hostname,
   AVG(rate_mbps) as rate_mbps,
   APPROX_QUANTILES(rate_mbps, 100)[OFFSET(50)] as med_rate_mbps,
   MAX(rate_mbps) as max_rate_mbps,
   SUM(rate_mbps) as sum_rate_mbps
    
FROM (

SELECT
    web100_log_entry.connection_spec.remote_ip as remote_ip,
    CASE 
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 1 THEN 'ndt'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7 THEN 'samknows'
        WHEN sliceFromIP(web100_log_entry.connection_spec.local_ip) = 9 THEN 'neubot'
        ELSE 'other' 
    END AS slice,
    CASE                                                                          
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+a_s+"""', '"""+a_e+"""')
            THEN '"""+a_s+"""'
        WHEN betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+b_s+"""', '"""+b_e+"""')
            THEN '"""+b_s+"""'
    ELSE 'bad'                                                                    
    END AS period,
    REGEXP_EXTRACT(test_id, r"\d\d\d\d/\d\d/\d\d/(mlab[1-4]\.[a-z]{3}[0-9]{2})") AS hostname,
    web100_log_entry.snap.StartTimeStamp AS ts,                                   
    8 * (web100_log_entry.snap.HCThruOctetsAcked /                                
      (web100_log_entry.snap.SndLimTimeRwin +                                     
       web100_log_entry.snap.SndLimTimeCwnd +                                      
       web100_log_entry.snap.SndLimTimeSnd)) as rate_mbps   
FROM
    `mlab-sandbox.batch.sidestream*`                                              
WHERE
        (  betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+a_s+"""', '"""+a_e+"""')
        OR betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+b_s+"""', '"""+b_e+"""')
        )
  AND (test_id LIKE '%"""+host+"""%')            
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000 -- 819200                          
  AND (web100_log_entry.snap.SndLimTimeRwin +                                   
    web100_log_entry.snap.SndLimTimeCwnd +                                      
    web100_log_entry.snap.SndLimTimeSnd) >= 9000000                             
  AND (web100_log_entry.snap.SndLimTimeRwin +                                   
    web100_log_entry.snap.SndLimTimeCwnd +                                      
    web100_log_entry.snap.SndLimTimeSnd) < 600000000                            
  AND (web100_log_entry.snap.State = 1 OR                                       
    (web100_log_entry.snap.State >= 5 AND                                       
    web100_log_entry.snap.State <= 11))
  AND web100_log_entry.connection_spec.remote_ip IN(
    (SELECT
     remote_ip
    FROM (
      SELECT
         web100_log_entry.connection_spec.remote_ip as remote_ip,
         count(*) as c1
      FROM
         `mlab-sandbox.batch.sidestream*`
      WHERE
          (test_id LIKE '%"""+host+"""%')
        AND betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+a_s+"""', '"""+a_e+"""')
        AND sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7
      GROUP BY
        remote_ip
      HAVING c1 > 10
    ) INNER JOIN (
      SELECT
         web100_log_entry.connection_spec.remote_ip as remote_ip,
         count(*) as c2
      FROM
         `mlab-sandbox.batch.sidestream*`
      WHERE
          (test_id LIKE '%"""+host+"""%')
        AND betweenTimes(web100_log_entry.snap.StartTimeStamp, '"""+b_s+"""', '"""+b_e+"""')
        AND sliceFromIP(web100_log_entry.connection_spec.local_ip) = 7
      GROUP BY
        remote_ip
      HAVING c2 > 10
    ) USING (remote_ip))
  )
    
GROUP BY
  hostname, slice, period, ts, web100_log_entry.connection_spec.remote_ip, rate_mbps
)

GROUP BY
  hostname, slice, period,   remote_ip -- CAST(ts/4 AS INT64),
""")
        df_hosts[i][host] = pd.DataFrame(result)
        # Single-argument print: same output on Python 2, valid on Python 3.
        print('saved %s %s %s' % (periods_list[i], host, len(df_hosts[i][host])))

saved (datetime.datetime(2017, 8, 16, 0, 0), datetime.datetime(2017, 8, 23, 0, 0)) mlab1.dfw02 586
saved (datetime.datetime(2017, 8, 16, 0, 0), datetime.datetime(2017, 8, 23, 0, 0)) mlab1.dfw03 204
saved (datetime.datetime(2017, 8, 16, 0, 0), datetime.datetime(2017, 8, 23, 0, 0)) mlab1.dfw04 257
saved (datetime.datetime(2017, 8, 16, 0, 0), datetime.datetime(2017, 8, 23, 0, 0)) mlab1.dfw05 685
saved (datetime.datetime(2017, 8, 23, 0, 0), datetime.datetime(2017, 8, 28, 0, 0)) mlab1.dfw02 819
saved (datetime.datetime(2017, 8, 23, 0, 0), datetime.datetime(2017, 8, 28, 0, 0)) mlab1.dfw03 189
saved (datetime.datetime(2017, 8, 23, 0, 0), datetime.datetime(2017, 8, 28, 0, 0)) mlab1.dfw04 227
saved (datetime.datetime(2017, 8, 23, 0, 0), datetime.datetime(2017, 8, 28, 0, 0)) mlab1.dfw05 754
saved (datetime.datetime(2017, 8, 28, 0, 0), datetime.datetime(2017, 11, 22, 0, 0)) mlab1.dfw02 529
saved (datetime.datetime(2017, 8, 28, 0, 0), datetime.datetime(2017, 11, 22, 0, 0)) mlab1.dfw03 101
saved (d

## Sidestream CDFs

In [13]:
title = 'CDF - per-slice sidestream Download Rate CDFs'
# Summary of df_ss: row count and maximum rate.
print('%s %s' % (len(df_ss), max(df_ss['rate_mbps'])))

hosts = [
    ['mlab1.dfw02', 'mlab1.dfw03', 'mlab1.dfw04'],
]

# One column per host; row 0 shows the 'ndt' slice, row 1 the 'samknows' slice.
fig, axes = plt.subplots(nrows=2, ncols=len(hosts[0]), figsize=(13, 10))

for host_row in hosts:
    for j, host in enumerate(host_row):
        if len(df_ss[df_ss['hostname'] == host]) == 0:
            print('skipping %s' % host)
            continue
        for period in ['08-12 to 16', '11-29 to 12-03', '02-21 to 25']:
            ds = df_ss[(df_ss['period'] == period) & (df_ss['hostname'] == host)]
            for k, slicename in enumerate(['ndt', 'samknows']):
                d = ds[ds['slice'] == slicename]
                if len(d) == 0:
                    continue
                ax = axes[k, j]
                # Cumulative, normalized histogram of the linear rates with
                # sqrt(sample count) bins. `density` replaces the deprecated
                # `normed` argument.
                n, bins, patches = ax.hist(d['rate_mbps'], int(math.sqrt(len(d['rate_mbps']))),
                                           histtype='step', density=True, cumulative=True,
                                           label=('cdf-' + period + '-' + slicename),
                                           ls='-')

                ax.set_xlim(1, 200)
                ax.set_axisbelow(True)
                ax.legend(loc=4, fontsize='x-small')

                ax.grid(color='#dddddd')
                ax.set_title(host)

fig.suptitle(title)

plt.show()

10079 507.5426899233347


## Sidestream PDFs

In [14]:
title = 'PDF - per-slice sidestream Download Rate PDFs'
# Summary of df_ss: row count and maximum rate.
print('%s %s' % (len(df_ss), max(df_ss['rate_mbps'])))

hosts = [
    ['mlab1.dfw02', 'mlab1.dfw03', 'mlab1.dfw04'],
]

# One column per host; row 0 shows the 'ndt' slice, row 1 the 'samknows' slice.
fig, axes = plt.subplots(nrows=2, ncols=len(hosts[0]), figsize=(13, 10))
for host_row in hosts:
    for j, host in enumerate(host_row):
        if len(df_ss[df_ss['hostname'] == host]) == 0:
            print('skipping %s' % host)
            continue
        for period in ['08-12 to 16', '02-21 to 25']:  # '11-29 to 12-03' is not plotted
            ds = df_ss[(df_ss['period'] == period) & (df_ss['hostname'] == host)]
            for k, slicename in enumerate(['ndt', 'samknows']):
                d = ds[ds['slice'] == slicename]
                if len(d) == 0:
                    continue
                ax = axes[k, j]

                label = 'pdf-%s-%s (%d)' % (period, slicename, len(d['rate_mbps']))
                # Histogram of log10(rate) with sqrt(sample count) bins;
                # customFormatter relabels the ticks with linear values.
                # `density` replaces the deprecated `normed` argument.
                r = [math.log10(x) for x in d['rate_mbps']]
                n, bins, patches = ax.hist(r, int(math.sqrt(len(d['rate_mbps']))),
                                           histtype='step', density=True, label=label,
                                           ls='-')

                ax.set_axisbelow(True)
                ax.legend(loc=2, fontsize='x-small')

                ax.grid(color='#dddddd')
                ax.set_title(host)
                ax.xaxis.set_major_formatter(customFormatter)

fig.suptitle(title)
plt.show()


10079 507.5426899233347


## PDF, CDF, & Switch - by Site and Slice

In [70]:
title = 'PDF, CDF, Switch - slice sidestream Download Rates'

# One figure per host, laid out as three rows:
#   row 0: PDF of log10(rate) for each period pair (one column per pair)
#   row 1: CDF of the same data
#   row 2: one full-width panel with the switch discard fraction and the
#          daily sidestream flow count, with the plotted periods shaded.
# Reads hosts, periods_list, df_hosts, df_disco and df_ss_count from earlier
# cells. NOTE(review): `hosts` is whatever list the most recently run cell
# assigned; every host in it must be a key of df_hosts[p].

slices = ['samknows']            # add 'ndt' to also plot the NDT slice
colors = plt.cm.tab10.colors
label2date = {}                  # PDF legend label -> period start (datetime)
p2c = {}                         # period start string -> line color
num_periods = len(periods_list)

for host_row in hosts:
    for host in host_row:
        fig = plt.figure(figsize=(16, 10))
        # axes[0]: PDF row, axes[1]: CDF row, axes[2]: bottom panel.
        axes = [[None] * num_periods, [None] * num_periods, None]

        for p, times in enumerate(periods_list):
            axes[0][p] = plt.subplot2grid((3, num_periods), (0, p))
            axes[1][p] = plt.subplot2grid((3, num_periods), (1, p))

            # Kept in its own name so the global df_ss used by the other
            # plotting cells is not overwritten.
            df_period = df_hosts[p][host]

            for slicename in slices:
                if len(df_period) == 0:
                    print('skipping %s no data' % host)
                    continue
                if len(df_period[df_period['hostname'] == host]) == 0:
                    print('skipping %s' % host)
                    continue

                for period in times:
                    period_str = period.strftime("%Y-%m-%d %H:%M:%S")
                    if period_str not in p2c:
                        # Next unused palette color; wraps around instead of
                        # running past the end of the palette.
                        p2c[period_str] = colors[len(p2c) % len(colors)]
                    ds = df_period[(df_period['period'] == period_str) &
                                   (df_period['hostname'] == host) &
                                   (df_period['slice'] == slicename)]

                    if len(ds) == 0:
                        continue

                    sample_count = len(ds['rate_mbps'])
                    log_rates = [math.log10(x) for x in ds['rate_mbps']]

                    # Top row: PDF with sqrt(sample count) bins.
                    ax = axes[0][p]
                    label = 'pdf-%s-%s (%d)' % (period_str, slicename, sample_count)
                    # Remembered so the bottom panel can shade this period.
                    label2date[label] = period
                    ax.hist(log_rates, int(math.sqrt(sample_count)),
                            histtype='step', density=True, label=label, ls='-',
                            color=p2c[period_str])

                    ax.set_axisbelow(True)
                    ax.legend(fontsize='x-small', loc='upper center', bbox_to_anchor=(0.5, 1.3))
                    ax.grid(color='#dddddd')
                    ax.set_title(host)
                    ax.xaxis.set_major_formatter(customFormatter)

                    # Middle row: CDF with one bin per sample.
                    ax = axes[1][p]
                    label = 'cdf-%s-%s (%d)' % (period_str, slicename, sample_count)
                    ax.hist(log_rates, sample_count,
                            histtype='step', density=True, cumulative=True, label=label, ls='-',
                            color=p2c[period_str])
                    ax.xaxis.set_major_formatter(customFormatter)

                    ax.set_axisbelow(True)
                    ax.grid(color='#dddddd')
                    ax.set_title(host)
                    if p != 0:
                        ax.set_yticklabels([])

        # Bottom row: switch discards over time, spanning all columns.
        axes[2] = plt.subplot2grid((3, num_periods), (2, 0), colspan=num_periods)
        ax = axes[2]

        ds = df_disco[df_disco['hostname'] == host]
        ax.plot_date(dates.epoch2num(ds['ts']), ds['pct_discards'], ls='-', ms=0,
                     label='switch', color='mediumpurple')

        ax.set_title(host)
        ax.set_ylim(-0.01, 1)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')

        # Color switch regions with the PDF periods based on legend colors.
        # Each shaded span covers the 4 days starting at the period start.
        for pdf_ax in axes[0]:
            handles, labels = pdf_ax.get_legend_handles_labels()
            for handle, handle_label in zip(handles, labels):
                s = label2date[handle_label]
                e = s + datetime.timedelta(days=4)
                ax.axvspan(dates.date2num(s), dates.date2num(e), alpha=0.5,
                           color=handle.get_edgecolor())

        ax.set_ylabel('% discard timebins')

        # Second y-axis on the same panel: daily sidestream flow count.
        ax2 = ax.twinx()

        ds = df_ss_count[df_ss_count['hostname'] == host]
        ax2.plot_date(dates.epoch2num(ds['ts']), ds['count'], ls='-', ms=0, label='sidestream')

        ax2.set_ylabel('Sidestream Flow Count')
        ax2.grid(color='#dddddd')
        ax.legend(loc=4, fontsize='x-small')
        ax2.legend(loc=1, fontsize='x-small')

        axes[0][0].set_ylabel('PDF')
        axes[1][0].set_ylabel('CDF')

        fig.suptitle(title)
        plt.show()

#plt.hist2d(
#    df_ss[ (df_ss['period'] == '08-12 to 16') & (df_ss['hostname'] == 'mlab1.dfw02') & (df_ss['slice'] == 'samknows') ],
#    df_ss[ (df_ss['period'] == '02-21 to 25') & (df_ss['hostname'] == 'mlab1.dfw02') & (df_ss['slice'] == 'samknows') ],
#    bins=40,
#)
#print n, len(bins)

last
last
last
last


In [None]:
# Import the module, not the class: `from datetime import datetime` would
# rebind the notebook-wide name `datetime` to the class and break the earlier
# cells that use datetime.timedelta / datetime.datetime(...) when re-run.
import datetime

def dt(date):
    """Parse a 'YYYY-MM-DD' string into a datetime.datetime at midnight.

    Args:
        date: date string in '%Y-%m-%d' format.

    Returns:
        The corresponding datetime.datetime.

    Raises:
        ValueError: if `date` does not match the '%Y-%m-%d' format.
    """
    return datetime.datetime.strptime(date, '%Y-%m-%d')

In [58]:
import matplotlib.pyplot as plt

from matplotlib import cm
from numpy import linspace

# Demo: draw `number_of_lines` horizontal lines at y = 0, 1, 2, ... and color
# each with an evenly spaced sample of the `jet` colormap over [start, stop].
start, stop = 0.0, 1.0
number_of_lines = 6
cm_subsection = linspace(start, stop, number_of_lines)

colors = list(map(cm.jet, cm_subsection))

for line_number in range(len(colors)):
    plt.axhline(line_number, color=colors[line_number])

plt.ylabel('Line Number')
plt.show()

In [63]:

# Show the RGB tuples of matplotlib's `Paired` qualitative palette.
print(plt.cm.Paired.colors)


((0.6509803921568628, 0.807843137254902, 0.8901960784313725), (0.12156862745098039, 0.47058823529411764, 0.7058823529411765), (0.6980392156862745, 0.8745098039215686, 0.5411764705882353), (0.2, 0.6274509803921569, 0.17254901960784313), (0.984313725490196, 0.6039215686274509, 0.6), (0.8901960784313725, 0.10196078431372549, 0.10980392156862745), (0.9921568627450981, 0.7490196078431373, 0.43529411764705883), (1.0, 0.4980392156862745, 0.0), (0.792156862745098, 0.6980392156862745, 0.8392156862745098), (0.41568627450980394, 0.23921568627450981, 0.6039215686274509), (1.0, 1.0, 0.6), (0.6941176470588235, 0.34901960784313724, 0.1568627450980392))
