In [13]:
# Enables figures loading outside of browser.
# If not run, figures will load inline.
%matplotlib

import os
import math
import pandas as pd
import numpy as np
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import matplotlib.ticker
import datetime
import collections

from scipy import stats

# Depends on: pip install sklearn
from sklearn.model_selection import train_test_split

# Some matplotlib features are version dependent.
# Compare numeric (major, minor, patch) components instead of raw strings:
# string comparison is lexicographic, so e.g. '10.0.0' >= '2.1.2' is False.
# Non-numeric components (such as the '0rc1' in '3.8.0rc1') are ignored.
_mpl_version = tuple(
    int(part) for part in matplotlib.__version__.split('.')[:3] if part.isdigit())
assert _mpl_version >= (2, 1, 2), (
    'matplotlib >= 2.1.2 is required, found %s' % matplotlib.__version__)

# Depends on: pip install --upgrade google-cloud-bigquery
from google.cloud import bigquery

def run_query(query, project='mlab-sandbox'):
    """Runs a BigQuery query and returns every result row in a DataFrame.

    Args:
      query: str, the SQL text passed to `bigquery.Client.query`.
      project: str, the project name passed to `bigquery.Client`.

    Returns:
      pandas.DataFrame with one column per result field and one row per
      result row. A query that matches no rows returns an empty DataFrame
      that still carries the result columns, so callers can index by column
      name without a KeyError.

    Raises:
      Whatever the BigQuery client raises for a failed job or when the
      result is not available within the 300 second timeout.
    """
    client = bigquery.Client(project=project)
    job = client.query(query)
    rows = job.result(timeout=300)

    results = collections.defaultdict(list)
    for row in rows:
        for key in row.keys():
            results[key].append(row.get(key))

    # Column names come from the result schema so they exist even when no rows
    # were returned. Fall back to the collected keys if the schema is empty.
    columns = [field.name for field in rows.schema] or None
    return pd.DataFrame(dict(results), columns=columns)

def unlog(x, pos):
    """Tick-label callback: renders 10**x for axes that plot log10 values.

    Args:
      x: float, tick position in log10 units.
      pos: tick index supplied by the formatter; unused.

    Returns:
      str, 10**x with one decimal place when it has a fractional part,
      otherwise as a plain integer.
    """
    value = math.pow(10, x)
    fractional, integral = math.modf(value)
    return '%.1f' % value if fractional > 0 else '%d' % integral

# Tick formatter that labels log10-valued axis positions via unlog().
logFormatter = matplotlib.ticker.FuncFormatter(unlog)

Using matplotlib backend: MacOSX


# UPLINK UTILIZATION OVER TIME

In [14]:
# Daily 50th and 90th percentile of 'switch.octets.uplink.tx' sample values
# for each mlab1 host. The inner GROUP BY collapses exact duplicate samples.
#
# APPROX_QUANTILES(value, 100) returns 101 boundaries (minimum, 1st..99th
# percentile, maximum), so OFFSET(50) and OFFSET(90) are the 50th and 90th
# percentiles. The previous (value, 101)[ORDINAL(50|90)] form selected roughly
# the 48.5th and 88th percentiles instead.
df_disco_pct = run_query("""
#standardSQL
SELECT

  name AS hostname,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,
  APPROX_QUANTILES(value, 100)[OFFSET(50)] as bytes_50th,
  APPROX_QUANTILES(value, 100)[OFFSET(90)] as bytes_90th

FROM (
  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab1.[a-z]{3}[0-9]{2}).*') AS name,
    sample.timestamp AS sts,
    sample.value AS value
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
    metric LIKE 'switch.octets.uplink.tx'
  GROUP BY
    hostname, metric, sts, value
)
WHERE
  name IS NOT NULL
GROUP BY
  hostname, day, ts
ORDER BY
  hostname, day, ts
""")

In [15]:
# Sites to plot; each inner list is one row of subplots.
sites = [
    ['dfw'],
    ['lga'],
    ['nuq'],
]

# Axes grid with the same shape as `sites`; filled in by the loop below.
axes = [[None] * len(site_row) for site_row in sites]

fig = plt.figure(figsize=(6, 8))
grid_shape = (len(sites), max(len(site_row) for site_row in sites))

# Sorted so that series order (and so colors and legend order) is the same on
# every run; iterating a bare set() has no guaranteed order.
hostnames = sorted(set(df_disco_pct['hostname']))

for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i][j] = plt.subplot2grid(grid_shape, (i, j))
        ax.set_ylabel('Median Uplink ' + site.upper())

        # Only the bottom row keeps its x tick labels.
        if i != len(sites)-1:
            ax.set_xticklabels([])

        for h in hostnames:
            if ('mlab1.' + site) in h:
                # Keep only days whose median is above 1e5.
                ds = df_disco_pct[ (df_disco_pct['hostname'] == h) & (df_disco_pct['bytes_50th'] > 1e5) ]
                d = [pd.to_datetime(t, unit='s') for t in ds['ts']]
                ax.scatter(d, ds['bytes_50th'], s=1, label=h[6:11])

        ax.set_ylim(1e4, 1e9)
        ax.set_xlim(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01"))
        ax.tick_params(axis='x', labelrotation=-90)
        ax.grid(color='#dddddd')
        ax.legend(loc=3, ncol=7, fontsize='x-small', columnspacing=1)
        ax.semilogy()

fig.suptitle('Daily Median Uplink Utilization')
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

# Packets Over Time

In [16]:
# Daily sum of 'switch.unicast.uplink.tx' sample values per host, read from
# the `measurement-lab.base_tables.switch*` tables. The CTE's GROUP BY over
# (hostname, metric, sts, value) collapses exact duplicate samples before the
# outer query sums them per host and day.
df_disco_packets = run_query("""
#standardSQL

WITH measurementlab_switch_dedup AS (

  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS hostname,
    sample.timestamp AS sts,
    sample.value AS value
    
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
    
  WHERE
    metric LIKE 'switch.unicast.uplink.tx'
    
  GROUP BY
    hostname, metric, sts, value
)

SELECT
  hostname,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,
  SUM(value) AS total
  
FROM
  measurementlab_switch_dedup
  
WHERE
  hostname IS NOT NULL
  
GROUP BY
  hostname, day, ts
  
ORDER BY
  hostname, day, ts
""")

In [17]:
# Sites to plot; each inner list is one row of subplots.
sites = [
    ['dfw'],
    ['lga'],
    ['nuq'],
]

# Axes grid with the same shape as `sites`; filled in by the loop below.
axes = [[None] * len(site_row) for site_row in sites]

fig = plt.figure(figsize=(6, 8))
grid_shape = (len(sites), max(len(site_row) for site_row in sites))

# Sorted so that series order (and so colors and legend order) is the same on
# every run; iterating a bare set() has no guaranteed order.
hostnames = sorted(set(df_disco_packets['hostname']))

for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i][j] = plt.subplot2grid(grid_shape, (i, j))
        ax.set_ylabel('Total Packets ' + site.upper())

        # Only the bottom row keeps its x tick labels.
        if i != len(sites)-1:
            ax.set_xticklabels([])

        for h in hostnames:
            if ('mlab1.' + site) in h:
                # Keep only days with more than 100 packets in total.
                ds = df_disco_packets[ (df_disco_packets['hostname'] == h) & (df_disco_packets['total'] > 100) ]
                d = [pd.to_datetime(t, unit='s') for t in ds['ts']]
                ax.scatter(d, ds['total'], s=1, label=h[6:11])

        ax.set_ylim(1e7, 1e10)
        ax.set_xlim(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01"))
        ax.tick_params(axis='x', labelrotation=-90)
        ax.grid(color='#dddddd')
        ax.legend(loc=3, ncol=7, fontsize='x-small')
        ax.semilogy()

fig.suptitle('Daily Packets')
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

# DISCARDS OVER TIME

In [18]:
# Daily sum of 'switch.discards.uplink.tx' sample values per host, read from
# the `measurement-lab.base_tables.switch*` tables. The CTE's GROUP BY over
# (hostname, metric, sts, value) collapses exact duplicate samples before the
# outer query sums them per host and day.
df_disco = run_query("""
#standardSQL

WITH measurementlab_switch_dedup AS (

  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS hostname,
    sample.timestamp AS sts,
    sample.value AS value
    
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
    
  WHERE
    metric LIKE 'switch.discards.uplink.tx'
    
  GROUP BY
    hostname, metric, sts, value
)

SELECT
  hostname,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,
  SUM(value) AS total_discards
  
FROM
  measurementlab_switch_dedup
  
WHERE
  hostname IS NOT NULL
  
GROUP BY
  hostname, day, ts
  
ORDER BY
  hostname, day, ts
""")

In [19]:
# Sites to plot; each inner list is one row of subplots.
sites = [
    ['dfw'],
    ['lga'],
    ['nuq'],
]

# Axes grid with the same shape as `sites`; filled in by the loop below.
axes = [[None] * len(site_row) for site_row in sites]

fig = plt.figure(figsize=(6, 8))
grid_shape = (len(sites), max(len(site_row) for site_row in sites))

# Sorted so that series order (and so colors and legend order) is the same on
# every run; iterating a bare set() has no guaranteed order.
hostnames = sorted(set(df_disco['hostname']))

for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i][j] = plt.subplot2grid(grid_shape, (i, j))
        ax.set_ylabel('Total Discards ' + site.upper())

        # Only the bottom row keeps its x tick labels.
        if i != len(sites)-1:
            ax.set_xticklabels([])

        for h in hostnames:
            if ('mlab1.' + site) in h:
                # Keep only days with more than 100 discards in total.
                ds = df_disco[ (df_disco['hostname'] == h) & (df_disco['total_discards'] > 100) ]
                d = [pd.to_datetime(t, unit='s') for t in ds['ts']]
                ax.scatter(d, ds['total_discards'], s=1, label=h[6:11])

        ax.set_ylim(100, 1000000)
        ax.set_xlim(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01"))
        ax.tick_params(axis='x', labelrotation=-90)
        ax.grid(color='#dddddd')
        ax.legend(loc=2, fontsize='x-small')
        ax.semilogy()

fig.suptitle('Daily Packet Discards')
fig.tight_layout(rect=[0, 0.03, 1, 0.95])


# Daily DISCO discard ratios

In [20]:
# Daily ratio of 'switch.discards.uplink.tx' to 'switch.unicast.uplink.tx'
# totals per host, restricted to hostnames containing lga, dfw or nuq.
# Days where discards >= total are dropped by the inner HAVING clause, and
# days with a ratio of 0.01 or more are dropped by the outer HAVING clause.
df_disco_ratio = run_query("""
WITH measurementlab_switch_dedup AS (
  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS hostname,
    sample.timestamp AS sts,
    sample.value AS value
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
    (metric LIKE 'switch.discards.uplink.tx' OR metric LIKE 'switch.unicast.uplink.tx')
    AND (hostname LIKE '%lga%' OR hostname LIKE '%dfw%' OR hostname LIKE '%nuq%')
  GROUP BY
    hostname, metric, sts, value
)

SELECT
  hostname,
  day,
  ts,
  IF(total > 0, discards / total, 0) as ratio
FROM (
SELECT
  hostname,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,
  SUM(IF(metric = "switch.discards.uplink.tx", value, 0)) AS discards,
  SUM(IF(metric = "switch.unicast.uplink.tx", value, 0)) AS total
FROM
  measurementlab_switch_dedup
WHERE
  hostname IS NOT NULL
GROUP BY
  hostname, day, ts
HAVING
  discards < total
ORDER BY
  hostname, day, ts
)
GROUP BY
  hostname, day, ts, ratio
HAVING
  ratio < 0.01
ORDER BY
  hostname, day, ts
""")

In [21]:
# Sites to plot; each inner list is one row of subplots.
sites = [
    ['dfw'],
    ['lga'],
    ['nuq'],
]

# Axes grid with the same shape as `sites`; filled in by the loop below.
axes = [[None] * len(site_row) for site_row in sites]

fig = plt.figure(figsize=(6, 8))
grid_shape = (len(sites), max(len(site_row) for site_row in sites))

# Sorted so that series order (and so colors and legend order) is the same on
# every run; iterating a bare set() has no guaranteed order.
hostnames = sorted(set(df_disco_ratio['hostname']))

for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i][j] = plt.subplot2grid(grid_shape, (i, j))
        ax.set_ylabel('Discard Ratio ' + site.upper())

        # Only the bottom row keeps its x tick labels.
        if i != len(sites)-1:
            ax.set_xticklabels([])

        for h in hostnames:
            if ('mlab1.' + site) in h:
                ds = df_disco_ratio[ (df_disco_ratio['hostname'] == h) ]
                d = [pd.to_datetime(t, unit='s') for t in ds['ts']]
                ax.scatter(d, ds['ratio'], s=1, label=h[6:11])

        ax.set_ylim(1e-6, 1e-2)
        ax.set_xlim(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01"))
        ax.tick_params(axis='x', labelrotation=-90)
        ax.grid(color='#dddddd')
        ax.legend(loc=2, fontsize='x-small')
        ax.semilogy()

fig.suptitle('Daily Packet Loss Ratios')
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

# NDT Median Download Rates

In [22]:
# Daily per-host download-rate quantile for NDT tests against mlab1 servers at
# dfw, lga and nuq. The CTE keeps download tests (data_direction = 1) that
# acked at least 1,000,000 octets over a send-limited time of at least
# 9,000,000 and less than 600,000,000, and deduplicates them via GROUP BY.
#
# The query is a raw string so the regex's `\d` reaches BigQuery unchanged
# without relying on Python passing an invalid escape sequence through.
#
# NOTE(review): the outer query selects ORDINAL(98) of a 101-bucket
# APPROX_QUANTILES, i.e. a high percentile rather than the median that the
# variable name suggests -- confirm which is intended.
df_ndt_median = run_query(r"""
WITH mlab_ndt AS (
  SELECT
    connection_spec.server_hostname as server_hostname,
    log_time,
    (8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd))) AS download_mbps

  FROM

    `measurement-lab.base_tables.ndt*`
  WHERE

        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw|lga|nuq)\d\d")
    AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
    AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
    AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
    AND connection_spec.data_direction = 1

  GROUP BY
    server_hostname,
    log_time,
    web100_log_entry.connection_spec.remote_ip,
    web100_log_entry.connection_spec.local_ip,
    web100_log_entry.connection_spec.remote_port,
    web100_log_entry.connection_spec.local_port,
    download_mbps
)

SELECT
  TIMESTAMP_TRUNC(log_time, DAY) as day,
  REGEXP_EXTRACT(server_hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS hostname,
  APPROX_QUANTILES(download_mbps, 101)[ORDINAL(98)] as download_mbps

FROM
  mlab_ndt

GROUP BY
  day,
  hostname

ORDER BY
  day
""")

In [23]:
# Sites to plot; each inner list is one row of subplots.
sites = [
    ['dfw'],
    ['lga'],
    ['nuq'],
]

# Axes grid with the same shape as `sites`; filled in by the loop below.
axes = [[None] * len(site_row) for site_row in sites]

# Parenthesized single-argument print behaves the same as a Python 2 statement
# and as a Python 3 function call.
print(len(df_ndt_median))

fig = plt.figure(figsize=(6, 8))
grid_shape = (len(sites), max(len(site_row) for site_row in sites))

# Sorted so that series order (and so colors and legend order) is the same on
# every run; iterating a bare set() has no guaranteed order.
hostnames = sorted(set(df_ndt_median['hostname']))

for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i][j] = plt.subplot2grid(grid_shape, (i, j))
        ax.set_ylabel('Median Downloads ' + site.upper())

        # Only the bottom row keeps its x tick labels.
        if i != len(sites)-1:
            ax.set_xticklabels([])

        for h in hostnames:
            if ('mlab1.' + site) in h:
                ds = df_ndt_median[ (df_ndt_median['hostname'] == h) ]
                d = [pd.to_datetime(t) for t in ds['day']]
                ax.scatter(d, ds['download_mbps'], s=1, label=h[6:11])

        ax.set_ylim(10, 1000)
        ax.set_xlim(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01"))
        ax.tick_params(axis='x', labelrotation=-90)
        ax.grid(color='#dddddd')
        ax.legend(loc=2, fontsize='x-small')
        ax.semilogy()

fig.suptitle('Median NDT Downloads')
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

8015


# NDT Segs Retrans

In [24]:
# NOT ENOUGH HISTORICAL NDT DATA TO GET FULL TIMELINE.
#
# Daily per-host median of SegsRetrans / SegsOut for NDT download tests
# against mlab1 servers at dfw, lga and nuq since 2016-06-01.
#
# APPROX_QUANTILES(x, 100) returns 101 boundaries, so OFFSET(50) is the median.
# The previous (x, 101)[ORDINAL(50)] form selected roughly the 48.5th
# percentile. The query is a raw string so the regex's `\d` is not treated as
# an (invalid) Python escape sequence.
df_ndt_retrans = run_query(r"""
#standardSQL

SELECT
  TIMESTAMP_TRUNC(log_time, DAY) AS day,
  REGEXP_EXTRACT(connection_spec.server_hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS hostname,
  APPROX_QUANTILES(web100_log_entry.snap.SegsRetrans / web100_log_entry.snap.SegsOut, 100)[OFFSET(50)] AS median_ratio
FROM
  `measurement-lab.base_tables.ndt*`
WHERE
  web100_log_entry.snap.HCThruOctetsAcked >= 1000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw|lga|nuq)\d\d")
  AND connection_spec.data_direction = 1
  AND log_time >= TIMESTAMP("2016-06-01")
GROUP BY
  day,
  hostname
ORDER BY
  day
""")

In [25]:
# Sites to plot; each inner list is one row of subplots.
sites = [
    ['dfw'],
    ['lga'],
    ['nuq'],
]

# Axes grid with the same shape as `sites`; filled in by the loop below.
axes = [[None] * len(site_row) for site_row in sites]

# Parenthesized single-argument print behaves the same as a Python 2 statement
# and as a Python 3 function call.
print(len(df_ndt_retrans))

fig = plt.figure(figsize=(6, 8))
grid_shape = (len(sites), max(len(site_row) for site_row in sites))

# Sorted so that series order (and so colors and legend order) is the same on
# every run; iterating a bare set() has no guaranteed order.
hostnames = sorted(set(df_ndt_retrans['hostname']))

for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i][j] = plt.subplot2grid(grid_shape, (i, j))
        ax.set_ylabel('SegsRetran Ratio ' + site.upper())

        # Only the bottom row keeps its x tick labels.
        if i != len(sites)-1:
            ax.set_xticklabels([])

        for h in hostnames:
            if ('mlab1.' + site) in h:
                ds = df_ndt_retrans[ (df_ndt_retrans['hostname'] == h) ]
                d = [pd.to_datetime(t) for t in ds['day']]
                ax.scatter(d, ds['median_ratio'], s=1, label=h[6:11])

        ax.set_ylim(1e-6, 1e-2)
        ax.set_xlim(pd.to_datetime("2016-05-31"), pd.to_datetime("2018-08-01"))
        ax.tick_params(axis='x', labelrotation=-90)
        ax.grid(color='#dddddd')
        ax.legend(loc=2, fontsize='x-small')
        ax.semilogy()

fig.suptitle('Median NDT SegsRetran/SegsOut')
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

8015


# Two-Week Performance Distributions -- Before & After

In [28]:
# Per-sample 'switch.octets.uplink.tx' rate (8 * value / 10, aliased `bps`)
# for mlab1.lga03 and mlab1.dfw02, labelled by period: 'before-2w' for days
# 2018-02-11 through 2018-02-25 and 'after-2w' for days 2018-03-04 through
# 2018-03-18 (both BETWEEN ranges are inclusive).
# NOTE(review): the `/ 10` presumably reflects a 10-second sample interval --
# confirm against the switch data collector.
df_disco_dist = run_query("""
SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
    CASE
        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25") THEN 'before-2w'
        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18") THEN 'after-2w'

--        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-02-11") and TIMESTAMP("2018-02-18")          THEN 'before-2w'
--        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-02-18") and TIMESTAMP("2018-02-25")          THEN 'before-1w'
--        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-03-04") and TIMESTAMP("2018-03-11")          THEN 'after-1w'
--        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-03-11") and TIMESTAMP("2018-03-18")          THEN 'after-2w'
        ELSE 'what'
    END as period,
    TIMESTAMP_TRUNC(sample.timestamp, DAY) AS day,
    UNIX_SECONDS(sample.timestamp) AS sts,
    8 * sample.value / 10 AS bps
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
       metric LIKE 'switch.octets.uplink.tx'
    AND (
        hostname LIKE '%mlab1.lga03%' OR hostname LIKE '%mlab1.dfw02%'
    )
    AND ( -- One week - Sunday to Saturday.
          TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-02-11") and TIMESTAMP("2018-02-25")
       OR TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-03-04") and TIMESTAMP("2018-03-18")
    )
  GROUP BY
    hostname, metric, sample.timestamp, bps
  ORDER BY
    hostname, sts
""")

In [29]:
# Parenthesized single-argument print behaves the same as a Python 2 statement
# and as a Python 3 function call.
print(df_disco_dist.keys())
print(len(df_disco_dist))

# One figure per host, overlaying one log-scale rate histogram per period.
# Hosts are sorted so figures are produced in the same order on every run.
for h in sorted(set(df_disco_dist['name'])):
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
    axes.set_title(h[6:11])

    for day in sorted(set(df_disco_dist['period'])):
        ds = df_disco_dist[ (df_disco_dist['name'] == h) & (df_disco_dist['period'] == day) ]
        # log10 is undefined for non-positive rates, so those samples are skipped.
        r = [math.log10(x) for x in ds['bps'] if x > 0]
        if not r:
            # Nothing to plot for this host/period; a zero-bin histogram would fail.
            continue

        axes.hist(r,
                  int(1.5 * math.sqrt(len(r))),
                  histtype='step',
                  density=1,
                  label='cdf-'+h[6:11] + '-' + str(day), ls='-')

    axes.grid(color='#dddddd')
    axes.legend(loc=2, fontsize='x-small')
    axes.xaxis.set_major_formatter(logFormatter)
    fig.subplots_adjust(hspace=0.3, wspace=0.4)
    fig.suptitle('Uplink Utilization Distribution')

Index([u'bps', u'day', u'metric', u'name', u'period', u'sts'], dtype='object')
517494


# One Day Performance Distributions -- before & after

Changes between the before and after periods are harder to notice at one-day granularity.

In [31]:
# Per-sample 'switch.octets.uplink.tx' rate (8 * value / 10, aliased `bps`)
# for mlab1.lga03 and mlab1.dfw02 on four individual days: 2018-02-18,
# 2018-02-25, 2018-03-04 and 2018-03-11.
# This rebinds `df_disco_dist`, replacing the result of the earlier
# period-based query.
# NOTE(review): the `/ 10` presumably reflects a 10-second sample interval --
# confirm against the switch data collector.
df_disco_dist = run_query("""
SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
    TIMESTAMP_TRUNC(sample.timestamp, DAY) AS day,
    UNIX_SECONDS(sample.timestamp) AS sts,
    8 * sample.value / 10 AS bps
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
       metric LIKE 'switch.octets.uplink.tx'
    AND (
        hostname LIKE '%mlab1.lga03%' OR hostname LIKE '%mlab1.dfw02%'
    )
    AND (
        TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-02-18")
     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-02-25")
     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-04")
     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-11")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-04")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-05")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-10")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-01")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-01")
    )
  GROUP BY
    hostname, metric, sample.timestamp, bps
  ORDER BY
    hostname, sts
""")

In [32]:
# Parenthesized single-argument print behaves the same as a Python 2 statement
# and as a Python 3 function call.
print(df_disco_dist.keys())
print(len(df_disco_dist))

# One figure per host, overlaying one log-scale rate histogram per day.
# Hosts are sorted so figures are produced in the same order on every run.
for h in sorted(set(df_disco_dist['name'])):
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
    axes.set_title(h[6:11])

    for day in sorted(set(df_disco_dist['day'])):
        ds = df_disco_dist[ (df_disco_dist['name'] == h) & (df_disco_dist['day'] == day) ]
        # log10 is undefined for non-positive rates, so those samples are skipped.
        r = [math.log10(x) for x in ds['bps'] if x > 0]
        if not r:
            # Nothing to plot for this host/day; a zero-bin histogram would fail.
            continue

        axes.hist(r,
                  int(1.5 * math.sqrt(len(r))),
                  histtype='step',
                  density=1,
                  label='cdf-'+h[6:11] + '-' + str(day), ls='-')

    axes.grid(color='#dddddd')
    axes.legend(loc=2, fontsize='x-small')
    axes.xaxis.set_major_formatter(logFormatter)
    fig.subplots_adjust(hspace=0.3, wspace=0.4)
    fig.suptitle('Uplink Utilization Distribution')

Index([u'bps', u'day', u'metric', u'name', u'sts'], dtype='object')
69120


# NDT test distributions - Before & After

In [34]:
# All qualifying NDT tests against mlab1.dfw02 / mlab1.lga03, labelled with one
# of four one-week periods around the flow-control configuration change.
#
# BETWEEN is inclusive on both ends, so each week is written as Sunday through
# the following Saturday. The previous ranges ended on the next Sunday, which
# made them 8 days long and let adjacent weeks share a boundary day (the CASE
# assigned that day to the earlier week).
df_ndt_dist = run_query("""
-- ALL NDT tests before and after the flow-control configuration change.

SELECT
    REGEXP_EXTRACT(connection_spec.server_hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
    CASE
     WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") and TIMESTAMP("2018-02-17")
       THEN 'before-2w'
     WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-18") and TIMESTAMP("2018-02-24")
       THEN 'before-1w'
     WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") and TIMESTAMP("2018-03-10")
       THEN 'after-1w'
     WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") and TIMESTAMP("2018-03-17")
       THEN 'after-2w'
     ELSE 'what'
    END as period,

    web100_log_entry.snap.StartTimeStamp AS ts,
    8 * (web100_log_entry.snap.HCThruOctetsAcked /
      (web100_log_entry.snap.SndLimTimeRwin +
       web100_log_entry.snap.SndLimTimeCwnd +
       web100_log_entry.snap.SndLimTimeSnd)) as download_mbps
FROM
   `measurement-lab.base_tables.ndt*`

WHERE
    ( -- One week - Sunday to Saturday.
        TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") and TIMESTAMP("2018-02-17")
     OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-18") and TIMESTAMP("2018-02-24")
     OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") and TIMESTAMP("2018-03-10")
     OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") and TIMESTAMP("2018-03-17"))
  AND REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw02|lga03)")
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
  AND (web100_log_entry.snap.SndLimTimeRwin +
    web100_log_entry.snap.SndLimTimeCwnd +
    web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin +
    web100_log_entry.snap.SndLimTimeCwnd +
    web100_log_entry.snap.SndLimTimeSnd) < 600000000

GROUP BY
  name, period, ts, download_mbps
""")

In [35]:
# Weekly cohort: qualifying NDT tests against mlab1.dfw02 / mlab1.lga03 from
# clients (remote_ip) that ran at least one download test in each of the four
# one-week periods around the flow-control configuration change.
# This rebinds `df_ndt_dist`.
#
# BETWEEN is inclusive on both ends, so each week is written as Sunday through
# the following Saturday. The previous ranges ended on the next Sunday, which
# made them 8 days long and let adjacent weeks share a boundary day.
df_ndt_dist = run_query("""
  -- Weekly cohort - NDT tests before and after the flow-control configuration change.
SELECT
  REGEXP_EXTRACT(connection_spec.server_hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
  CASE
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-17") THEN 'before-2w'
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-18") AND TIMESTAMP("2018-02-24") THEN 'before-1w'
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-10") THEN 'after-1w'
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") AND TIMESTAMP("2018-03-17") THEN 'after-2w'
    ELSE 'what'
  END AS period,
  8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd)) AS download_mbps
FROM
  `measurement-lab.base_tables.ndt*`
WHERE
  ( -- One week - Sunday to Saturday.
    TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-17")
    OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-18") AND TIMESTAMP("2018-02-24")
    OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-10")
    OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") AND TIMESTAMP("2018-03-17"))
  AND REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw02|lga03)")
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND web100_log_entry.connection_spec.remote_ip IN( (
    SELECT
      remote_ip
    FROM (
      SELECT
        web100_log_entry.connection_spec.remote_ip AS remote_ip
      FROM
        `measurement-lab.base_tables.ndt*`
      WHERE
        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw02|lga03)")
        AND TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11")
        AND TIMESTAMP("2018-02-17")
        AND connection_spec.data_direction = 1
      GROUP BY
        remote_ip )
    INNER JOIN (
      SELECT
        web100_log_entry.connection_spec.remote_ip AS remote_ip
      FROM
        `measurement-lab.base_tables.ndt*`
      WHERE
        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw02|lga03)")
        AND TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-18")
        AND TIMESTAMP("2018-02-24")
        AND connection_spec.data_direction = 1
      GROUP BY
        remote_ip )
    USING
      (remote_ip)
    INNER JOIN (
      SELECT
        web100_log_entry.connection_spec.remote_ip AS remote_ip
      FROM
        `measurement-lab.base_tables.ndt*`
      WHERE
        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw02|lga03)")
        AND TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04")
        AND TIMESTAMP("2018-03-10")
        AND connection_spec.data_direction = 1
      GROUP BY
        remote_ip )
    USING
      (remote_ip)
    INNER JOIN (
      SELECT
        web100_log_entry.connection_spec.remote_ip AS remote_ip
      FROM
        `measurement-lab.base_tables.ndt*`
      WHERE
        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw02|lga03)")
        AND TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11")
        AND TIMESTAMP("2018-03-17")
        AND connection_spec.data_direction = 1
      GROUP BY
        remote_ip )
    USING
      (remote_ip)) )
GROUP BY
  name,
  period,
  download_mbps
""")

In [None]:
# Two-week cohorts: qualifying NDT download tests against mlab1 servers at
# dfw, lga and nuq, from clients (remote_ip) seen in both the 'before-2w'
# (2018-02-11 through 2018-02-25) and 'after-2w' (2018-03-04 through
# 2018-03-18) windows. Both BETWEEN ranges are inclusive.
# This rebinds `df_ndt_dist`.
#
# The query is a raw string so the regex's `\d` reaches BigQuery unchanged
# without relying on Python passing an invalid escape sequence through.
df_ndt_dist = run_query(r"""
  -- Two-week cohorts - NDT tests before and after the flow-control configuration change.
SELECT
  REGEXP_EXTRACT(connection_spec.server_hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
  CASE
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25") THEN 'before-2w'
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18") THEN 'after-2w'
    ELSE 'what'
  END AS period,
  --web100_log_entry.connection_spec.remote_ip AS remote_ip,
  (8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd))) AS download_mbps
  --MAX(8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd))) AS download_mbps
  --APPROX_QUANTILES(8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd)), 101)[ORDINAL(50)] as download_mbps
FROM
  `measurement-lab.base_tables.ndt*`
WHERE
  ( -- One week - Sunday to Saturday.
       TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25")
    OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18"))
  AND REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw|lga|nuq)\d\d")
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND connection_spec.data_direction = 1
  AND web100_log_entry.connection_spec.remote_ip IN(
    SELECT
      remote_ip
    FROM (
      SELECT
        web100_log_entry.connection_spec.remote_ip AS remote_ip, count(*) as c1
      FROM
        `measurement-lab.base_tables.ndt*`
      WHERE
        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw|lga|nuq)\d\d")
        AND TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25")
        AND connection_spec.data_direction = 1
      GROUP BY
        remote_ip
    ) INNER JOIN (
      SELECT
        web100_log_entry.connection_spec.remote_ip AS remote_ip, count(*) as c2
      FROM
        `measurement-lab.base_tables.ndt*`
      WHERE
        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw|lga|nuq)\d\d")
        AND TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18")
        AND connection_spec.data_direction = 1
      GROUP BY
        remote_ip
    ) USING (remote_ip))
GROUP BY
  name,
  period,
  --remote_ip
  download_mbps
""")

In [38]:
# Per-client maximum download rate in each two-week window. The `mlab_ndt` CTE
# holds deduplicated, qualifying NDT download tests against mlab1 servers at
# dfw, lga and nuq. The outer query keeps only clients (remote_ip) with more
# than 10 tests in both the 'before-2w' and 'after-2w' windows and returns
# MAX(download_mbps) per (name, period, remote_ip).
# This rebinds `df_ndt_dist`.
#
# The query is a raw string so the regex's `\d` reaches BigQuery unchanged
# without relying on Python passing an invalid escape sequence through.
df_ndt_dist = run_query(r"""
WITH mlab_ndt AS (
  SELECT
    connection_spec.server_hostname as server_hostname,
    log_time,
    web100_log_entry.connection_spec.remote_ip AS remote_ip,
    (8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd))) AS download_mbps
  FROM
    `measurement-lab.base_tables.ndt*`
  WHERE

  (    TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25")
    OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18"))
  AND REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw|lga|nuq)\d\d")
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND connection_spec.data_direction = 1

  GROUP BY
    server_hostname,
    log_time,
    web100_log_entry.connection_spec.remote_ip,
    web100_log_entry.connection_spec.local_ip,
    web100_log_entry.connection_spec.remote_port,
    web100_log_entry.connection_spec.local_port,
    download_mbps)

SELECT
  REGEXP_EXTRACT(server_hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
  CASE
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25") THEN 'before-2w'
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18") THEN 'after-2w'
    ELSE 'what'
  END AS period,
  --web100_log_entry.connection_spec.remote_ip AS remote_ip,
  --(8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd))) AS download_mbps
  MAX(download_mbps) AS download_mbps
  --APPROX_QUANTILES(download_mbps, 101)[ORDINAL(50)] as download_mbps

FROM
  mlab_ndt
WHERE
  remote_ip IN(
    SELECT
      remote_ip
    FROM (
      SELECT
        remote_ip, count(*) as c1
      FROM
        mlab_ndt
      WHERE
        TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25")
      GROUP BY
        remote_ip
      HAVING c1 > 10
    ) INNER JOIN (
      SELECT
        remote_ip AS remote_ip, count(*) as c2
      FROM
        mlab_ndt
      WHERE
        TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18")
      GROUP BY
        remote_ip
      HAVING c2 > 10
    ) USING (remote_ip))
GROUP BY
  name,
  period,
  remote_ip
  --download_mbps
""")

In [39]:
# Show the result's column names and row count. Parenthesized single-argument
# print behaves the same as a Python 2 statement and as a Python 3 call.
print(df_ndt_dist.keys())
print(len(df_ndt_dist))

def hist(vals, bin_count, log=True, cdf=False):
    """Produces hist or cdf values for smooth plots.

    Args:
      vals: iterable of numeric samples. When `log` is true every sample must
        be strictly positive; math.log10 raises ValueError otherwise.
      bin_count: forwarded to np.histogram as its `bins` argument (the number
        of equal-width bins when an integer is given).
      log: when true, the histogram is taken over log10 of the samples.
      cdf: when true, return the cumulative fraction per bin instead of the
        per-bin density.

    Returns:
      A (tops, bins) pair. `bins` is the array of bin edges returned by
      np.histogram (one more entry than `tops`). `tops` is the density per
      bin as a numpy array, or, when `cdf` is true, a list of cumulative
      fractions whose last entry is 1.0.
    """
    if log:
        r = [math.log10(x) for x in vals]
    else:
        r = vals

    # `density` replaces the `normed` keyword, which NumPy deprecated and
    # later removed. For equal-width bins both produce the same values.
    m, bins = np.histogram(r, bin_count, density=True)
    m = m.astype(float)

    tops = m
    if cdf:
        cumulative = np.cumsum(m)
        # The last running sum is the total mass, so the final fraction is 1.
        total = cumulative[-1]
        tops = [float(t) / total for t in cumulative]

    return tops, bins

# Subplot positions in the 3x2 grid, in the order a site's hosts are drawn.
seq = [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)]

# One figure per site (characters 6:9 of the host name, e.g. 'dfw' in
# 'mlab1.dfw01'). Each host gets its own subplot with the 'before-2w' and
# 'after-2w' CDFs of log10(download_mbps) overlaid.
for site in set([v[6:9] for v in set(df_ndt_dist['name'])]):
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 6))
    for p, h in enumerate(sorted([h for h in set(df_ndt_dist['name']) if site in h])):
        before = None
        for day in ['before-2w', 'after-2w']:
            ds = df_ndt_dist[(df_ndt_dist['name'] == h) & (df_ndt_dist['period'] == day)]
            r = ds['download_mbps']
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print('%s %s' % (h, len(r)))
            if not len(r):
                continue

            # Both periods share the 'before' bin count so the two curves are
            # comparable. If the host had no 'before' samples, fall back to
            # this period's own sqrt(n) instead of passing None as bin count.
            size = int(math.sqrt(len(r)))
            if day == 'before-2w':
                before = size
            elif before is not None:
                size = before

            print('%s %s %s' % (size, h, day))

            # Only len(seq) subplots exist per figure; check before doing the
            # histogram work for a host that cannot be drawn.
            if p > len(seq) - 1:
                print('skipping %s' % h)
                continue

            tops, bins = hist(r, size, log=True, cdf=True)
            i, j = seq[p]
            ax = axes[i, j]
            # bins holds one more edge than tops; plot against left edges.
            ax.plot(bins[:-1], tops, label='cdf-' + h[6:11] + '-' + str(day))
            ax.set_title(h[6:11])
            # The x axis is in log10 units; logFormatter renders tick labels
            # back in linear units.
            ax.set_xlim(math.log10(.25), math.log10(1000))
            ax.grid(color='#dddddd')
            ax.legend(loc=4, fontsize='x-small')
            ax.set_ylim(-0.1, 1.1)
            ax.xaxis.set_major_formatter(logFormatter)
    fig.subplots_adjust(hspace=0.3, wspace=0.4)
    fig.suptitle('NDT Download Distributions')

Index([u'download_mbps', u'name', u'period'], dtype='object')
48349
mlab1.dfw01 1561
39 mlab1.dfw01 before-2w
mlab1.dfw01 1549
39 mlab1.dfw01 after-2w
mlab1.dfw02 1542
39 mlab1.dfw02 before-2w
mlab1.dfw02 1536
39 mlab1.dfw02 after-2w
mlab1.dfw03 1541
39 mlab1.dfw03 before-2w
mlab1.dfw03 1554
39 mlab1.dfw03 after-2w
mlab1.dfw04 1539
39 mlab1.dfw04 before-2w
mlab1.dfw04 1563
39 mlab1.dfw04 after-2w
mlab1.dfw05 1959
44 mlab1.dfw05 before-2w
mlab1.dfw05 1966
44 mlab1.dfw05 after-2w
mlab1.dfw06 1547
39 mlab1.dfw06 before-2w
mlab1.dfw06 1554
39 mlab1.dfw06 after-2w
mlab1.lga02 543
23 mlab1.lga02 before-2w
mlab1.lga02 2369
23 mlab1.lga02 after-2w
mlab1.lga03 1901
43 mlab1.lga03 before-2w
mlab1.lga03 1868
43 mlab1.lga03 after-2w
mlab1.lga04 1894
43 mlab1.lga04 before-2w
mlab1.lga04 1876
43 mlab1.lga04 after-2w
mlab1.lga05 2267
47 mlab1.lga05 before-2w
mlab1.lga05 2255
47 mlab1.lga05 after-2w
mlab1.lga06 1908
43 mlab1.lga06 before-2w
mlab1.lga06 1891
43 mlab1.lga06 after-2w
mlab1.lga07 1815
42 

In [None]:
#from scipy import stats

In [None]:
#len(n), len(bins[:-1])
#fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))

#axes.plot(bins[:-1], n)
#fig.subplots_adjust(hspace=0.3, wspace=0.4)
#print f


In [None]:
#m, bins = np.histogram(r, int(math.sqrt(len(ds['download_mbps']))))
#print m, bins, len(m), len(bins)

In [64]:
# Per-client spread of NDT download rates, before vs. after.
#
# The `mlab_ndt` CTE keeps rows from the two date windows on mlab1 hosts at
# dfw/lga/nuq with data_direction = 1, at least 1000000 acked octets, and a
# summed SndLimTime{Rwin,Cwnd,Snd} in [9000000, 600000000). The GROUP BY on
# the connection fields collapses duplicate rows.
# NOTE(review): data_direction = 1 is presumably "download" and the SndLimTime
# counters presumably microseconds (making the ratio Mbps) -- confirm.
#
# The outer query keeps only remote_ips with more than 10 rows in *each*
# window and returns STDDEV(download_mbps) per (hostname, period, remote_ip),
# dropping groups whose stddev is NULL.
#
# The query is a raw Python string so the regex escapes (\d, \.) reach
# BigQuery verbatim instead of being treated as Python escape sequences.
df_ndt_variance = run_query(r"""
WITH mlab_ndt AS (
  SELECT
    connection_spec.server_hostname as server_hostname,
    log_time,
    web100_log_entry.connection_spec.remote_ip AS remote_ip,
    (8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd))) AS download_mbps
  FROM
    `measurement-lab.base_tables.ndt*`
  WHERE

  (    TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25")
    OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18"))
  AND REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1\.(dfw|lga|nuq)\d\d")
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND connection_spec.data_direction = 1

  GROUP BY
    server_hostname,
    log_time,
    web100_log_entry.connection_spec.remote_ip,
    web100_log_entry.connection_spec.local_ip,
    web100_log_entry.connection_spec.remote_port,
    web100_log_entry.connection_spec.local_port,
    download_mbps)


SELECT
  REGEXP_EXTRACT(server_hostname, r'(mlab[1-4]\.[a-z]{3}[0-9]{2}).*') AS hostname,
  CASE
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25") THEN 'before-2w'
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18") THEN 'after-2w'
    ELSE 'what'
  END AS period,
  remote_ip,
  STDDEV(download_mbps) AS download_stddev

FROM
  mlab_ndt
WHERE
  remote_ip IN(
    SELECT
      remote_ip
    FROM (
      SELECT
        remote_ip, count(*) as c1
      FROM
        mlab_ndt
      WHERE
        TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25")
      GROUP BY
        remote_ip
      HAVING c1 > 10
    ) INNER JOIN (
      SELECT
        remote_ip AS remote_ip, count(*) as c2
      FROM
        mlab_ndt
      WHERE
        TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18")
      GROUP BY
        remote_ip
      HAVING c2 > 10
    ) USING (remote_ip))
GROUP BY
  hostname,
  period,
  remote_ip
  --download_mbps

HAVING download_stddev is not NULL
""")

In [42]:
# Row count of the per-client stddev result. Single-argument print(...)
# emits the same text on Python 2 and Python 3.
print(len(df_ndt_variance))

48349


In [69]:
# Subplot positions in the 3x2 grid, in the order a site's hosts are drawn.
seq = [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)]

# One figure per site (characters 6:9 of the hostname, e.g. 'dfw' in
# 'mlab1.dfw01'). Each host gets its own subplot with the 'before-2w' and
# 'after-2w' CDFs of log10(download_stddev) overlaid.
for site in set([v[6:9] for v in set(df_ndt_variance['hostname'])]):
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 6))
    for p, h in enumerate(sorted([h for h in set(df_ndt_variance['hostname']) if site in h])):
        before = None
        for day in ['before-2w', 'after-2w']:
            ds = df_ndt_variance[(df_ndt_variance['hostname'] == h) & (df_ndt_variance['period'] == day)]
            r = ds['download_stddev']
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print('%s %s' % (h, len(r)))
            if not len(r):
                continue

            # Both periods share the 'before' bin count so the two curves are
            # comparable. If the host had no 'before' samples, fall back to
            # this period's own sqrt(n) instead of passing None as bin count.
            size = int(math.sqrt(len(r)))
            if day == 'before-2w':
                before = size
            elif before is not None:
                size = before

            # Only len(seq) subplots exist per figure; check before doing the
            # histogram work for a host that cannot be drawn.
            if p > len(seq) - 1:
                print('skipping %s' % h)
                continue

            tops, bins = hist(r, size, log=True, cdf=True)
            i, j = seq[p]
            ax = axes[i, j]
            # bins holds one more edge than tops; plot against left edges.
            ax.plot(bins[:-1], tops, label='cdf-' + h[6:11] + '-' + str(day))
            ax.set_title(h[6:11])
            ax.grid(color='#dddddd')
            ax.legend(loc=2, fontsize='x-small')
            # The x axis is in log10 units; logFormatter renders tick labels
            # back in linear units.
            ax.xaxis.set_major_formatter(logFormatter)
    fig.subplots_adjust(hspace=0.3, wspace=0.4)
    # These panels show per-client standard deviations, not raw download
    # rates, so the figure title says so.
    fig.suptitle('NDT Download Stddev Distributions')

mlab1.dfw01 1412
mlab1.dfw01 1416
mlab1.dfw02 1444
mlab1.dfw02 1450
mlab1.dfw03 1408
mlab1.dfw03 1438
mlab1.dfw04 1421
mlab1.dfw04 1450
mlab1.dfw05 1851
mlab1.dfw05 1855
mlab1.dfw06 1431
mlab1.dfw06 1449
mlab1.lga02 291
mlab1.lga02 2230
mlab1.lga03 1786
mlab1.lga03 1717
mlab1.lga04 1764
mlab1.lga04 1717
mlab1.lga05 2177
mlab1.lga05 2098
mlab1.lga06 1805
mlab1.lga06 1740
mlab1.lga07 1701
mlab1.lga07 1646
mlab1.nuq02 645
mlab1.nuq02 644
mlab1.nuq03 558
mlab1.nuq03 629
mlab1.nuq04 562
mlab1.nuq04 560
mlab1.nuq05 627
mlab1.nuq05 638
mlab1.nuq06 641
mlab1.nuq06 641


In [67]:
#print r