In [1]:
# Enables figures loading outside of browser.
# If not run, figures will load inline.
%matplotlib

import os
import math
import pandas as pd
import numpy as np
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import matplotlib.ticker
import datetime
import collections

from scipy import stats

# Depends on: pip install scikit-learn
from sklearn.model_selection import train_test_split

# Some matplotlib features are version dependent.
def _version_tuple(version):
    """Returns the leading numeric components of a dotted version string.

    Only the first three components are considered and any non-digit suffix
    of a component is ignored, e.g. '2.1.2rc1' -> (2, 1, 2).
    """
    numbers = []
    for piece in version.split('.')[:3]:
        digits = ''
        for ch in piece:
            if not ch.isdigit():
                break
            digits += ch
        if not digits:
            break
        numbers.append(int(digits))
    return tuple(numbers)

# Compare numerically: as plain strings '10.0.0' >= '2.1.2' is False because
# the comparison is character by character.
assert _version_tuple(matplotlib.__version__) >= (2, 1, 2), (
    'matplotlib >= 2.1.2 is required, found %s' % matplotlib.__version__)

# Depends on: pip install --upgrade google-cloud-bigquery
from google.cloud import bigquery

def run_query(query, project='mlab-sandbox', timeout=300):
    """Runs a BigQuery query and returns the result rows as a DataFrame.

    Args:
      query: str, the SQL text passed to `bigquery.Client.query`.
      project: str, the project the BigQuery client is created for.
      timeout: number of seconds to wait for the query job's result.

    Returns:
      A pandas.DataFrame with one column per result field and one row per
      result row. A query that returns no rows yields an empty DataFrame
      without columns.
    """
    client = bigquery.Client(project=project)
    job = client.query(query)

    # Accumulate values column-wise, keyed by result field name.
    results = collections.defaultdict(list)
    for row in job.result(timeout=timeout):
        for key, value in row.items():
            results[key].append(value)

    return pd.DataFrame(results)

def unlog(x, pos):
    """Tick formatter that maps a log10 axis position back to its value.

    Args:
      x: tick position, i.e. the base-10 logarithm of the value to show.
      pos: tick index passed by the formatter machinery; not used.

    Returns:
      10**x as a string: whole numbers are printed without decimals, any
      other value is rounded to one decimal place.
    """
    value = math.pow(10, x)
    fractional_part, integral_part = math.modf(value)
    if not fractional_part > 0:
        return '%d' % integral_part
    return '%.1f' % value

# Tick formatter that applies unlog() to each tick position; used below on
# axes whose data is plotted as log10 values.
logFormatter = matplotlib.ticker.FuncFormatter(unlog)

Using matplotlib backend: MacOSX


# UPLINK UTILIZATION OVER TIME

In [186]:
# Daily 50th and 90th percentile of the 'switch.octets.uplink.tx' samples per
# mlab1 host. APPROX_QUANTILES(value, 100) returns 101 boundaries (minimum,
# 99 interior quantiles, maximum), so OFFSET(p) is the p-th percentile. The
# previous `APPROX_QUANTILES(value, 101)[ORDINAL(50)]` selected the 49/101
# quantile (and ORDINAL(90) the 89/101 quantile) instead.
df_disco_pct = run_query("""
#standardSQL
SELECT

  name AS hostname,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,
  APPROX_QUANTILES(value, 100)[OFFSET(50)] as bytes_50th,
  APPROX_QUANTILES(value, 100)[OFFSET(90)] as bytes_90th

FROM (
  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab1.[a-z]{3}[0-9]{2}).*') AS name,
    sample.timestamp AS sts,
    sample.value AS value
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
    metric LIKE 'switch.octets.uplink.tx'
  GROUP BY
    hostname, metric, sts, value
)
WHERE
  name IS NOT NULL
GROUP BY
  hostname, day, ts
ORDER BY
  hostname, day, ts
""")



In [232]:
# Site prefixes to plot; each inner list is one row of subplots.
sites = [
    ['dfw', 'lga', 'nuq'],
#    ['sea', 'atl', 'den'],
#    ['mia', 'nuq', 'ord'],
]

n_rows = len(sites)
n_cols = max(len(site_row) for site_row in sites)

# squeeze=False keeps `axes` two-dimensional even for a single row, so the
# same [i][j] indexing keeps working when more site rows are enabled.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(16, 4 * n_rows), squeeze=False)

# Take the host names from the frame being plotted. The previous version read
# them from df_disco, which is only created by a later cell and therefore
# fails when the notebook is run top to bottom.
hostnames = sorted(set(df_disco_pct['hostname']))

for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = axes[i][j]
        # Only the first column keeps y tick labels and only the last row
        # keeps x tick labels.
        if j != 0:
            ax.set_yticklabels([])
        if i != len(sites)-1:
            ax.set_xticklabels([])
        for h in hostnames:
            if ('mlab1.' + site) in h:
                # Days whose median is at or below 1e5 are left out of the line.
                ds = df_disco_pct[ (df_disco_pct['hostname'] == h) & (df_disco_pct['bytes_50th'] > 1e5) ]
                ax.plot_date(dates.epoch2num(ds['ts']), ds['bytes_50th'], ls='-', ms=0, label=h[6:11])

        ax.set_title(site)
        ax.set_ylim(1e4, 1e9)
        ax.set_xlim(dates.epoch2num(1464739200), dates.epoch2num(1534204800))
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(color='#dddddd')
        ax.legend(loc=3, ncol=4, fontsize='x-small', columnspacing=1)
        ax.semilogy()

fig.subplots_adjust(hspace=0.3, wspace=0.4)
fig.suptitle('50th Percentile Uplink Utilization Over time')

Text(0.5,0.98,'50th Percentile Uplink Utilization Over time')

In [None]:
# DISCARDS OVER TIME

In [5]:
# Per-host daily totals of the 'switch.discards.uplink.tx' metric.
# The inner query unnests the samples, and its GROUP BY collapses rows that
# are identical in (hostname, metric, timestamp, value). The outer query sums
# the values per extracted host name and day. Rows whose hostname does not
# match the mlab[1-4] pattern are dropped by `name IS NOT NULL`.
df_disco = run_query("""
#standardSQL
SELECT
  name AS hostname,
  FORMAT_TIMESTAMP("%Y-%m-%d", TIMESTAMP_TRUNC(sts, DAY)) AS day,
  UNIX_SECONDS(TIMESTAMP_TRUNC(sts, DAY)) AS ts,
  SUM(IF(metric = 'switch.discards.uplink.tx', value, 0)) AS total_discards
FROM (
  SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
    sample.timestamp AS sts,
    sample.value AS value
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
    metric LIKE 'switch.discards.uplink.tx'
  GROUP BY
    hostname, metric, sts, value
)
WHERE
  name IS NOT NULL
GROUP BY
  hostname, day, ts
ORDER BY
  hostname, day, ts
""")



In [231]:
# Site prefixes to plot; each inner list is one row of subplots.
sites = [
    ['dfw', 'lga', 'nuq'],
#    ['sea', 'atl', 'den'],
#    ['mia', 'nuq', 'ord'],
]

# Daily totals outside this open interval are left out of the lines; the same
# bounds are used as the y-axis limits.
MIN_DISCARDS = 100
MAX_DISCARDS = 1000000

# Size the grid from `sites` so enabling more rows above does not index past
# a fixed-size axes list or a fixed (1, 3) grid.
n_rows = len(sites)
n_cols = max(len(site_row) for site_row in sites)
axes = [[None] * n_cols for _ in sites]

fig = plt.figure(figsize=(16, 4 * n_rows))
hostnames = sorted(set(df_disco['hostname']))

for i, site_row in enumerate(sites):
    for j, site in enumerate(site_row):
        ax = plt.subplot2grid((n_rows, n_cols), (i, j))
        axes[i][j] = ax
        # Only the first column keeps y tick labels and only the last row
        # keeps x tick labels.
        if j != 0:
            ax.set_yticklabels([])
        if i != len(sites)-1:
            ax.set_xticklabels([])
        for h in hostnames:
            if ('mlab1.' + site) in h:
                ds = df_disco[ (df_disco['hostname'] == h)
                               & (df_disco['total_discards'] > MIN_DISCARDS)
                               & (df_disco['total_discards'] < MAX_DISCARDS) ]
                ax.plot_date(dates.epoch2num(ds['ts']), ds['total_discards'], ls='-', ms=0, label=h[6:11])

        ax.set_title(site)
        ax.set_ylim(MIN_DISCARDS, MAX_DISCARDS)
        ax.set_xlim(dates.epoch2num(1464739200), dates.epoch2num(1534204800))
        ax.tick_params(axis='x', labelrotation=-35)
        ax.grid(color='#dddddd')
        ax.legend(loc=2, fontsize='x-small')
        ax.semilogy()

fig.subplots_adjust(hspace=0.3, wspace=0.4)
fig.suptitle('Discards over time')

Text(0.5,0.98,'Discards over time')

In [None]:
# Two-Week Performance Distributions -- Before & After

In [235]:
# Uplink tx samples for mlab1.lga03 and mlab1.dfw02, each labelled with the
# period ('before-2w' / 'after-2w') its day falls into.
# Uses run_query() like the other cells: the previous `query.sync_query` call
# referenced a name `query` that is not imported or defined in this notebook.
df_disco_dist = run_query("""
SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
    CASE
        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25") THEN 'before-2w'
        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18") THEN 'after-2w'

--        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-02-11") and TIMESTAMP("2018-02-18")          THEN 'before-2w'
--        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-02-18") and TIMESTAMP("2018-02-25")          THEN 'before-1w'
--        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-03-04") and TIMESTAMP("2018-03-11")          THEN 'after-1w'
--        WHEN TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-03-11") and TIMESTAMP("2018-03-18")          THEN 'after-2w'
        ELSE 'what'
    END as period,
    TIMESTAMP_TRUNC(sample.timestamp, DAY) AS day,
    UNIX_SECONDS(sample.timestamp) AS sts,
    8 * sample.value / 10 AS bps
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
       metric LIKE 'switch.octets.uplink.tx'
    AND (
        hostname LIKE '%mlab1.lga03%' OR hostname LIKE '%mlab1.dfw02%'
    )
    AND ( -- One week - Sunday to Saturday.
          TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-02-11") and TIMESTAMP("2018-02-25")
       OR TIMESTAMP_TRUNC(sample.timestamp, DAY) BETWEEN TIMESTAMP("2018-03-04") and TIMESTAMP("2018-03-18")
    )
  GROUP BY
    hostname, metric, sample.timestamp, bps
  ORDER BY
    hostname, sts
""")



In [236]:
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(df_disco_dist.keys())
print(len(df_disco_dist))

# One figure per host, with one density histogram of log10(bps) per period.
for h in sorted(set(df_disco_dist['name'])):
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
    for day in sorted(set(df_disco_dist['period'])):
        ds = df_disco_dist[ (df_disco_dist['name'] == h) & (df_disco_dist['period'] == day) ]
        # log10 is undefined for zero or negative rates, so those samples are
        # left out instead of aborting the whole cell.
        bps = ds['bps'][ds['bps'] > 0]
        if not len(bps):
            # Nothing to draw; an empty subset would also yield zero bins.
            continue
        r = np.log10(bps)

        axes.hist(r,
                  int(1.5 * math.sqrt(len(bps))),
                  histtype='step',
                  density=1,
                  label='cdf-'+h[6:11] + '-' + str(day), ls='-')

    axes.set_title(h[6:11])
    axes.grid(color='#dddddd')
    axes.legend(loc=2, fontsize='x-small')
    # The x axis holds log10 values; show them as the original rates.
    axes.xaxis.set_major_formatter(logFormatter)
    fig.subplots_adjust(hspace=0.3, wspace=0.4)
    fig.suptitle('Uplink Utilization Distribution')

Index([u'bps', u'day', u'metric', u'name', u'period', u'sts'], dtype='object')
517494


In [None]:
# One Day Performance Distributions -- before & after

With single-day samples it is harder to notice changes between before and after.

In [35]:
# Uplink tx samples for mlab1.lga03 and mlab1.dfw02 on four individual days
# (2018-02-18, 2018-02-25, 2018-03-04 and 2018-03-11).
# Uses run_query() like the other cells: the previous `query.sync_query` call
# referenced a name `query` that is not imported or defined in this notebook.
df_disco_dist = run_query("""
SELECT
    metric,
    REGEXP_EXTRACT(hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
    TIMESTAMP_TRUNC(sample.timestamp, DAY) AS day,
    UNIX_SECONDS(sample.timestamp) AS sts,
    8 * sample.value / 10 AS bps
  FROM
    `measurement-lab.base_tables.switch*`,
    UNNEST(sample) AS sample
  WHERE
       metric LIKE 'switch.octets.uplink.tx'
    AND (
        hostname LIKE '%mlab1.lga03%' OR hostname LIKE '%mlab1.dfw02%'
    )
    AND (
        TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-02-18")
     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-02-25")
     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-04")
     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-11")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-04")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-05")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-10")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-01")
--     OR TIMESTAMP_TRUNC(sample.timestamp, DAY) = TIMESTAMP("2018-03-01")
    )
  GROUP BY
    hostname, metric, sample.timestamp, bps
  ORDER BY
    hostname, sts
""")



In [None]:
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(df_disco_dist.keys())
print(len(df_disco_dist))

# One figure per host, with one density histogram of log10(bps) per day.
for h in sorted(set(df_disco_dist['name'])):
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
    for day in sorted(set(df_disco_dist['day'])):
        ds = df_disco_dist[ (df_disco_dist['name'] == h) & (df_disco_dist['day'] == day) ]
        # log10 is undefined for zero or negative rates, so those samples are
        # left out instead of aborting the whole cell.
        bps = ds['bps'][ds['bps'] > 0]
        if not len(bps):
            # Nothing to draw; an empty subset would also yield zero bins.
            continue
        r = np.log10(bps)

        axes.hist(r,
                  int(1.5 * math.sqrt(len(bps))),
                  histtype='step',
                  density=1,
                  label='cdf-'+h[6:11] + '-' + str(day), ls='-')

    axes.set_title(h[6:11])
    axes.grid(color='#dddddd')
    axes.legend(loc=2, fontsize='x-small')
    # The x axis holds log10 values; show them as the original rates.
    axes.xaxis.set_major_formatter(logFormatter)
    fig.subplots_adjust(hspace=0.3, wspace=0.4)
    fig.suptitle('Uplink Utilization Distribution')

In [None]:
# NDT test distributions - Before & After

In [54]:
# NDT rows served by mlab1.dfw02 / mlab1.lga03, each labelled with one of four
# periods. `download_mbps` is 8 * HCThruOctetsAcked divided by the sum of the
# SndLimTimeRwin, SndLimTimeCwnd and SndLimTimeSnd counters; rows are kept
# only when HCThruOctetsAcked >= 1000000 and that sum is in
# [9000000, 600000000).
# NOTE(review): BETWEEN is inclusive, so 2018-02-18 and 2018-03-11 satisfy two
# WHEN branches each and get the label of the first one -- confirm whether
# the windows were meant to overlap.
# NOTE(review): df_ndt_dist is assigned again by the two query cells that
# follow, so this result is replaced when the notebook runs top to bottom.
df_ndt_dist = run_query("""
-- ALL NDT tests before and after the flow-control configuration change.
   
SELECT
    REGEXP_EXTRACT(connection_spec.server_hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
    CASE
     WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") and TIMESTAMP("2018-02-18")
       THEN 'before-2w'
     WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-18") and TIMESTAMP("2018-02-25")
       THEN 'before-1w'
     WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") and TIMESTAMP("2018-03-11")
       THEN 'after-1w'
     WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") and TIMESTAMP("2018-03-18")
       THEN 'after-2w'
     ELSE 'what'
    END as period,

    web100_log_entry.snap.StartTimeStamp AS ts,
    8 * (web100_log_entry.snap.HCThruOctetsAcked /
      (web100_log_entry.snap.SndLimTimeRwin +
       web100_log_entry.snap.SndLimTimeCwnd +
       web100_log_entry.snap.SndLimTimeSnd)) as download_mbps   
FROM
   `measurement-lab.base_tables.ndt*`
   
WHERE       
    ( -- One week - Sunday to Saturday.
        TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") and TIMESTAMP("2018-02-18")
     OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-18") and TIMESTAMP("2018-02-25")
     OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") and TIMESTAMP("2018-03-11")
     OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") and TIMESTAMP("2018-03-18"))
  AND REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw02|lga03)")
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
  AND (web100_log_entry.snap.SndLimTimeRwin +                                   
    web100_log_entry.snap.SndLimTimeCwnd +                                      
    web100_log_entry.snap.SndLimTimeSnd) >= 9000000                             
  AND (web100_log_entry.snap.SndLimTimeRwin +                                   
    web100_log_entry.snap.SndLimTimeCwnd +                                      
    web100_log_entry.snap.SndLimTimeSnd) < 600000000                            
    
GROUP BY
  name, period, ts, download_mbps
""")



In [65]:
# Weekly-cohort variant of the NDT query: the same period labels and
# download_mbps expression, restricted to remote IPs returned by the IN(...)
# subquery. That subquery inner-joins four per-window sets of remote_ip (one
# per period, each filtered on data_direction = 1), so only IPs present in
# all four windows are kept.
# NOTE(review): the outer WHERE has no data_direction filter, unlike the
# subqueries -- confirm whether that is intended.
# NOTE(review): df_ndt_dist is assigned again by the next query cell.
df_ndt_dist = run_query("""
  -- Weekly cohort - NDT tests before and after the flow-control configuration change.
SELECT
  REGEXP_EXTRACT(connection_spec.server_hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
  CASE
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-18") THEN 'before-2w'
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-18") AND TIMESTAMP("2018-02-25") THEN 'before-1w'
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-11") THEN 'after-1w'
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") AND TIMESTAMP("2018-03-18") THEN 'after-2w'
    ELSE 'what'
  END AS period,
  8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd)) AS download_mbps
FROM
  `measurement-lab.base_tables.ndt*`
WHERE
  ( -- One week - Sunday to Saturday.
    TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-18")
    OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-18") AND TIMESTAMP("2018-02-25")
    OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-11")
    OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11") AND TIMESTAMP("2018-03-18"))
  AND REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw02|lga03)")
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND web100_log_entry.connection_spec.remote_ip IN( (
    SELECT
      remote_ip
    FROM (
      SELECT
        web100_log_entry.connection_spec.remote_ip AS remote_ip
      FROM
        `measurement-lab.base_tables.ndt*`
      WHERE
        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw02|lga03)")
        AND TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11")
        AND TIMESTAMP("2018-02-18")
        AND connection_spec.data_direction = 1
      GROUP BY
        remote_ip )
    INNER JOIN (
      SELECT
        web100_log_entry.connection_spec.remote_ip AS remote_ip
      FROM
        `measurement-lab.base_tables.ndt*`
      WHERE
        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw02|lga03)")
        AND TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-18")
        AND TIMESTAMP("2018-02-25")
        AND connection_spec.data_direction = 1
      GROUP BY
        remote_ip )
    USING
      (remote_ip)
    INNER JOIN (
      SELECT
        web100_log_entry.connection_spec.remote_ip AS remote_ip
      FROM
        `measurement-lab.base_tables.ndt*`
      WHERE
        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw02|lga03)")
        AND TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04")
        AND TIMESTAMP("2018-03-11")
        AND connection_spec.data_direction = 1
      GROUP BY
        remote_ip )
    USING
      (remote_ip)
    INNER JOIN (
      SELECT
        web100_log_entry.connection_spec.remote_ip AS remote_ip
      FROM
        `measurement-lab.base_tables.ndt*`
      WHERE
        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw02|lga03)")
        AND TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-11")
        AND TIMESTAMP("2018-03-18")
        AND connection_spec.data_direction = 1
      GROUP BY
        remote_ip )
    USING
      (remote_ip)) )
GROUP BY
  name,
  period,
  download_mbps
""")



In [42]:
# Two-week cohorts: NDT rows with data_direction = 1 served by mlab1 hosts at
# dfw, lga and nuq, labelled 'before-2w' or 'after-2w', restricted to remote
# IPs that appear in both windows (inner join of the two per-window sets).
# The literal is a raw string because the SQL regexes contain `\d`, which is
# an invalid escape sequence in a regular Python string; the SQL text sent to
# BigQuery is unchanged.
df_ndt_dist = run_query(r"""
  -- Two-week cohorts - NDT tests before and after the flow-control configuration change.
SELECT
  REGEXP_EXTRACT(connection_spec.server_hostname, r'(mlab[1-4].[a-z]{3}[0-9]{2}).*') AS name,
  CASE
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25") THEN 'before-2w'
    WHEN TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18") THEN 'after-2w'
    ELSE 'what'
  END AS period,
  --web100_log_entry.connection_spec.remote_ip AS remote_ip,
  (8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd))) AS download_mbps
  --MAX(8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd))) AS download_mbps
  --APPROX_QUANTILES(8 * (web100_log_entry.snap.HCThruOctetsAcked / (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd)), 101)[ORDINAL(50)] as download_mbps
FROM
  `measurement-lab.base_tables.ndt*`
WHERE
  ( -- One week - Sunday to Saturday.
       TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25")
    OR TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18"))
  AND REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw|lga|nuq)\d\d")
  AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) >= 9000000
  AND (web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd) < 600000000
  AND connection_spec.data_direction = 1
  AND web100_log_entry.connection_spec.remote_ip IN(
    SELECT
      remote_ip
    FROM (
      SELECT
        web100_log_entry.connection_spec.remote_ip AS remote_ip, count(*) as c1
      FROM
        `measurement-lab.base_tables.ndt*`
      WHERE
        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw|lga|nuq)\d\d")
        AND TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-02-11") AND TIMESTAMP("2018-02-25")
        AND connection_spec.data_direction = 1
      GROUP BY
        remote_ip
    ) INNER JOIN (
      SELECT
        web100_log_entry.connection_spec.remote_ip AS remote_ip, count(*) as c2
      FROM
        `measurement-lab.base_tables.ndt*`
      WHERE
        REGEXP_CONTAINS(connection_spec.server_hostname, r"mlab1.(dfw|lga|nuq)\d\d")
        AND TIMESTAMP_TRUNC(log_time, DAY) BETWEEN TIMESTAMP("2018-03-04") AND TIMESTAMP("2018-03-18")
        AND connection_spec.data_direction = 1
      GROUP BY
        remote_ip
    ) USING (remote_ip)) 
GROUP BY
  name,
  period,
  --remote_ip
  download_mbps
""")

In [44]:
# Column names and row count of the NDT result. Single-argument print(...)
# produces the same output on Python 2 and also runs on Python 3.
print(df_ndt_dist.keys())
print(len(df_ndt_dist))

def hist(vals, bin_count, log=True, cdf=False):
    """Produces hist or cdf values for smooth plots.

    Args:
      vals: sequence of numbers; must all be positive when `log` is True,
        because math.log10 raises ValueError otherwise.
      bin_count: number of equal-width bins passed to np.histogram.
      log: if True, the histogram is taken over log10 of the values.
      cdf: if True, return the normalized cumulative sum of the densities
        instead of the densities themselves.

    Returns:
      (tops, bins): `bins` holds the bin_count + 1 bin edges. `tops` holds one
      value per bin: the density (ndarray) when `cdf` is False, or the
      cumulative fraction ending at 1.0 (list of floats) when `cdf` is True.
    """
    if log:
        r = [math.log10(x) for x in vals]
    else:
        r = vals

    # `density=True` replaces the `normed` keyword, which NumPy deprecated
    # and later removed; for equal-width bins the result is the same.
    m, bins = np.histogram(r, bin_count, density=True)
    m = m.astype(float)

    if not cdf:
        return m, bins

    cumulative = np.cumsum(m)
    total = cumulative[-1]
    tops = [float(t) / total for t in cumulative]
    return tops, bins

# (row, col) subplot position for the p-th host of a site in the 3x2 grid.
seq = [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)]

# One figure per site; the site is the three-character code at positions 6-8
# of the host name (e.g. 'dfw' in 'mlab1.dfw02').
for site in sorted(set(v[6:9] for v in set(df_ndt_dist['name']))):
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 6))
    hosts = sorted(h for h in set(df_ndt_dist['name']) if site in h)
    for p, h in enumerate(hosts):
        if p >= len(seq):
            # More hosts than subplots: skip before doing any histogram work.
            print('skipping %s' % h)
            continue
        i, j = seq[p]

        # Bin count of the 'before-2w' histogram, reused for 'after-2w' so
        # both curves of a host use the same number of bins.
        before = None
        for day in ['before-2w', 'after-2w']:
            ds = df_ndt_dist[ (df_ndt_dist['name'] == h) & (df_ndt_dist['period'] == day) ]
            r = ds['download_mbps']
            print('%s %d' % (h, len(r)))
            if not len(r):
                continue

            size = int(math.sqrt(len(r)))
            if day == 'after-2w' and before is not None:
                size = before
            else:
                # Also reached for 'after-2w' when the 'before-2w' subset was
                # empty, in which case the subset's own bin count is used
                # instead of passing None to hist().
                before = size

            print('%d %s %s' % (size, h, day))
            tops, bins = hist(r, size, log=True, cdf=False)

            axes[i, j].plot(bins[:-1], tops, label='cdf-'+h[6:11] + '-' + str(day))
            axes[i, j].set_title(h[6:11])
            # The x axis is log10(download_mbps); limits correspond to
            # 0.25 and 1000 in the original units.
            axes[i, j].set_xlim(math.log10(.25), math.log10(1000))
            axes[i, j].grid(color='#dddddd')
            axes[i, j].legend(loc=4, fontsize='x-small')
            axes[i, j].set_ylim(-0.1, 1.1)
            axes[i, j].xaxis.set_major_formatter(logFormatter)
    fig.subplots_adjust(hspace=0.3, wspace=0.4)
    fig.suptitle('NDT Download Distributions')

Index([u'download_mbps', u'name', u'period'], dtype='object')
1245080
mlab1.dfw01 29573
171 mlab1.dfw01 before-2w
mlab1.dfw01 40542
171 mlab1.dfw01 after-2w
mlab1.dfw02 35691
188 mlab1.dfw02 before-2w
mlab1.dfw02 50937
188 mlab1.dfw02 after-2w
mlab1.dfw03 36493
191 mlab1.dfw03 before-2w
mlab1.dfw03 51686
191 mlab1.dfw03 after-2w
mlab1.dfw04 34892
186 mlab1.dfw04 before-2w
mlab1.dfw04 51277
186 mlab1.dfw04 after-2w
mlab1.dfw05 42975
207 mlab1.dfw05 before-2w
mlab1.dfw05 58017
207 mlab1.dfw05 after-2w
mlab1.dfw06 35763
189 mlab1.dfw06 before-2w
mlab1.dfw06 52020
189 mlab1.dfw06 after-2w
mlab1.lga02 1198
34 mlab1.lga02 before-2w
mlab1.lga02 66686
34 mlab1.lga02 after-2w
mlab1.lga03 44086
209 mlab1.lga03 before-2w
mlab1.lga03 55160
209 mlab1.lga03 after-2w
mlab1.lga04 43125
207 mlab1.lga04 before-2w
mlab1.lga04 53630
207 mlab1.lga04 after-2w
mlab1.lga05 51216
226 mlab1.lga05 before-2w
mlab1.lga05 62131
226 mlab1.lga05 after-2w
mlab1.lga06 44109
210 mlab1.lga06 before-2w
mlab1.lga06 55263
2

In [144]:
from scipy import stats

In [80]:
# Scratch cell: plots the values `n` against the left bin edges `bins[:-1]`.
# NOTE(review): `n` and `f` are not defined in any visible cell, and `bins` is
# whatever an earlier cell last assigned, so this cell would raise NameError
# on a fresh kernel unless they are defined elsewhere -- confirm whether the
# cell is still needed.
len(n), len(bins[:-1])
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))

axes.plot(bins[:-1], n)
fig.subplots_adjust(hspace=0.3, wspace=0.4)
print f


[<matplotlib.lines.Line2D object at 0x7fee75327390>]


In [83]:
# Scratch cell: raw (unnormalized) histogram counts and bin edges.
# NOTE(review): `r` and `ds` are whatever an earlier cell last left behind;
# confirm which subset this is meant to inspect.
m, bins = np.histogram(r, int(math.sqrt(len(ds['download_mbps']))))
# %-formatting yields the same space-separated output as the Python 2 print
# statement and also runs on Python 3.
print('%s %s %d %d' % (m, bins, len(m), len(bins)))

[  1   2   9  15  19  30  27  23  32  26  22  21  30  33  32  25  38  51
  46  33  47  42  53  45  48  63  52  65  81  64  90  55  49  78  88  95
 110 101  82  91 115 115 137 111 113 107 121 118 175 162 175 211 224 228
 310 434 183 166 165 102  80 109 108 128 137 133 116 113  57  50  21  26
  18  21  19  10  27  22  20   7   8] [-0.24250399 -0.2029988  -0.16349361 -0.12398841 -0.08448322 -0.04497803
 -0.00547284  0.03403235  0.07353754  0.11304274  0.15254793  0.19205312
  0.23155831  0.2710635   0.3105687   0.35007389  0.38957908  0.42908427
  0.46858946  0.50809465  0.54759985  0.58710504  0.62661023  0.66611542
  0.70562061  0.74512581  0.784631    0.82413619  0.86364138  0.90314657
  0.94265176  0.98215696  1.02166215  1.06116734  1.10067253  1.14017772
  1.17968292  1.21918811  1.2586933   1.29819849  1.33770368  1.37720887
  1.41671407  1.45621926  1.49572445  1.53522964  1.57473483  1.61424003
  1.65374522  1.69325041  1.7327556   1.77226079  1.81176598  1.85127118
  1.89077637 