### Graph M-Lab download throughput data using Bokeh

Created by John Burt for the allTBD group.


In [129]:
# Suppress all Python warnings for the rest of the session.
# The filter is installed before the remaining imports run, so warnings
# emitted while those modules are imported are hidden as well.
# NOTE(review): this is a blanket 'ignore' -- it also hides deprecation
# warnings from pandas/bokeh; comment it out when debugging.
import warnings
warnings.filterwarnings('ignore')
# ---

#from bokeh.plotting import figure 
#from bokeh.io import output_notebook, show

import bokeh.plotting as bkplt
import bokeh.io as bkio
import bokeh.models as bkmod

import pandas as pd
import numpy as np
import datetime
import calendar

# Widen pandas' notebook display limits so large frames can be inspected.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
# max_colwidth truncates long text cells; set to None (-1 on older pandas)
# to see the entire text.
pd.set_option("display.max_colwidth", 100)

# Render Bokeh plots inline in the notebook instead of a separate HTML file.
bkio.output_notebook()

# Load the pre-formatted M-Lab data frame from CSV (the data was acquired and
# saved by another notebook). The first CSV column is used as the index.
# Alternative input file: "mlab_dtp_data.csv"
datafile = "mlab_dtp_data_mlabnetdb.csv"
df = pd.read_csv(datafile, index_col=0)

# log_time holds POSIX timestamps in seconds; add a datetime version of it.
df["time"] = pd.to_datetime(df["log_time"], unit="s")

# Keep the records in chronological order for the windowed statistics below.
df = df.sort_values(by="time")

### Filter data by New York City area codes

In [130]:
# Telephone area codes selected for the New York City analysis.
areacodes = [
    212, 332, 347, 516, 631, 646,
    718, 845, 914, 917, 929, 934,
]

# Keep only the records whose client area code is in the list above.
ny_df = df.loc[df["client_area_code"].isin(areacodes)]



### Function to generate summary statistics over a specified time window


In [131]:
def stats_by_hour(time, y, windowhours=24):
    """Summarise ``y`` over consecutive fixed-width time windows.

    Windows are ``windowhours`` wide and aligned to the earliest timestamp in
    ``time``. Every sample is assigned to exactly one window, and windows
    that contain no samples produce no output entry.

    Args:
        time: array of numpy ``datetime64`` timestamps, one per sample.
            Expected to be sorted ascending; a timestamp earlier than the
            current window start is counted in the current window.
        y: array of sample values, parallel to ``time``.
        windowhours: window width in whole hours.

    Returns:
        A tuple of five parallel lists with one entry per non-empty window:
        window start (naive ``datetime.datetime``), sample count, mean,
        median and standard deviation (``np.std`` defaults).
        All five lists are empty when there are no samples.
    """
    timeout = []
    y_n = []
    y_mean = []
    y_median = []
    y_std = []

    time = np.asarray(time)
    if len(time) == 0 or len(y) == 0:
        # Nothing to summarise; time.min() would raise on an empty array.
        return timeout, y_n, y_mean, y_median, y_std

    one_hour = np.timedelta64(1, 'h')
    window = np.timedelta64(windowhours, 'h')

    def record(window_start, window_samples):
        # Bokeh needs plain datetime objects. Microsecond resolution is the
        # finest that datetime.datetime can represent, so convert via that.
        timeout.append(window_start.astype('datetime64[us]').item())
        y_n.append(len(window_samples))
        y_mean.append(np.mean(window_samples))
        y_median.append(np.median(window_samples))
        y_std.append(np.std(window_samples))

    starthour = time.min()
    samples = []
    for stamp, value in zip(time, y):
        elapsed = (stamp - starthour) / one_hour
        if elapsed >= windowhours:
            if samples:
                record(starthour, samples)
            # Jump straight to the window that contains this sample, so gaps
            # longer than one window neither drop samples nor mislabel the
            # window start.
            starthour = starthour + int(elapsed // windowhours) * window
            samples = []
        # The sample that opens a new window belongs to that window.
        samples.append(value)

    if samples:
        record(starthour, samples)

    return timeout, y_n, y_mean, y_median, y_std



### Function to plot a throughput graph

In [132]:

def plot_throughput(fig, x, y, y_n, y_std,
                    label='ISP', plot_SE=True,
                    markersize=5, color='b'):
    """Add one throughput line for a single ISP to an existing Bokeh figure.

    Args:
        fig: Bokeh figure to draw on; it is modified in place.
        x: sequence of datetime objects (each must support ``strftime``).
        y: throughput values, parallel to ``x``.
        y_n: per-point sample counts (stored in the data source only).
        y_std: per-point standard deviations (stored in the data source only).
        label: legend entry and the ISP name shown in the hover tooltip.
        plot_SE: accepted for compatibility; not used by this function.
        markersize: accepted for compatibility; not used by this function.
        color: line colour passed to ``fig.line``.

    Returns:
        None.
    """
    # One column per field the line glyph or the hover tooltip can refer to.
    day_labels = [stamp.strftime("%Y-%m-%d") for stamp in x]
    source = bkmod.ColumnDataSource(data={
        'x_values': x,
        'y_values': y,
        'datestr': day_labels,
        'y_n': y_n,
        'y_std': y_std,
        'isp_name': [label] * len(x),
    })

    # Show "Mon YYYY" on the x axis for each of these tick scales.
    tick_scales = ('seconds', 'minutes', 'minsec', 'hours', 'months', 'years')
    tick_formats = {scale: ["%b %Y"] for scale in tick_scales}
    fig.xaxis.formatter = bkmod.DatetimeTickFormatter(**tick_formats)

    fig.line(
        x='x_values',
        y='y_values',
        source=source,
        line_color=color,
        legend=label,
    )

    # The legend is configured after the line so that a legend entry exists.
    fig.legend.location = "top_left"
    fig.xaxis.axis_label = 'Test date'
    fig.yaxis.axis_label = 'Throughput'

    # Only one hover tool per figure: skip if an earlier call already added it.
    if fig.select(type=bkmod.tools.HoverTool):
        return

    hover = bkmod.HoverTool(tooltips=[
        ("date", "@datestr"),
        ("ISP", "@isp_name"),
        ("throughput", "@y_values{0.0} Mbps"),
    ])
    fig.add_tools(hover)



### Create the figure and plot the daily median throughput for each ISP

In [133]:
# ISPs to compare; each name is matched against IP_owner with str.contains.
names = ["Time Warner", "Verizon", "Cablevision"]

figtitle = "Daily median throughput for three ISPs in New York city over 2013"
# Width of each statistics window, in hours (24 = one point per day).
windowhrs = 24

# One line colour per ISP, taken in order.
col = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black']

fig = bkplt.figure(
    plot_width=1000,
    plot_height=500,
    tools="pan,wheel_zoom,box_zoom,box_select,lasso_select,reset",
    title=figtitle,
)

for i, name in enumerate(names):
    # Rows whose IP owner matches this ISP; missing owners count as no match.
    ispdf = ny_df[ny_df.IP_owner.str.contains(name, na=False)]

    # Per-window statistics of download throughput for this ISP.
    x, y_n, y_mean, y_median, y_std = stats_by_hour(
        ispdf.time.values,
        ispdf.download_Mbps.values,
        windowhrs,
    )

    # Plot the median series as one line in this ISP's colour.
    plot_throughput(
        fig, x, y_median, y_n, y_std,
        label=name,
        plot_SE=False,
        markersize=5,
        color=col[i],
    )

bkio.show(fig)
