### Generate SQL query string, then read Google BigQuery to access M-Lab data and get ISP name using mlabnetdb

Created by John Burt, for allTBD group.

This notebook is a demo showing how to generate a query based on Kinga's R based query script to access M-Lab data. 



In [42]:
import pandas as pd
import numpy as np
import datetime
import calendar

# This function takes as input mlab_location, AS number, 
# start_time, end_time and the optional country (the default 
# country is set to US)
# The choices for the metric are: "dtp", "rtt", and "prt" for download 
# throughput, round trip time and packet retransmission respectively
# The start_time, end_time info should be entered in the 'yyyy-mm-dd' format
# The output of the function, when successful, is a string

# input parameters:
#   area_codes = list of 3 digit client area codes to search
#   metrics = list of metric types to collect ("dtp", "rtt", "prt"). Each record
#             must contain a valid value for the metrics passed, so entering multiple
#             metrics may reduce the number of records acquired.
#   start_time = earliest date for acquired data, format 'yyyy-mm-dd' 
#   end_time = latest date for acquired data, format 'yyyy-mm-dd' 
#   country = country to search (only tested with 'US')

# output:
#   query string to pass to google big query
        

def query_writer_by_area_code(area_codes, metrics, start_time, end_time, country = 'US' ): 
      
    #DEFINING THE BASIC QUERIES FOR EACH METRIC

    #The basic query for download throughput
    basic = ("#standardSQL"
        "\nSELECT "
        "\n  web100_log_entry.log_time AS log_time, "
        "\n  connection_spec.client_geolocation.city  AS client_city,  "
        "\n  connection_spec.client_geolocation.area_code AS client_area_code,  "
        "\n  web100_log_entry.connection_spec.remote_ip AS client_ip, "
        "\n  web100_log_entry.connection_spec.local_ip AS MLab_ip, "
        "\n  connection_spec.client_geolocation.latitude AS client_latitude, "
        "\n  connection_spec.client_geolocation.longitude AS client_longitude, "
        "\n  connection_spec.client_geolocation.postal_code AS client_postal_code, "
        "\n  connection_spec.server_geolocation.latitude AS MLab_latitude, "
        "\n  connection_spec.server_geolocation.longitude AS MLab_longitude, "
        "\n  connection_spec.server_geolocation.postal_code AS MLab_postal_code, "
              )
    
    metrics_str1 = []
    metrics_str2 = []
    
    if type(metrics) == str:
        metrics = [metrics]
        
    for metric in metrics:
        if "dtp" in metric:
            metrics_str1.append("\n  8 * (web100_log_entry.snap.HCThruOctetsAcked / "
                "\n      (web100_log_entry.snap.SndLimTimeRwin + "
                "\n       web100_log_entry.snap.SndLimTimeCwnd + "
                "\n       web100_log_entry.snap.SndLimTimeSnd)) AS download_Mbps")
            metrics_str2.append("\n  AND web100_log_entry.snap.CongSignals > 0")
            
        if "rtt" in metric:
            metrics_str1.append("\n  web100_log_entry.snap.MinRTT AS min_rtt" )
            metrics_str2.append("\n  AND web100_log_entry.snap.CountRTT > 10")
            
        if "prt" in metric:
            metrics_str1.append("\n  (web100_log_entry.snap.SegsRetrans / web100_log_entry.snap.DataSegsOut) AS packet_retransmission_rate")
            metrics_str2.append("\n  AND web100_log_entry.snap.DataSegsOut > 0")
    
    for s in metrics_str1[:-1]:
        basic += s + ","
    basic += metrics_str1[-1]
    
    basic += ("\nFROM "
        "\n  `measurement-lab.release.ndt_all` "
        "\nWHERE "
        "\n  connection_spec.data_direction = 1 ")    

    for s in metrics_str2:
        basic += s
    
    # area code condition area_codes
    ac_var = "connection_spec.client_geolocation.area_code"
    ac_cond = "\n  AND ("
    for ac in area_codes[:-1]:
        ac_cond += ac_var + "=" + ac + " OR "
    ac_cond += ac_var + "=" + str(area_codes[-1:][0]) + ")"
    #print(ac_cond)
    
    #WRITING THE TIME CONDITION
    tstamp_var = "partition_date "
    tframe_cond = ('\n  AND ' + tstamp_var + '> "' + start_time +
        '"\n  AND ' + tstamp_var + '< "' + end_time + '"')
    #print(tframe_cond)

    #WRITING THE COUNTRY CONDITION
    country_string = "'" + country + "'" 
    country_var = "connection_spec.client_geolocation.country_code"
    country_cond = "\n  AND " + country_var + "=" + country_string
    #print(country_cond)

    #WRITING THE QUERY
    the_query = basic + ac_cond + tframe_cond
    #with open("querypy.txt", "w") as text_file:
    #    text_file.write(the_query)

    return the_query

In [43]:
# # test the query_writer output:
# acs = ['201', '202', '203', '204', '205', '206']
# metrics = ["dtp", "rtt", "prt"]
# the_query = query_writer_by_area_code(acs, metrics, "2010-01-01","2019-01-01")
# print(the_query)

# metrics = ["rtt", "prt"]
# the_query = query_writer_by_area_code(acs, metrics, "2010-01-01","2019-01-01")
# print(the_query)

# metrics = ["prt"]
# the_query = query_writer_by_area_code(acs, metrics, "2010-01-01","2019-01-01")
# print(the_query)



In [44]:
# function to acquire m-lab data into a pandas dataframe,   
#  then acquire and add ISP name and ASN as df columns

from pandas.io import gbq
import importlib
# add alltbd path so we can import mlabnetdb from there
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import mlabnetdb
#importlib.reload(mlabnetdb) 
from mlabnetdb import *

def acquire_mlab_data_by_area_code(project_id, areacodes, metrics, start_time, end_time, country = 'US'):

    df = []
    
    # generate the query
    print("  getting mlab data from GBQ.....")
    querystring = query_writer_by_area_code(areacodes, metrics, start_time, end_time, country)
#     print(querystring)
    
    # read the query output into a pandas dataframe
    #   NOTE: the first time this runs, you will be prompted for an authorization key. 
    #    Click on the link provided, get the key string, paste it in, and go.
    df = gbq.read_gbq(querystring, project_id=project_id, verbose=True, dialect='standard')

    # use mlabnetdb to get ISP names
    print("  getting ISP names.....")
    access_owner = []
    access_ispname = []
    access_asn = []
    access_iperrors = []
    transit_owner = []
    transit_ispname = []
    transit_asn = []
    transit_iperrors = []
    for access_ip, transit_ip in zip(df.client_ip, df.MLab_ip):
        try:
            access_ipinfo = lookup(access_ip, date=None)
            transit_ipinfo = lookup(transit_ip, date=None)
        except:
            print("    error: for ip %s, lookup error"%(access_ip,transit_ip))
            access_ipinfo = []
            transit_ipinfo = []
            
        if access_ipinfo:
            access_owner.append(access_ipinfo['autonomous_system_organization'])
            access_asn.append(access_ipinfo['autonomous_system_number'])
            access_ispname.append(access_ipinfo['isp'])

            transit_owner.append(transit_ipinfo['autonomous_system_organization'])
            transit_asn.append(transit_ipinfo['autonomous_system_number'])
            transit_ispname.append(transit_ipinfo['isp'])
        else:
#             print("    error: for ip %s, ipinfo==None"%(ip))
            access_owner.append('')
            access_asn.append(0)
            access_ispname.append('')
            access_iperrors.append(access_ip)
            
            transit_owner.append('')
            transit_asn.append(0)
            transit_ispname.append('')
            transit_iperrors.append(transit_ip)
            
    print("    DONE getting ISP names")
    
    if len(access_iperrors) > 0:
        print('      Could not find ISP info for ',str(len(access_iperrors)),
              ' IPs:', ', '.join(str(x) for x in np.unique(access_iperrors)))

    # add mlab search info to dataframe
    df["area_codes"] = [','.join(areacodes)] * df.shape[0]

    # add IP_owner and IP_ASN columns to the dataframe
    df["access_IP_owner"] = access_owner
    df["access_IP_ASN"] = access_asn
    # get company name from owner string
    df["access_ISP_name"] = access_ispname
    
    df["transit_IP_owner"] = transit_owner
    df["transit_IP_ASN"] = transit_asn
    # get company name from owner string
    df["transit_ISP_name"] = transit_ispname
    
    return df

Function to pickle the data

In [45]:
import pickle

def pickle_data(data, outputfilename):
    # save data
    with open(outputfilename, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("m-lab data saved to ",outputfilename)    

read and fix up area code data

In [46]:
# get a list of area codes for a given city and state
areacodelocs = pd.read_csv('areacode_latitude_longitude.csv', index_col=None, encoding='latin_1')

# truncate area code pairs to just first 3 digits
areacodelocs.areacode = list([s[:3] for s in areacodelocs.areacode])

# drop rows with NaNs in important fields
areacodelocs = areacodelocs.drop(areacodelocs[areacodelocs.city.isnull()].index)
areacodelocs = areacodelocs.drop(areacodelocs[areacodelocs.region.isnull()].index)


Entry point:

acquire mlab data or a list of cities

In [47]:
%%time
import numpy as np

# select a state to collect mlab data for, or all states
state = '' # all states
# state = 'NY'

# set date range for search filter
daterange = ["2014-01-01","2015-01-01"]

# select metrics to collect
metrics = ["dtp", "rtt", "prt"]

outputfilename = "mlab_data_mlabnetdb_2014-2015_all_states.pkl"

# this is my project ID, you will probably use a different one
project_id = 'mlab-194421'

local_acs = np.unique(areacodelocs.areacode[areacodelocs.region.str.contains(state)])
df = []
if type(df) != pd.core.frame.DataFrame:
    df = acquire_mlab_data_by_area_code(project_id, local_acs, metrics, daterange[0], daterange[1])
else:
    df = pd.concat([df, acquire_mlab_data_by_area_code(project_id, local_acs, metrics, daterange[0], daterange[1])])
print("  got %d rows"%(df.shape[0]))

print("saving data for ",state," df.shape=",df.shape)
pickle_data(df, outputfilename)


  getting mlab data from GBQ.....
Requesting query... ok.
Job ID: b8f0f740-e1cc-4e8c-8a9a-021d89846593
Query running...
  Elapsed 7.78 s. Waiting...
  Elapsed 9.01 s. Waiting...
  Elapsed 10.24 s. Waiting...
  Elapsed 11.47 s. Waiting...
  Elapsed 12.7 s. Waiting...
  Elapsed 13.92 s. Waiting...
  Elapsed 15.15 s. Waiting...
  Elapsed 16.38 s. Waiting...
  Elapsed 17.61 s. Waiting...
  Elapsed 18.84 s. Waiting...
  Elapsed 20.07 s. Waiting...
  Elapsed 21.29 s. Waiting...
  Elapsed 22.52 s. Waiting...
  Elapsed 23.75 s. Waiting...
  Elapsed 24.98 s. Waiting...
  Elapsed 26.21 s. Waiting...
  Elapsed 27.44 s. Waiting...
  Elapsed 28.67 s. Waiting...
  Elapsed 29.9 s. Waiting...
  Elapsed 31.23 s. Waiting...
  Elapsed 32.66 s. Waiting...
  Elapsed 33.89 s. Waiting...
  Elapsed 35.12 s. Waiting...
  Elapsed 36.35 s. Waiting...
  Elapsed 37.58 s. Waiting...
Query done.
Processed: 14.6 GB Billed: 14.6 GB
Standard price: $0.07 USD

Retrieving results...
Got 2324615 rows.

Total time taken 56

In [48]:
# df.head()