In [12]:
from __future__ import division
%pylab inline
import pandas as pd
import numpy as np
import subprocess
import os, sys, re
from collections import defaultdict
#import matplotlib
#matplotlib.use('Agg')
#%matplotlib nbagg
#import matplotlib.pyplot as plt

Populating the interactive namespace from numpy and matplotlib


# Challenge

Write a Python program (OSX or Linux preferred) that queries the top 500 Alexa domains/sites:
- http://www.alexa.com/topsites
- https://www.dropbox.com/s/pqsimknj77ywqbn/top-1m.csv.tar.gz?dl=0

### CDN analysis

- Filter the domains served by a CDN (e.g., Akamai)
- Rank CDN providers by number of sites
- Calculate the average response time for the index page of each site (aka Time To First Byte) per CDN provider, and rank the providers by speed (ideally reporting DNS resolution, TCP connection, SSL negotiation and receive time separately).

### BGP analysis

- Determine the ASN that each of the 500 sites maps to (based on hosting IP address)
- Rank ASNs by number of sites

In [2]:
def getCDF(data):
    """Return the empirical CDF of ``data`` as ``(sorted values, fractions)``.

    The values are sorted ascending with ``np.sort``. The k-th sorted value
    (0-based) is paired with the fraction ``k / n``, where ``n`` is the number
    of values, so the fractions run from 0 up to ``(n - 1) / n``.
    """
    ordered = np.sort(data)
    total = len(ordered)
    fractions = []
    for position in range(total):
        fractions.append(position / total)
    return ordered, fractions

# CDN Analysis

## Load Data
Load the top 500 Alexa websites from the CSV file.

In [139]:
# Read only the first 500 rows of the header-less ranking file and label
# its two columns 'rank' and 'url'.
sites = pd.read_csv(
    'top-1m.csv',
    names=['rank', 'url'],
    header=None,
    nrows=500,
)

## Get CDN for website
- using whois on IP address (via nslookup for domain)
- extract fields from DNS headers
- checking IP from multiple server locations
- downloading all objects on the website homepage

In [165]:
# Pick a single example domain (row label 4) to prototype the lookups on.
site = sites.loc[4, 'url']
site

'yahoo.com'

In [166]:
# Resolve the site's domain to an IPv4 address with the `host` utility.
# The command is passed as an argument list (no shell), so a malformed domain
# string read from the CSV cannot be interpreted as shell syntax.
host_result = subprocess.run(['host', site],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             universal_newlines=True)
out = host_result.stdout.rstrip('\n')

# Use the first "has address" record instead of the last token of the first
# line: the first line may be an alias, an IPv6/mail record or an error
# message, in which case its last token is not an IP address.
ip_match = re.search(r'has address (\d{1,3}(?:\.\d{1,3}){3})', out)
if ip_match is None:
    # Fail loudly rather than hand a non-address on to the whois lookup.
    raise ValueError('no IPv4 address found for %s: %s' % (site, out))
site_IP = ip_match.group(1)

In [167]:
# Query whois for the site's IP address; the Organization / OrgName fields in
# the response correspond to CDN names when they differ from the website.
# The command is passed as an argument list (no shell) so that whatever was
# parsed out of the DNS response is never interpreted as shell syntax.
whois_result = subprocess.run(['whois', site_IP],
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT,
                              universal_newlines=True)
out2 = whois_result.stdout.rstrip('\n')

In [168]:
def findOrg(out2):
    """Extract organisation details from raw whois output.

    Parameters
    ----------
    out2 : str
        Text of a whois response; it is scanned line by line.

    Returns
    -------
    tuple of (list, list, list)
        Three de-duplicated, sorted lists of strings:

        1. values of lines containing 'organisation:', 'organization:' or
           'org:' (case-insensitive), skipping lines that mention one of
           the registries in ``exclude_orgs``;
        2. values of lines containing 'org-name:' or 'orgname:';
        3. the domain part (text after the first '@') of lines containing
           'email:' or 'e-mail:'.
    """
    exclude_orgs = ['AFRINIC','APNIC','ARIN','LACNIC','IANA']

    orgs = set()
    org_names = set()
    email_domains = set()

    for line in out2.split('\n'):
        lower_line = line.lower()
        # Split on the first colon only, so values that themselves contain
        # a colon are kept whole.
        value = line.partition(':')[2].strip()

        if 'organisation:' in lower_line or 'org:' in lower_line or 'organization:' in lower_line:
            # The registry check is case-sensitive and looks at the whole line.
            if value and not any(excl_org in line for excl_org in exclude_orgs):
                orgs.add(value)

        if 'org-name:' in lower_line or 'orgname:' in lower_line:
            if value:
                org_names.add(value)

        if 'email:' in lower_line or 'e-mail:' in lower_line:
            # An e-mail line without an address (no '@') is skipped instead
            # of raising IndexError.
            pieces = line.split('@')
            if len(pieces) > 1:
                domain = pieces[1].strip()
                if domain:
                    email_domains.add(domain)

    # Sorted so the result is reproducible from run to run.
    return sorted(orgs), sorted(org_names), sorted(email_domains)

In [169]:
# Extract the organisation values, org-name values and e-mail domains from the whois text.
findOrg(out2)

(['Yahoo! Inc. (YHOO)'],
 ['Yahoo! Inc.'],
 ['cc.yahoo-inc.com', 'yahoo-inc.com'])

## Rank number of websites per CDN
- sort dataframe
- bar chart

## Request Index page
- time to first byte
- time to DNS resolution
- time to TCP connection
- time to SSL negotiation
- receive time