In [2]:
from __future__ import division
%pylab inline
import pandas as pd
import numpy as np
import subprocess
import os, sys, re
from collections import defaultdict
#import matplotlib
#matplotlib.use('Agg')
#%matplotlib nbagg
#import matplotlib.pyplot as plt

Populating the interactive namespace from numpy and matplotlib


# Challenge

Write a Python program (OSX or Linux preferred) that queries the top 500 Alexa domains/sites:
- http://www.alexa.com/topsites
- https://www.dropbox.com/s/pqsimknj77ywqbn/top-1m.csv.tar.gz?dl=0

### CDN analysis

- Filter the domains served by a CDN (e.g., Akamai)
- Rank CDN providers by number of sites
- Calculate the average response time for the index page of each site (aka Time To First Byte) per CDN provider, and rank the providers by speed (ideally reporting DNS resolution, TCP connection, SSL negotiation and receive time separately)

### BGP analysis

- Determine the ASN that each of the 500 sites maps to (based on hosting IP address)
- Rank ASNs by number of sites

In [3]:
def getCDF(data):
    """Return the sorted samples together with their empirical CDF heights.

    The k-th smallest sample (0-based) is paired with k / n, where n is the
    number of samples, so the heights start at 0 and stop just below 1.

    Returns a tuple (xdata, ydata): xdata is the NumPy-sorted copy of `data`
    and ydata is a plain list of floats of the same length.
    """
    xdata = np.sort(data)
    total = len(xdata)
    ydata = []
    for position in range(total):
        ydata.append(position / total)
    return xdata, ydata

# CDN Analysis

## Load Data
Load the top 500 Alexa websites from a CSV file.

In [4]:
# Read only the first 500 rows of the headerless ranking file and label the
# two columns 'rank' and 'url'.
# Alternative input file (disabled):
#sites = pd.read_csv('top-1m.csv', nrows=500, header = None, names = ['rank', 'url'])
sites = pd.read_csv('top-1m-new.csv', nrows=500, header = None, names = ['rank', 'url'])

## Get CDN for website
- using whois on IP address (via nslookup for domain)
- extract fields from DNS headers
- checking IP from multiple server locations
- downloading all objects on the website homepage

In [5]:
# Pick one sample domain (index label 4 of the 'url' column) to try the lookups on.
site = sites['url'][4]
site

'wikipedia.org'

### get IP address using host -t a <site\>

In [6]:
def getIP(site):
    """Resolve `site` to an IPv4 address string using `host -t a <site>`.

    Parameters
    ----------
    site : str
        Domain name to look up (the netloc part of a URL).

    Returns
    -------
    str
        The address from the first "has address" line of the `host` output,
        or '' when the lookup times out, fails, or reports no A record.
    """
    try:
        # 5.0 s timeout: sites that are firewalled can leave `host` hanging.
        out = subprocess.check_output(
            ['host', '-t', 'a', site], stderr=subprocess.STDOUT, timeout=5.0
        ).decode('UTF-8', errors='replace')
    except subprocess.TimeoutExpired:
        print('host '+site+' process ran too long')
        return ''
    except (subprocess.CalledProcessError, OSError, ValueError):
        # Non-zero exit status or `host` could not be started: this is a
        # failed lookup, not a timeout, so report it as such.
        print('host '+site+' lookup failed')
        return ''

    # The first output line is not necessarily an address: for a CNAME the
    # output starts with "<site> is an alias for <target>." and the address
    # lines follow. Scan for the first real address line instead.
    for line in out.splitlines():
        if ' has address ' in line:
            return line.rsplit(' ', 1)[-1].strip()

    print('host '+site+' returned no A record')
    return ''

In [7]:
getIP('google.com')

'172.217.167.46'

In [8]:
# Resolve every domain in turn. Each call shells out to `host` with a 5 s
# timeout, so this cell is slow; failed lookups are reported on stdout.
sites['IP_addr'] = sites['url'].apply(getIP)

host pornhub.com process ran too long
host ok.ru process ran too long
host livejasmin.com process ran too long
host xvideos.com process ran too long
host xhamster.com process ran too long
host xnxx.com process ran too long
host chaturbate.com process ran too long
host yts.am process ran too long
host youporn.com process ran too long
host 1337x.to process ran too long
host redtube.com process ran too long
host rutracker.org process ran too long
host sex.com process ran too long


### ensure IP is valid 

In [9]:
import ipaddress

def validateIP(site_IP):
    """Return True when `site_IP` parses as an IPv4 or IPv6 address.

    Parameters
    ----------
    site_IP : str
        Candidate address, e.g. the value produced by the `host` lookup.
        An empty string (a failed lookup) is not a valid address.

    Returns
    -------
    bool
        True if `ipaddress.ip_address` accepts the value, False otherwise.
    """
    try:
        ipaddress.ip_address(site_IP)
    except ValueError:
        # ip_address signals an unparseable value with ValueError; catching
        # only that keeps KeyboardInterrupt and real bugs visible.
        return False
    return True

In [10]:
# Flag the rows whose lookup result parses as an IP address; failed lookups
# (stored as '') and any non-address token come out as False.
sites['isValidIP'] = sites['IP_addr'].apply(validateIP)

### find organization and email servers using whois <IPADDR\>

In [11]:
def findOrg(site_IP):
    """Run `whois <site_IP>` and extract organization and e-mail domain fields.

    Parameters
    ----------
    site_IP : str
        IP address to look up.

    Returns
    -------
    dict
        {'Org': [...], 'OrgName': [...], 'EmailServer': [...]} where each
        list holds the distinct values in the order they first appear in the
        whois output. All three lists are empty when the whois call fails or
        times out (a message is printed in that case).
    """
    Org = []
    OrgName = []
    EmailServer = []
    # Registry names: organization lines mentioning one of these are skipped.
    exclude_orgs = ['AFRINIC','APNIC','ARIN','LACNIC','IANA']

    try:
        # errors='replace': whois records are not always valid UTF-8, and a
        # decode failure would otherwise discard the whole record.
        out2 = subprocess.check_output(
            ['whois', site_IP], stderr=subprocess.STDOUT, timeout=5.0
        ).decode('UTF-8', errors='replace')
    except (subprocess.SubprocessError, OSError, ValueError):
        print('whois '+site_IP+' process error/ran too long')
        out2 = ''

    for line in out2.splitlines():
        lower_line = line.lower()
        # Everything after the first ':' is the value, so values that contain
        # a ':' themselves are kept whole.
        value = line.partition(':')[2].strip()

        if 'organisation:' in lower_line or 'org:' in lower_line or 'organization:' in lower_line:
            if value and not any(excl_org in line for excl_org in exclude_orgs):
                if value not in Org:
                    Org.append(value)

        if 'org-name:' in lower_line or 'orgname:' in lower_line:
            if value and value not in OrgName:
                OrgName.append(value)

        if 'email:' in lower_line or 'e-mail:' in lower_line:
            # A matching line without '@' yields an empty domain and is
            # skipped, rather than aborting the parse of the remaining lines.
            domain = line.partition('@')[2].strip()
            if domain and domain not in EmailServer:
                EmailServer.append(domain)

    output = {'Org': Org,
            'OrgName': OrgName,
            'EmailServer': EmailServer }

    return output

In [12]:
findOrg('8.8.8.8')

{'Org': ['Google LLC (GOGL)', 'Level 3 Parent, LLC (LPL-141)'],
 'OrgName': ['Google LLC', 'Level 3 Parent, LLC'],
 'EmailServer': ['google.com', 'level3.com']}

### separate valid IP websites and find organizations for these

In [17]:
# Keep only the rows whose lookup produced a valid IP address.
sites_valid = sites.loc[sites['isValidIP']]

In [18]:
# Run the whois lookup for each valid IP. Wrapping the returned dict in a
# pd.Series makes apply() spread its keys (Org, OrgName, EmailServer) into
# separate columns, indexed like `sites_valid`.
sites_org = sites_valid['IP_addr'].apply(lambda s: pd.Series( findOrg(s) ) )

whois 186.192.90.5 process error/ran too long
whois 212.129.33.171 process error/ran too long
whois 82.165.229.87 process error/ran too long


In [19]:
# `sites_org` was derived row-by-row from `sites_valid`, so the two frames
# share an index; align them on it to attach the whois fields to each site.
sites_cdn = sites_valid.merge(sites_org, left_index=True, right_index=True )

In [21]:
#sites_cdn

### check organization, orgname, email against list of well known CDN names

In [32]:
# cdn names from wikipedia, www.cdnlist.com
# The file holds one CDN name per line. Read it directly instead of using
# pd.read_csv(..., sep='\n'): pandas >= 1.3 raises ValueError when the line
# terminator is given as the separator. Blank lines are skipped and the
# distinct names are returned sorted.
with open('CDNnames.csv', encoding='utf-8') as cdn_file:
    CDN_names = sorted({name.strip() for name in cdn_file if name.strip()})

## Rank number of websites per CDN
- sort dataframe
- bar chart

## Request Index page
- time to first byte
- time to DNS resolution
- time to TCP connection
- time to SSL negotiation
- receive time