In [43]:
from __future__ import division
%pylab inline
import pandas as pd
import os
import json
from collections import defaultdict, Counter

Populating the interactive namespace from numpy and matplotlib


# DATA
- Get host IP and ASN for the Alexa top 500 websites: use the socket and pyasn (offline routeviews) libraries
- Find CDN of each website: use list of common CDNs and their related domains
    - Compare site to well known CDN domains (such as googleusercontent.com, cloudfront.net, etc.)
    - Parse homepage 
        - Parsing static objects on homepage to find number of resources per well known CDN domain
    - Parse whois
        - 'Organization' field might have the CDN name
        - site contact email servers used might be related to certain CDN domains
- Use curl to get timings for accessing sites: save response time for at least 20 requests per site
    - -v and --trace-time flags allow us to read timing information in real time
    - curl -w flag allows us to write time since the request was issued for name lookup, connection, SSL negotiation, and data reception
    - Calculate times:
        - t_dns = time for DNS resolution (no redirects) = time_namelookup - time_redirect  
        - t_tcp = time for TCP connection (SYN/SYNACK) = time_connect - time_namelookup  
        - t_ssl = time for SSL handshake (only if https) = time_appconnect - time_connect  
        - t_fbyte = time_starttransfer
        - t_wait = time between issuing GET request and first byte received = time_starttransfer - time_pretransfer  
        - t_rx = time to receive data from first to last byte = time_total - time_starttransfer

# Analysis
- 

In [3]:
# Read only the first 500 rows; the CSV has no header row, so the column names are supplied here.
df_sites = pd.read_csv('top-1m-new.csv', nrows=500, header = None, names = ['rank', 'site'])

## 1. Get IP and ASNs
- Get IP using socket.gethostbyname method
- Get ASN using pyasn (can also be done using whois, downloading routeviews data, etc.)

In [6]:
import socket

def getIP(s, blocked_ips=('49.207.46.6', '49.207.46.24', '49.207.46.34')):
    """Resolve hostname ``s`` to an IPv4 address string.

    Parameters
    ----------
    s : str
        Hostname to resolve.
    blocked_ips : iterable of str, optional
        Addresses that mark a site as blocked. The defaults are the
        addresses the ISP returned for blocked domains.

    Returns
    -------
    str or bool
        The resolved address, or ``False`` when resolution fails or the
        resolved address is one of ``blocked_ips``.
    """
    try:
        IP = socket.gethostbyname(s)
    except (OSError, UnicodeError):
        # OSError covers socket.gaierror/socket.herror (resolution failures);
        # UnicodeError is raised for names that cannot be IDNA-encoded.
        # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        print("Error accessing site "+s)
        return False

    if IP in blocked_ips:
        print("Blocked site "+s)
        return False
    return IP

In [5]:
# Resolve every site; getIP returns False for blocked or unresolvable hosts.
df_sites['IP'] = df_sites['site'].apply(getIP)

Blocked site pornhub.com
Blocked site ok.ru
Blocked site livejasmin.com
Blocked site xvideos.com
Error accessing site googleusercontent.com
Blocked site xhamster.com
Error accessing site exosrv.com
Blocked site xnxx.com
Blocked site chaturbate.com
Blocked site yts.am
Blocked site youporn.com
Blocked site 1337x.to
Error accessing site cloudfront.net
Blocked site redtube.com
Blocked site rutracker.org
Error accessing site banvenez.com
Error accessing site bp.blogspot.com
Error accessing site exdynsrv.com
Blocked site sex.com
Error accessing site wixsite.com


In [7]:
import pyasn

# Offline IP-to-ASN database loaded from a local data file.
asndb = pyasn.pyasn('output/ipasn_20181212.dat')  # downloaded with pyasn_util_download.py --latest
# Sanity check on a well-known address; the recorded output is an (ASN, prefix) tuple.
print(asndb.lookup('8.8.8.8'))

def findASN(ip):
    """Return the ASN that ``asndb`` reports for ``ip``.

    A falsy ``ip`` (for example the ``False`` sentinel used for
    unresolved sites) yields ``False`` without querying the database.
    """
    if not ip:
        return False
    # lookup() returns a tuple whose first element is the ASN.
    return asndb.lookup(ip)[0]

(15169, '8.8.8.0/24')


In [8]:
# Map each IP to its ASN; findASN returns False for rows whose IP is falsy.
df_sites['ASN'] = df_sites['IP'].apply(findASN)

In [10]:
# Preview the first rows to check the new IP and ASN columns.
df_sites.head()

Unnamed: 0,rank,site,IP,ASN
0,1,google.com,172.217.167.46,15169
1,2,youtube.com,172.217.166.238,15169
2,3,facebook.com,157.240.13.35,32934
3,4,baidu.com,123.125.115.110,4808
4,5,wikipedia.org,103.102.166.224,14907


### Separate unblocked IPs and sites for analysis
- 20 of 500 sites are unreachable or blocked by the ISP
- Querying these sites and urls returns an IP at the edge of the ISP we're connected to
- Most of these sites are porn related or known for adware/malware

In [41]:
# Rows whose lookup failed carry the sentinel False in the IP column.
# The comparison must stay element-wise (`==`), not `is`.
is_unresolved = df_sites['IP'] == False
df_valid = df_sites[~is_unresolved]

print("Number of valid sites for further analysis: %s\n" % len(df_valid))

print("List of blocked sites: %s" % df_sites.loc[is_unresolved, 'site'].tolist())

Number of valid sites for further analysis: 480

List of blocked sites: ['pornhub.com', 'ok.ru', 'livejasmin.com', 'xvideos.com', 'googleusercontent.com', 'xhamster.com', 'exosrv.com', 'xnxx.com', 'chaturbate.com', 'yts.am', 'youporn.com', '1337x.to', 'cloudfront.net', 'redtube.com', 'rutracker.org', 'banvenez.com', 'bp.blogspot.com', 'exdynsrv.com', 'sex.com', 'wixsite.com']


In [55]:
# save site_to_IP dictionary for valid sites

site_to_IP = df_valid.set_index('site')['IP'].to_dict()
# Use a context manager so the file is flushed and closed even if
# serialisation fails; the original left the handle open.
with open('output/site_to_IP.json', 'w') as f:
    json.dump(site_to_IP, f)

## 2. Find CDN
- CDNnames
- CDNdomains

In [24]:
import CDNdomains

# Load the CDN names file; each line of the file becomes one list entry.
with open('CDNnames.csv', 'r') as f:
    CDNnames = f.read().splitlines()

### page parse data
- saved in output/homepage/
    - use requests to download website and save it
- count number of objects per url
    - load downloaded page and use BeautifulSoup to parse it
    - count sources and links for a, link, img, script tags on page
    - static objects like images and scripts are usually on website host IP

In [60]:
from bs4 import BeautifulSoup
import re
import urllib
 
def getnetloc(loc):
    """Return the network location (``host[:port]``) of URL ``loc``.

    Parameters
    ----------
    loc : str or None
        Value of a ``src``/``href`` attribute.

    Returns
    -------
    str or None
        ``None`` when ``loc`` is missing, empty, or cannot be parsed;
        ``""`` for a relative (local) URL; otherwise the netloc.
    """
    # `import urllib` alone does not guarantee the `parse` submodule is loaded.
    import urllib.parse

    # Truthiness check replaces `loc is ""`, which compared identity with a
    # literal and only worked through string interning.
    if not loc:
        return None
    try:
        return urllib.parse.urlparse(loc).netloc
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): treat it like a
        # missing link so one bad tag does not abort the whole page.
        return None

def count_netlocs(data, parser="html.parser"):
    """Count how often each network location is referenced on a page.

    Parameters
    ----------
    data : str
        HTML source of the page.
    parser : str, optional
        Parser name handed to BeautifulSoup. It is named explicitly so the
        result does not depend on which optional parsers are installed.

    Returns
    -------
    collections.Counter
        Maps each value returned by ``getnetloc`` to its number of
        occurrences. The key ``None`` collects tags without a usable
        src/href; callers are expected to drop it.
    """
    soup = BeautifulSoup(data, parser)

    # (tag, attribute) pairs, scanned in this order. For <img> only `src`
    # is read, so an image with a `srcset` is still counted once.
    # <link href> and <meta content> are not counted (that variant was
    # disabled in the original code).
    sources = (('script', 'src'), ('img', 'src'), ('a', 'href'))

    netlocs_static = [
        getnetloc(tag.get(attr))
        for tag_name, attr in sources
        for tag in soup.find_all(tag_name)
    ]
    return Counter(netlocs_static)


In [None]:
import os

# `sites` was never defined in this notebook (NameError on a fresh kernel).
# df_valid holds the sites that resolved, which are the ones that can have
# a saved homepage.
sites_list = list(df_valid['site'])
counter = {}    # site -> [(netloc, count), ...] sorted by descending count
bad_site = []   # sites with no saved homepage under output/homepage/

for site in sites_list:

    file = 'output/homepage/'+site
    if not os.path.exists(file):
        bad_site.append(site)
        continue

    with open(file, 'r') as f:
        data = f.read()

    cnt = count_netlocs( data )
    del cnt[None]  # remove empty src NOT local src; Counter ignores a missing key
    counter[site] = cnt.most_common()  # sort Counter

### whois data
- saved in output/whoissite/ and output/whoisIP/
- sometimes the Organization field contains the name of the registry or administrator; these entries should be ignored

In [None]:
import os

def _read_whois(directory, site):
    """Return the saved whois text for ``site`` inside ``directory``, or None if the file is absent."""
    file = directory + site
    if not os.path.exists(file):
        print("Please perform whois for site %s" %site)
        return None
    # Context manager closes the handle even if read() raises.
    with open(file, 'r') as f:
        return f.read()


def loadwhoisIP(site):
    """Load the saved whois-by-IP output for ``site``.

    Returns the file content as a string, or ``None`` (after printing a
    reminder) when ``output/whoisIP/<site>`` does not exist.
    """
    return _read_whois('output/whoisIP/', site)


def loadwhoissite(site):
    """Load the saved whois-by-domain output for ``site``.

    Returns the file content as a string, or ``None`` (after printing a
    reminder) when ``output/whoissite/<site>`` does not exist.
    """
    return _read_whois('output/whoissite/', site)

In [None]:
# search orgs for both whois IP and whois site
# search email only for whois IP

def searchOrg(whoisdata):
    """Extract organisation names from raw whois text.

    A line is considered when it contains one of the organisation
    keywords (case-insensitive) and does not mention a registry or
    administrator from the exclusion list.

    Parameters
    ----------
    whoisdata : str or None
        Raw whois output; ``None`` yields an empty list.

    Returns
    -------
    list of str
        Distinct, non-empty organisation values in order of first appearance.
    """
    import re

    if whoisdata is None:
        return []

    OrgKeywords = ('organisation:', 'org:', 'organization:', 'org-name:', 'orgname:')
    exclude_orgs = ['AFRINIC', 'APNIC', 'ARIN', 'LACNIC', 'IANA',
                    'Asia Pacific Network Information Centre', 'Administered by RIPE NCC', 'RIPE NCC',
                    'Registration Association', 'VeriSign Global Registry Services']
    # Match excluded names as whole words: plain substring tests dropped
    # legitimate organisations such as "Indiana University" ("iana").
    exclude_re = re.compile(
        r'\b(?:' + '|'.join(re.escape(e.lower()) for e in exclude_orgs) + r')\b')

    Org = []
    for line in whoisdata.split('\n'):
        lower_line = line.lower()
        if not any(keyword in lower_line for keyword in OrgKeywords):
            continue
        if exclude_re.search(lower_line):
            continue
        # Split on the first colon only so values that contain a colon
        # are kept intact.
        org = line.split(':', 1)[1].strip()
        # Skip empty values (e.g. a bare "Org:" line) and duplicates.
        if org and org not in Org:
            Org.append(org)
    return Org

def searchEmail(whoisdata):
    """Extract the domains of e-mail addresses found in raw whois text.

    Parameters
    ----------
    whoisdata : str or None
        Raw whois output; ``None`` yields an empty list.

    Returns
    -------
    list of str
        Distinct domain parts (the text after ``@``) in order of first
        appearance.
    """
    import re

    if whoisdata is None:
        return []

    # Only text that actually contains "user@domain" can yield a domain.
    # The original also matched label-only lines such as
    # "Email: REDACTED" and then raised IndexError on split('@')[1].
    domain_re = re.compile(r'@([A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*)')

    Email = []
    for line in whoisdata.split('\n'):
        for email in domain_re.findall(line):
            if email not in Email:
                Email.append(email)
    return Email

In [27]:
#site_to_IP = df_valid.set_index('site')['IP'].to_dict()

# after parsing whois info from CDN_BGP_Analysis
#sites_org = pd.read_pickle('output/df_org_email_info.pkl')

#sites_org = df_valid['IP'].apply(lambda s: pd.Series( findOrg(s) ) )
#sites_cdn = df_valid.merge(sites_org, left_index=True, right_index=True )

### Estimate CDN
- compare site to known CDN domains
- compare most popular url that hosts objects on site homepage to known CDN domains
- compare organization name from whois to CDN names
- compare email address from whois to CDN domains

#### domain

In [None]:
def compareDomain(site):
    """Placeholder for matching ``site`` against the known CDN domains.

    Not implemented yet. Returns ``None`` ("no CDN identified") so callers
    that test ``cdn is None`` keep working; the original returned the
    undefined name ``cdn`` and raised NameError when called.
    """
    # TODO domain vs CDNdomains.(dict)
    return None

#### parse

In [30]:
def compareURL(url_counter):
    """Placeholder for matching a page's most-referenced hosts against the known CDN domains.

    Not implemented yet. Returns ``None`` ("no CDN identified"); the
    original returned the undefined name ``cdn`` and raised NameError
    when called.
    """
    # TODO url: count vs CDNdomains.(dict)
    return None

#### whois

In [31]:
def compareOrg(org_list):
    """Placeholder for matching whois organisation names against the known CDN names.

    Not implemented yet. Returns ``None`` ("no CDN identified"); the
    original returned the undefined name ``cdn`` and raised NameError
    when called.
    """
    # TODO if cdnname in org_list
    return None

def compareEmail(email_list):
    """Placeholder for matching whois e-mail domains against the known CDN domains.

    Not implemented yet. Returns ``None`` ("no CDN identified"); the
    original returned the undefined name ``cdn`` and raised NameError
    when called.
    """
    # TODO email vs cdndomain
    return None


def matchCDN(org_list, cdn_names=None):
    """Return the first known CDN name contained in any organisation name.

    Parameters
    ----------
    org_list : list of str or None
        Organisation names, e.g. as extracted from whois output.
    cdn_names : iterable of str, optional
        CDN names to look for. Defaults to the notebook-level ``CDNnames``
        list (the original referenced the undefined name ``CDN_names``).

    Returns
    -------
    str or None
        The matching CDN name with its original spelling, or ``None``
        when nothing matches or ``org_list`` is empty.
    """
    if cdn_names is None:
        cdn_names = CDNnames

    for org in org_list or ():
        org_lower = org.lower()
        for cdn in cdn_names:
            cdn_lower = cdn.lower()
            # An empty name (blank line in the CSV) would match everything.
            if cdn_lower and cdn_lower in org_lower:
                return cdn
    return None

In [32]:
# For every valid site, try the domain match first, then the homepage
# URL counts, and gather the whois-derived organisation/e-mail lists.
for site,IP in site_to_IP.items():

    cdn = compareDomain(site)
    if cdn is None:
        # `counter` maps site -> [(netloc, count), ...]; sites without a
        # saved homepage have no entry. The original left this assignment
        # empty (a syntax error) and called compareURL() with no argument.
        url_counter = counter.get(site, [])
        cdn = compareURL(url_counter)

    # whois by IP: organisations and contact e-mail domains
    whois1 = loadwhoisIP(site)
    Org1 = searchOrg(whois1)
    Email = searchEmail(whois1)

    # whois by domain name: organisations only
    whois2 = loadwhoissite(site)
    Org2 = searchOrg(whois2)

    #print(site, IP, Org1, Org2, Email)

NameError: name 'site_to_IP' is not defined

# 3. Get curl request timings