In [23]:
from __future__ import division
%pylab inline
import pandas as pd
import os
import json
from collections import defaultdict, Counter
%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


# DATA
1. Get host IP and ASN for Alexa top 500 websites: use socket and pyasn (offline routeviews) libraries
- Find CDN of each website: use list of common CDNs and their related domains
    - Compare site to well known CDN domains (such as googleusercontent.com, cloudfront.net, etc.)
    - Parse homepage 
        - Parsing static objects on homepage to find number of resources per well known CDN domain
    - Parse whois
        - 'Organization' field might have the CDN name
        - site contact email servers used might be related to certain CDN domains
- Use curl to get timings for accessing sites: save response time for at least 20 requests per site
    - -v and --trace-time flags allow us to read timing information in real time
    - curl -w flag allows us to write time since the request was issued for name lookup, connection, SSL negotiation, and data reception
    - Calculate times:
        - t_dns = time for DNS resolution (no redirects) = time_namelookup - time_redirect  
        - t_tcp = time for TCP connection (SYN/SYNACK) = time_connect - time_namelookup  
        - t_ssl = time for SSL handshake (only if https) = time_appconnect - time_connect  
        - t_fbyte = time_starttransfer
        - t_wait = time between issuing GET request and first byte received = time_starttransfer - time_pretransfer  
        - t_rx = time to receive data from first to last byte = time_total - time_starttransfer

# Analysis
- 

In [2]:
df_sites = pd.read_csv('top-1m-new.csv', nrows=500, header = None, names = ['rank', 'site'])

## 1. Get IP and ASNs
- Get IP using socket.gethostbyname method
- Get ASN using pyasn (can also be done using whois, downloading routeviews data, etc.)

In [3]:
import socket

def getIP(s):
    try:
        IP = socket.gethostbyname(s)
        # ISP blocked domains return IP 49.207.46.6, 49.207.46.24, 49.207.46.34
        if IP in ['49.207.46.6', '49.207.46.24', '49.207.46.34']:
            print("Blocked site "+s)
            return False
        else:
            return IP
    except:
        print("Error accessing site "+s)
        return False

In [4]:
df_sites['IP'] = df_sites['site'].apply(getIP)

Blocked site pornhub.com
Blocked site ok.ru
Blocked site livejasmin.com
Blocked site xvideos.com
Error accessing site googleusercontent.com
Blocked site xhamster.com
Error accessing site exosrv.com
Blocked site xnxx.com
Blocked site chaturbate.com
Blocked site yts.am
Blocked site youporn.com
Blocked site 1337x.to
Error accessing site cloudfront.net
Blocked site redtube.com
Blocked site rutracker.org
Error accessing site banvenez.com
Error accessing site bp.blogspot.com
Error accessing site exdynsrv.com
Blocked site sex.com
Error accessing site wixsite.com


In [5]:
import pyasn

asndb = pyasn.pyasn('output/ipasn_20181212.dat')  #downloaded pyasn_util_download.py --latest
print(asndb.lookup('8.8.8.8'))

def findASN(ip):
    if ip:
        return asndb.lookup(ip)[0]
    else:
        return False

(15169, '8.8.8.0/24')


In [6]:
df_sites['ASN'] = df_sites['IP'].apply(findASN)

In [7]:
df_sites.head()

Unnamed: 0,rank,site,IP,ASN
0,1,google.com,172.217.160.238,15169
1,2,youtube.com,172.217.31.14,15169
2,3,facebook.com,157.240.25.35,32934
3,4,baidu.com,123.125.115.110,4808
4,5,wikipedia.org,103.102.166.224,14907


### Separate unblocked IPs and sites for analysis
- 20 of 500 sites are unreachable or blocked by the ISP
- Querying these sites and urls returns an IP at the edge of the ISP we're connected to
- Most of these sites are porn related or known for adware/malware

In [8]:
df_valid = df_sites[df_sites['IP'] != False]

print("Number of valid sites for further analysis: %s\n" % (len(df_valid)))

print("List of blocked sites: %s" % list(df_sites[df_sites['IP'] == False]['site']))

Number of valid sites for further analysis: 480

List of blocked sites: ['pornhub.com', 'ok.ru', 'livejasmin.com', 'xvideos.com', 'googleusercontent.com', 'xhamster.com', 'exosrv.com', 'xnxx.com', 'chaturbate.com', 'yts.am', 'youporn.com', '1337x.to', 'cloudfront.net', 'redtube.com', 'rutracker.org', 'banvenez.com', 'bp.blogspot.com', 'exdynsrv.com', 'sex.com', 'wixsite.com']


In [9]:
# save site_to_IP dictionary for valid sites
df_sites.to_pickle('output/df_sites.pkl')

site_to_IP = (df_valid.set_index('site')['IP']).to_dict()
json.dump(site_to_IP, open('output/site_to_IP.json', 'w'))

## 2. Find CDN
- CDNnames
- CDNdomains

### Estimate CDN
- compare site to known CDN domains
- compare most popular url that hosts objects on site homepage to known CDN domains
- compare organization name from whois to CDN names
- compare email address from whois to CDN domains

In [238]:
from CDNdomains import cdn_domain
from count_objects import fetch_homepage_url_count
from parse_whois import loadwhoisIP, loadwhoissite, searchOrg, searchEmail

with open('CDNnames.csv', 'r') as f:
    CDNnames = f.read().splitlines()
    
site_to_IP = json.load(open('output/site_to_IP.json', 'r'))

In [239]:
import logging
logging.basicConfig(filename='output/log_cdn_finder.log', level=logging.DEBUG, 
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.debug('This is a log message.')

In [240]:
def find_cdn_by_site(site):
    """
    input site and cdn_domains dict
    search site in cdn_domains top down
    return cdn_name
    """
    for cdn_url in cdn_domain.keys():
        if site in cdn_url:
            # for short sites like t.co that are in all cdn_domains
            check_prefix, check_suffix = cdn_url.split(site, 1)  #split only once starting from left, atleast . prefix
            if (check_prefix[-1] == ".") or (check_prefix[-4:] == '.cdn'):  
                #only if site was complete, last char of prefix will be a '.' from cdn_domains
                print("\t\t\t\t\t"+site+" in "+cdn_url)
                return cdn_domain[cdn_url]
            print("\t\t\t\t\tError checking "+site+" in "+cdn_url)
    return False

def find_cdn_by_url(url):
    """
    input url from site and cdn_domains dict {cdn_url: cdn_name}
    for cdn_name in cdn_domains search cdn_name in url (not url in cdn_name)
    return cdn_name
    """
    for cdn_url in cdn_domain.keys():
        # add leading '.' to url if there is none
        if url[0] != '.':
            url = '.' + url
            
        if cdn_url in url:
            return cdn_domain[cdn_url]
    return False

def save_cdn_counter_data(site, cnt, cdn_cnt):
    if not os.path.exists('output/cdn_counter'):
        os.makedirs('output/cdn_counter')
    temp = {}
    temp['site'] = site
    temp['cdn_count'] = cdn_cnt
    temp['url_count'] = cnt
    
    with open('output/cdn_counter/'+site+'.json', 'w') as fout:
        json.dump(temp, fout)
    # read using temp = json.load(open(site.json))
    return
    
def find_cdn_by_counting_url(site):
    """finds probable cdn based on most count of url static resources"""
    cnt = fetch_homepage_url_count(site)
    cdn_cnt = defaultdict(int)
    
    if cnt:
        #print(site+":", end="")
        #print(cnt[:4])
        
        for url, url_cnt in cnt:

            if url == '':
                cdn_name = ''
            else:
                cdn_name = find_cdn_by_url(url)
                
                if not (cdn_name):
                    #if '.cdn' in url:
                    #    #split at first occurance of 'cdn' and print everything after
                    #    cdn_name = ''.join(url.partition('.cdn')[1:])  # unknown CDN to be resolved later
                    #elif 'cdn.' in url:
                    #    #split at last '.' before cdn (regex is better for this)
                    #    url_r = url[::-1]
                    #    cdn_name = ''.join(url.partition('.ndc')[1:])
                    if 'cdn' in url:
                        parts = url.split('.')
                        for i in range(len(parts)):
                            if 'cdn' in parts[i]:
                                break
                        cdn_name = '.'.join(parts[i:])
                    elif (site in url):
                        cdn_name = ''  # local site resources and links
                    else:
                        cdn_name = '?' # unknown external links

            cdn_cnt[cdn_name] += url_cnt
            
        #print(cdn_cnt)
        # sort by value into list [('cdn1': n1), ('cdn2': n2), ('?': n3), ...]
        cdn_cnt_sorted = sorted(cdn_cnt.items(), key=lambda x: x[1], reverse=True)
        save_cdn_counter_data(site, cnt, cdn_cnt_sorted)
        
        #remove "?" as option
        cdn_cnt_filtered = [ i for i in cdn_cnt_sorted if i[0]!='?' ]
        
        if len(cdn_cnt_filtered) > 0:
            #for c,n in cdn_cnt_filtered:
            #    if (site in c):
            #        return c

            sum_cdn_links = sum([ i[1] for i in cdn_cnt_filtered ])

            i = 0
            probable_cdn = cdn_cnt_filtered[i][0]
            # enter loop only if first choice is '' else return first choice
            while (probable_cdn == ''):
                i += 1
                if i < len(cdn_cnt_filtered):
                    probable_cdn = cdn_cnt_filtered[i][0]

                    if ( cdn_cnt_filtered[i][1] <= 2 ):
                        i = 0
                        probable_cdn = cdn_cnt_filtered[i][0]
                        break

                    if probable_cdn in ['Google', 'Facebook', 'Twitter', 'Amazon CloudFront']:
                        if ( cdn_cnt_filtered[i][1] >= sum_cdn_links/5 ):
                            # 1/10 was arbitarily decided but surprisingly was the threshold for deciding whatsapp, stackoverflow
                            break
                        else:
                            # if links to Google etc much less than sum then set back to first element regardless of ''
                            i = 0
                            probable_cdn = cdn_cnt_filtered[i][0]
                            break
                            # problem only if unknown cdn follows Google (ex: 9gag, whatsapp)


                else:
                    # if i>len set back to first element regardless of '' or [Google, fb, twitter] or whatever
                    i = 0
                    probable_cdn = cdn_cnt_filtered[i][0]
                    break

            print('\t'+site+': '+probable_cdn+' | ' + str(sum_cdn_links)+': ', end="")
            print(cdn_cnt_filtered)

            return probable_cdn
        
    return False

def find_cdn_by_whois_org(site):
    
    all_cdn_names = list( set(CDNnames + list(cdn_domain.values())) )
    
    whois1 = loadwhoisIP(site)
    Org1 = searchOrg(whois1)
    #Email = searchEmail(whois1)
    

    print("\twhois " + site + " | ", end="")
    print(Org1, end = " | ")
    #print("\t" + Org2)
    
    for org in Org1: #list( set(Org1+Org2) ):
        for cdn in all_cdn_names:
            if (org.lower() in cdn.lower()) or (cdn.lower() in org.lower()):
                print(" cdn "+cdn)
                return cdn
    print()        
    return False

def find_cdn_by_whois_site(site):
    
    all_cdn_names = list( set(CDNnames + list(cdn_domain.values())) )

    whois2 = loadwhoissite(site)
    Org2 = searchOrg(whois2)
    
    print("\twhois " + site + " | ", end="")
    print(Org2, end = " | ")
    
    for org in Org2: #list( set(Org1+Org2) ):
        for cdn in all_cdn_names:
            if (org.lower() in cdn.lower()) or (cdn.lower() in org.lower()):
                print(" cdn "+cdn)
                return cdn
    print()        
    return False

In [241]:
def find_cdn(site):
    """wrapper func"""
    # site in cdn or cdn in site urls
    matching_cdn = find_cdn_by_site(site) or find_cdn_by_url(site)
    
    if not matching_cdn:
        # by counting objects
        cdn_parsed = find_cdn_by_counting_url(site)
        # by matching whois organization
        cdn_whois = find_cdn_by_whois_org(site)
        
        if cdn_whois in ['Fastly', 'Cloudflare', 'Akamai', 'Alibaba', 'Google LLC']:
            # google cloud customers have 'Google LLC' in whois
            matching_cdn = cdn_whois
        else:
            matching_cdn = cdn_parsed or cdn_whois    # prioritize parsed based matching except when above case
        
        if not matching_cdn:
            matching_cdn = find_cdn_by_whois_site(site)
            
            if not matching_cdn:
                return False
    
    return matching_cdn

In [237]:
probable_cdn = {}

for site in site_to_IP.keys():
    probable_cdn[site] = find_cdn(site)
    
    print(site, IP, probable_cdn[site])

google.com 170.146.93.56 Google
youtube.com 170.146.93.56 Google
					facebook.com in .facebook.com
facebook.com 170.146.93.56 Facebook
	baidu.com:  | 15: [('', 15)]
	whois baidu.com | [] | 
	whois baidu.com | ['Beijing Baidu Netcom Science Technology Co., Ltd.'] | 
baidu.com 170.146.93.56 False
	wikipedia.org:  | 307: [('', 306), ('Google', 1)]
	whois wikipedia.org | ['ORG-WFI1-AP', 'Wikimedia Foundation, Inc.'] | 
	whois wikipedia.org | ['Public Interest Registry (PIR)', 'Afilias', 'Wikimedia Foundation, Inc.'] | 
wikipedia.org 170.146.93.56 False
	qq.com:  | 67: [('', 67)]
	whois qq.com | [] | 
	whois qq.com | ['Shenzhen Tencent Computer Systems CO.,Ltd'] | 
qq.com 170.146.93.56 False
					yahoo.com in .ay1.b.yahoo.com
yahoo.com 170.146.93.56 Yahoo
					Error checking amazon.com in .ssl-images-amazon.com
					Error checking amazon.com in .media-amazon.com
	amazon.com: Amazon AWS | 104: [('Amazon AWS', 71), ('', 33)]
	whois amazon.com | [] | 
amazon.com 170.146.93.56 Amazon AWS
					

	amazon.de: Amazon AWS | 104: [('Amazon AWS', 80), ('', 24)]
	whois amazon.de | [] | 
amazon.de 170.146.93.56 Amazon AWS
	pinterest.com:  | 56: [('', 53), ('Google', 2), ('Facebook', 1)]
	whois pinterest.com | ['Fastly (SKYCA-3)', 'Fastly'] |  cdn Fastly
pinterest.com 170.146.93.56 Fastly
	amazon.co.uk: Amazon AWS | 105: [('Amazon AWS', 78), ('', 27)]
	whois amazon.co.uk | ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] |  cdn Amazon Technologies Inc.
amazon.co.uk 170.146.93.56 Amazon AWS
	adobe.com:  | 54: [('', 54)]
	whois adobe.com | ['Adobe Systems Incorporated (ADOBES-3)', 'Adobe Systems Incorporated'] | 
	whois adobe.com | ['Adobe Inc.', 'REDACTED FOR PRIVACY'] | 
adobe.com 170.146.93.56 False
					fbcdn.net in .fbcdn.net
fbcdn.net 170.146.93.56 Facebook
	dropbox.com:  | 45: [('', 45)]
	whois dropbox.com | ['Dropbox, Inc. (DROPB)', 'Dropbox, Inc.'] | 
	whois dropbox.com | ['Dropbox, Inc.'] | 
dropbox.com 170.146.93.56 False
	thestartmagazine.com:  | 12: [('', 1

	iqiyi.com:  | 514: [('', 514)]
	whois iqiyi.com | [] | 
	whois iqiyi.com | [] | 
iqiyi.com 170.146.93.56 False
	etsy.com:  | 96: [('', 93), ('Facebook', 2), ('cdn.ravenjs.com', 1)]
	whois etsy.com | ['Fastly (SKYCA-3)', 'Fastly'] |  cdn Fastly
etsy.com 170.146.93.56 Fastly
	1688.com: Alibaba | 184: [('', 136), ('Alibaba', 48)]
	whois 1688.com | [] | 
1688.com 170.146.93.56 Alibaba
	popads.net:  | 42: [('', 42)]
	whois popads.net | ['Total Uptime Technologies, LLC (TUTL-1)', 'Total Uptime Technologies, LLC'] | 
	whois popads.net | ['Tomksoft S.A.'] | 
popads.net 170.146.93.56 False
google.com.vn 170.146.93.56 Google
	panda.tv: cdn.xiongmaoxingyan.com | 224: [('', 220), ('cdn.xiongmaoxingyan.com', 4)]
	whois panda.tv | [] | 
panda.tv 170.146.93.56 cdn.xiongmaoxingyan.com
	theguardian.com:  | 358: [('', 351), ('Google', 4), ('Facebook', 3)]
	whois theguardian.com | ['Fastly (SKYCA-3)', 'Fastly'] |  cdn Fastly
theguardian.com 170.146.93.56 Fastly
	vimeo.com: vimeocdn.com | 64: [('', 45), 

	gearbest.com: gbtcdn.com | 1167: [('', 1131), ('gbtcdn.com', 34), ('Facebook', 1), ('Google', 1)]
	whois gearbest.com | ['Akamai International, BV (AIB-17)', 'Akamai International, BV', 'Akamai Technologies, Inc. (AKAMAI)', 'Akamai Technologies, Inc.'] |  cdn Akamai
gearbest.com 170.146.93.56 Akamai
	163.com:  | 106: [('', 106)]
	whois 163.com | [] | 
	whois 163.com | ['Guangzhou NetEase Computer System Co., Ltd'] | 
163.com 170.146.93.56 False
	china.com.cn:  | 586: [('', 586)]
	whois china.com.cn | [] | 
	whois china.com.cn | ['China Internet Network Information Center (CNNIC)'] | 
china.com.cn 170.146.93.56 False
google.nl 170.146.93.56 Google
google.com.pe 170.146.93.56 Google
	people.com.cn:  | 51: [('', 51)]
	whois people.com.cn | ['ChinaCache Global Network (CGN-20)', 'ChinaCache Global Network', 'Zenlayer Inc (ZENLA-7)', 'Zenlayer Inc'] |  cdn ChinaCache
people.com.cn 170.146.93.56 ChinaCache
google.com.ph 170.146.93.56 Google
	freepik.com: cdns2.freepik.com | 194: [('', 169),

	usps.com:  | 71: [('', 71)]
	whois usps.com | ['US Postal Service', 'United States Postal Service. (USPS-3)', 'United States Postal Service.'] | 
	whois usps.com | ['US Postal Service'] | 
usps.com 170.146.93.56 False
	hulu.com:  | 46: [('', 44), ('Akamai', 2)]
	whois hulu.com | ['Akamai Technologies, Inc. (AKAMAI)', 'Akamai Technologies, Inc.'] |  cdn Akamai
hulu.com 170.146.93.56 Akamai
	medium.com: cdn-images-1.medium.com | 91: [('', 79), ('cdn-images-1.medium.com', 6), ('cdn-static-1.medium.com', 4), ('Amazon CloudFront', 1), ('Google', 1)]
	whois medium.com | ['Cloudflare, Inc. (CLOUD14)', 'Cloudflare, Inc.'] |  cdn Cloudflare
medium.com 170.146.93.56 Cloudflare
	livejournal.com:  | 63: [('', 61), ('Facebook', 1), ('Google', 1)]
	whois livejournal.com | ['ORG-RT4-RIPE', 'Rambler Internet Holding LLC'] | 
	whois livejournal.com | [] | 
livejournal.com 170.146.93.56 False
	myshopify.com: cdn.shopify.com | 6: [('cdn.shopify.com', 3), ('', 3)]
	whois myshopify.com | ['Shopify, Inc. (

	liputan6.com: Akamai | 1749: [('', 805), ('Akamai', 768), ('Facebook', 85), ('Google', 84), ('cdn.onesignal.com', 1), ('cdns.klimg.com', 1), ('cdn-a.production.liputan6.static6.com', 1), ('cdn1-a.production.liputan6.static6.com', 1), ('cdn0-a.production.images.static6.com', 1), ('cdn1-a.production.images.static6.com', 1), ('cdn-a.production.vidio.static6.com', 1)]
	whois liputan6.com | ['Amazon Data Services Singapore (ADSS-3)', 'Amazon Data Services Singapore', 'Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] |  cdn Amazon Technologies Inc.
liputan6.com 170.146.93.56 Akamai
	momoshop.com.tw:  | 5: [('', 5)]
	whois momoshop.com.tw | [] | 
	whois momoshop.com.tw | ['Taiwan Network Information Center (TWNIC)'] | 
momoshop.com.tw 170.146.93.56 False
	weather.com:  | 70: [('', 58), ('Google', 7), ('cdn.polyfill.io', 2), ('cdn.emetriq.de', 1), ('Amazon CloudFront', 1), ('cdn.taboola.com', 1)]
	whois weather.com | ['Akamai Technologies, Inc. (AKAMAI)', 'Akamai Technologies,

	varzesh3.com:  | 104: [('', 103), ('Facebook', 1)]
	whois varzesh3.com | ['ORG-ART1-RIPE', 'Aria Shatel Company Ltd'] | 
	whois varzesh3.com | ['Domains By Proxy, LLC'] | 
varzesh3.com 170.146.93.56 False
google.co.il 170.146.93.56 Google
	zoom.us: Amazon CloudFront | 188: [('', 108), ('Amazon CloudFront', 62), ('Google', 17), ('Facebook', 1)]
	whois zoom.us | ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] |  cdn Amazon Technologies Inc.
zoom.us 170.146.93.56 Amazon CloudFront
	academia.edu:  | 33: [('', 29), ('cdn.pubwise.io', 2), ('Google', 1), ('maxcdn.bootstrapcdn.com', 1)]
	whois academia.edu | [] | 
	whois academia.edu | ['EDUCAUSE', 'VeriSign Global Registry'] | 
academia.edu 170.146.93.56 False
	gfycat.com:  | 150: [('', 150)]
	whois gfycat.com | ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] |  cdn Amazon Technologies Inc.
gfycat.com 170.146.93.56 Amazon Technologies Inc.
	gamepedia.com: cursecdn.com | 145: [('', 119), ('cursecdn.com', 

	slickdeals.net: slickdealscdn.com | 470: [('slickdealscdn.com', 447), ('', 17), ('Facebook', 3), ('Google', 2), ('acdn.adnxs.com', 1)]
	whois slickdeals.net | ['Instart Logic, Inc (IL-69)', 'Instart Logic, Inc'] |  cdn Instart Logic
slickdeals.net 170.146.93.56 slickdealscdn.com
	namu.wiki:  | 91: [('', 89), ('Google', 2)]
	whois namu.wiki | ['Cloudflare, Inc. (CLOUD14)', 'Cloudflare, Inc.'] |  cdn Cloudflare
namu.wiki 170.146.93.56 Cloudflare
	pixabay.com: cdn.pixabay.com | 59: [('', 37), ('cdn.pixabay.com', 20), ('Google', 1), ('Facebook', 1)]
	whois pixabay.com | ['ORG-HOA1-RIPE'] | 
pixabay.com 170.146.93.56 cdn.pixabay.com
	mercadolibre.com.mx:  | 205: [('', 198), ('Google', 6), ('Facebook', 1)]
	whois mercadolibre.com.mx | ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] |  cdn Amazon Technologies Inc.
mercadolibre.com.mx 170.146.93.56 Amazon Technologies Inc.
	kaskus.co.id:  | 218: [('', 209), ('Google', 6), ('Facebook', 2), ('Amazon CloudFront', 1)]
	whois ka

	factaholics.com:  | 123: [('', 120), ('Google', 1), ('Facebook', 1), ('jsDelivr', 1)]
	whois factaholics.com | ['Sucuri (SUCUR-2)', 'Sucuri'] | 
	whois factaholics.com | ['WhoisGuard, Inc.'] | 
factaholics.com 170.146.93.56 False
	olx.ua:  | 6: [('', 6)]
	whois olx.ua | ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] |  cdn Amazon Technologies Inc.
olx.ua 170.146.93.56 Amazon Technologies Inc.
	wiktionary.org:  | 160: [('', 160)]
	whois wiktionary.org | ['ORG-WFI1-AP', 'Wikimedia Foundation, Inc.'] | 
	whois wiktionary.org | ['Public Interest Registry (PIR)', 'Afilias', 'Wikimedia Foundation, Inc.'] | 
wiktionary.org 170.146.93.56 False
	livedoor.com:  | 112: [('', 112)]
	whois livedoor.com | [] | 
	whois livedoor.com | ['LINE Corporation', 'GMO Brights Consulting Inc.'] | 
livedoor.com 170.146.93.56 False
	pikabu.ru:  | 20: [('', 20)]
	whois pikabu.ru | ['ORG-FHS3-RIPE', 'Fornex Hosting S.L.'] | 
	whois pikabu.ru | ['Coordination Center for TLD RU', 'Technical Cent

	seasonvar.ru: cdn.seasonvar.ru | 78: [('', 71), ('cdn.seasonvar.ru', 7)]
	whois seasonvar.ru | [] | 
seasonvar.ru 170.146.93.56 cdn.seasonvar.ru
	ouedkniss.com:  | 151: [('', 148), ('Google', 2), ('Amazon CloudFront', 1)]
	whois ouedkniss.com | [] | 
	whois ouedkniss.com | ['OuedKniss.com'] | 
ouedkniss.com 170.146.93.56 False
	goo.ne.jp:  | 386: [('', 381), ('adcdn.goo.ne.jp', 2), ('Google', 2), ('Facebook', 1)]
	whois goo.ne.jp | [] | 
	whois goo.ne.jp | ['Japan Registry Services Co., Ltd.'] | 
goo.ne.jp 170.146.93.56 False
	marca.com: uecdn.es | 56: [('', 33), ('uecdn.es', 22), ('Google', 1)]
	whois marca.com | ['ORG-UE2-RIPE', 'ORG-GATI1-RIPE', 'Unidad Editorial S.A.'] | 
marca.com 170.146.93.56 uecdn.es
	smallpdf.com:  | 40: [('', 38), ('Amazon CloudFront', 1), ('Google', 1)]
	whois smallpdf.com | ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] |  cdn Amazon Technologies Inc.
smallpdf.com 170.146.93.56 Amazon Technologies Inc.
	bitly.com:  | 26: [('', 21), ('Am

In [242]:
probable_cdn

{'google.com': 'Google',
 'youtube.com': 'Google',
 'facebook.com': 'Facebook',
 'baidu.com': False,
 'wikipedia.org': False,
 'qq.com': False,
 'yahoo.com': 'Yahoo',
 'amazon.com': 'Amazon AWS',
 'taobao.com': 'Taobao',
 'reddit.com': 'Fastly',
 'tmall.com': 'Alibaba',
 'google.co.in': 'Google',
 'twitter.com': 'Twitter',
 'live.com': 'Microsoft',
 'sohu.com': 'Sohu',
 'jd.com': False,
 'yandex.ru': False,
 'google.co.jp': 'Google',
 'instagram.com': 'Facebook',
 'weibo.com': False,
 'sina.com.cn': False,
 '360.cn': False,
 'login.tmall.com': 'Alibaba',
 'blogspot.com': 'Google',
 'google.com.hk': 'Google',
 'linkedin.com': 'LinkedIn CDN',
 'netflix.com': 'Open-Connect (Netflix)',
 'google.com.br': 'Google',
 'yahoo.co.jp': 'Yahoo',
 'office.com': 'Microsoft',
 'microsoftonline.com': 'Microsoft',
 'google.co.uk': 'Google',
 'csdn.net': False,
 'vk.com': False,
 'google.fr': 'Google',
 'mail.ru': False,
 'google.de': 'Google',
 'pages.tmall.com': 'Alibaba',
 'aliexpress.com': 'Alibaba'

# NOTES

- Contendo is part of Akamai but we haven't replaced it
- netflix content actually uses Open Connect, but the whois itself has all IPs on Amazon.
- instructure.com probably not on CDN, but whois detects Amazon
- nih.gov is on Amazon AWS even though all links are in fact local AS16509
- nextoptim.com detected as Google since we are redirected to google's home page but is actually a redirect virus
- homedepot.com 35.201.95.83 whois shows google cloud customer but might be Akamai's old customer (like apple?)
- airbnb.com should be using Fastly, but whois shows only Amazon, so detected as Amazon
- bitly.com uses amazon cloudfront probably but whois doesn't show. should have been detected in site url count.
- alipay.com uses alicdn.com but we are unable to grab urls as they are deep inside css style div backgroundimage strings
- office365.com is auth page not on CDN, but once logged in all data will be served by akamai. should this be included? added to CDN_domains.py
- twitch.tv will give twitchcdn.net if page parse first and fastly if whois first. twitchcdn.net belongs to fastly
- office.com will give Microfoft on whois and Akamai on pageparse
- cdn whois Amazon Technologies Inc. is infact Amazon EC2 instance of wepage. Amazon EC2 may not use Cloudfront to host content files, though its recomended if you want lot of content delivery
- <font color='red'>For analysis "Amazon Technologies Inc.", "Amazon AWS" and "Amazon Cloudfront", "Cloudfront" should all resolve to "Cloudfront"</font>
- whois site infact doesn't get used at all
- roblox.com: rbxcdn.com is actually Cogeco Peer 1 CDN network that was not in the list
- spotify has moved to google cloud

### TODO
- swap whois first and url count second to see results
- try to remove whois org2
- consider matching_cdn1 from parser and matching_cdn2 from whois - then select best option

In [None]:
#import os

sites_list = list(site_to_IP.keys())
counter = {}
bad_sites = []

for site in sites_list:
    
    file = 'output/homepage/'+site
    altfile = 'output/homepage/'+site+'.html'
    
    if os.path.exists(file):
        print(file+': ', end=""),
        with open(file, 'r') as f:
            data = f.read()
        cnt = count_netlocs( data )
        del cnt[None]  # remove empty src NOT local src
        counter[site] = cnt.most_common()  # sort Counter
        print(len(cnt))
    elif os.path.exists(altfile):
        print(altfile+': ', end=""),
        with open(altfile, 'r', encoding="utf8", errors='ignore') as f:
            data = f.read()
        cnt = count_netlocs( data )
        del cnt[None]  # remove empty src NOT local src
        counter[site] = cnt.most_common()  # sort Counter
        print(len(cnt))
    else:
        bad_sites.append(site)
        
print(bad_sites)

In [None]:
counter

# <font color='red'> TODO </font>
- compare website name to popular CDN domains
- compare counter per site to CDN domain

### whois data
- saved in output/whoissite/ and output/whoisIP/
- sometimes Organization names contains the name of the registry or andminitrator. This should be ignored

In [114]:
#site_to_IP = df_valid.set_index('site')['IP'].to_dict()

# after parsing whois info from CDN_BGP_Analysis
#sites_org = pd.read_pickle('output/df_org_email_info.pkl')

#sites_org = df_valid['IP'].apply(lambda s: pd.Series( findOrg(s) ) )
#sites_cdn = df_valid.merge(sites_org, left_index=True, right_index=True )

#### domain

In [None]:
def compareDomain(site):
    # TODO domain vs CDNdomains.(dict)
    return cdn

#### parse

In [None]:
def compareURL(url_counter):
    # TODO url: count vs CDNdomains.(dict)
    return cdn

#### whois

In [None]:
def compareOrg(org_list):
    # TODO if cdnname in org_list
    return cdn

def compareEmail(email_list):
    # TODO email vs cdndomain
    return cdn


def matchCDN(org_list):
    if len(org_list)>0:
        for org in org_list:
            org_lower = org.lower()
            for cdn in CDN_names:
                cdn_lower = cdn.lower()
                if (cdn_lower in org_lower):
                    return cdn
                
    else:
        return None

In [115]:
for site,IP in site_to_IP.items():
    
    whois1 = loadwhoisIP(site)
    Org1 = searchOrg(whois1)
    Email = searchEmail(whois1)
    
    whois2 = loadwhoissite(site)
    Org2 = searchOrg(whois2)
    
    print(site, IP, Org1, Org2, Email)

google.com 172.217.160.238 ['Google LLC (GOGL)', 'Google LLC'] ['Google LLC'] ['google.com']
youtube.com 172.217.31.14 ['Google LLC (GOGL)', 'Google LLC'] ['Google LLC'] ['google.com']
facebook.com 157.240.25.35 ['Facebook, Inc. (THEFA-3)', 'Facebook, Inc.'] ['Facebook, Inc.'] ['facebook.com']
baidu.com 123.125.115.110 [] ['Beijing Baidu Netcom Science Technology Co., Ltd.'] ["chinaunicom.cn'", 'baidu.com.cn']
wikipedia.org 103.102.166.224 ['ORG-WFI1-AP', 'Wikimedia Foundation, Inc.'] ['Public Interest Registry (PIR)', 'Afilias', 'Wikimedia Foundation, Inc.'] ["wikimedia.org'", 'wikimedia.org']
qq.com 111.161.64.48 [] ['Shenzhen Tencent Computer Systems CO.,Ltd'] ["chinaunicom.cn'", 'chinaunicom.cn']
yahoo.com 72.30.35.10 ['Yahoo! Inc. (YHOO)', 'Yahoo! Inc.'] ['Oath Inc.'] ['yahoo-inc.com', 'cc.yahoo-inc.com']
amazon.com 176.32.98.166 [] ['Amazon Technologies, Inc.'] ["amazon.com'"]
taobao.com 140.205.94.189 [] ['Zhejiang Taobao Network Limited (浙江淘宝网络有限公司)'] ['apnic.net', "cnnic.cn'",

craigslist.org 208.82.237.226 ['Craigslist, Inc. (CRAIGS-5)', 'Craigslist, Inc.'] ['Public Interest Registry (PIR)', 'Afilias', 'Craigslist'] ['craigslist.org']
cobalten.com 188.72.213.176 [] ['GLOBAL DOMAIN PRIVACY SERVICES INC'] ["as5577.net'", 'webzilla.com"']
google.co.ve 172.217.160.227 ['Google LLC (GOGL)', 'Google LLC'] ['Comisión Nacional de Telecomunicaciones (CONATEL)'] ['google.com']
slideshare.net 108.174.11.74 ['LinkedIn Corporation (LINKE-1)', 'LinkedIn Corporation'] ['LinkedIn Corporation'] ['linkedin.com']
daum.net 211.231.99.80 [] ['Kakao Corp.', ''] ["nic.or.kr'", 'nic.or.kr', 'daouidc.com']
amazon.it 52.95.116.114 ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] ['IIT - CNR', 'Amazon Europe Core S.à.r.l.', 'Amazon.com, Inc.', 'Hogan Lovells (Paris) LLP'] ['amazon.com', 'amazonaws.com']
google.co.ao 172.217.160.227 ['Google LLC (GOGL)', 'Google LLC'] ['Faculdade de Engenharia da Universidade Agostinho Neto', 'Associação DNS.PT (DNS.PT)'] ['google.com

breitbart.com 35.241.35.213 ['Google LLC (GOOGL-2)', 'Google LLC'] ['Domains By Proxy, LLC'] ['google.com)', 'google.com']
wordreference.com 64.251.31.226 ['Infolink Global Corporation (IGC-44)', 'Infolink Global Corporation'] ['WhoisGuard, Inc.'] ['infolink.com', 'serverpronto.net']
mailchimp.com 104.122.13.191 ['Akamai Technologies, Inc. (AKAMAI)', 'Akamai Technologies, Inc.'] ['ROCKET SCIENCE GROUP', 'The Rocket Science Group, LLC'] ['akamai.com']
bet365.com 5.226.176.16 [] ['bet365 Group Limited'] ["hillsidenewmedia.com'"]
google.ie 172.217.31.3 ['Google LLC (GOGL)', 'Google LLC'] ['University College Dublin', 'Computing Services', 'Computer Centre', 'IE Domain Registry Limited'] ['google.com']
fedex.com 204.135.8.175 ['FedEx (FEC)', 'FedEx'] ['FedEx'] ['network.fedex.com', 'fedex.com']
buzzfeed.com 52.84.104.144 ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] ['BuzzFeed, Inc.'] ['amazon.com', 'amazonaws.com']
gosuslugi.ru 109.207.1.97 ['ORG-JR8-RIPE', 'PJSC Rost

prothomalo.com 107.154.248.36 ['Incapsula Inc (INCAP-5)', 'Incapsula Inc'] [''] ['imperva.com', 'incapsula.com']
divar.ir 79.175.141.110 [] ['Institute for Research in Fundamental Sciences', 'Agah Pardazan Hooshmand'] ["afranet.com'"]
prezi.com 35.168.174.197 ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] ['Whois Privacy Service'] ['amazonaws.com', 'amazon.com']
google.sk 172.217.166.195 ['Google LLC (GOGL)', 'Google LLC'] ['SK-NIC, a.s.', 'Office of Deputy Prime Minister of the Slovak Republic for Investments and Informatization', 'FAJNOR IP s. r. o.', 'Google Ireland Holdings Unlimited Company'] ['google.com']
termometropolitico.it 104.25.180.31 ['Cloudflare, Inc. (CLOUD14)', 'Cloudflare, Inc.'] ['IIT - CNR', 'Gianluca Borrelli', 'Aruba s.p.a.'] ['cloudflare.com']
okezone.com 202.80.220.218 [] ['PT Linktone Indonesia'] ["idnic.net'", 'top.net.id', 'idnic.net']
investing.com 185.94.84.100 [] ['Fusion Media Limited'] ["investing.com'"]
free.fr 212.27.48.10 [] ['Asso

# 3. Get curl request timings