In [1]:
from __future__ import division
%pylab inline
import pandas as pd
import os
import json
from collections import defaultdict, Counter
%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib


# DATA
1. Get host IP and ASN for Alexa top 500 websites: use socket and pyasn (offline routeviews) libraries
- Find CDN of each website: use list of common CDNs and their related domains
    - Compare site to well known CDN domains (such as googleusercontent.com, cloudfront.net, etc.)
    - Parse homepage 
        - Parsing static objects on homepage to find number of resources per well known CDN domain
    - Parse whois
        - 'Organization' field might have the CDN name
        - site contact email servers used might be related to certain CDN domains
- Use curl to get timings for accessing sites: save response time for at least 20 requests per site
    - -v and --trace-time flags allow us to read timing information in real time
    - curl -w flag allows us to write time since the request was issued for name lookup, connection, SSL negotiation, and data reception
    - Calculate times:
        - t_dns = time for DNS resolution (no redirects) = time_namelookup - time_redirect  
        - t_tcp = time for TCP connection (SYN/SYNACK) = time_connect - time_namelookup  
        - t_ssl = time for SSL handshake (only if https) = time_appconnect - time_connect  
        - t_fbyte = time_starttransfer
        - t_wait = time between issuing GET request and first byte received = time_starttransfer - time_pretransfer  
        - t_rx = time to receive data from first to last byte = time_total - time_starttransfer

# Analysis
- 

In [2]:
# The Alexa ranking file has no header row: the first column is the rank and
# the second the domain. Only the first 500 rows (the top-500 sites) are read.
df_sites = pd.read_csv(
    'data/top-1m-new.csv',
    names=['rank', 'site'],
    header=None,
    nrows=500,
)

## 1. Get IP and ASNs
- Get IP using socket.gethostbyname method
- Get ASN using pyasn (can also be done using whois, downloading routeviews data, etc.)

In [None]:
import socket

def getIP(s, blocked_ips=('49.207.46.6', '49.207.46.24', '49.207.46.34')):
    """Resolve host name ``s`` to an IPv4 address string.

    Parameters
    ----------
    s : str
        Host name to resolve, e.g. ``'example.com'``.
    blocked_ips : collection of str, optional
        Addresses that the ISP returns for domains it blocks. A lookup that
        resolves to one of these is treated as a blocked site. The default is
        the set of addresses observed when this notebook was written.

    Returns
    -------
    str or bool
        The resolved address, or ``False`` if the name could not be resolved
        or resolved to one of ``blocked_ips``.
    """
    try:
        ip = socket.gethostbyname(s)
    except (socket.error, UnicodeError):
        # socket.error covers the resolver failures (gaierror, herror, timeout);
        # UnicodeError is raised for names that cannot be IDNA-encoded.
        # Anything else - notably KeyboardInterrupt - is left to propagate so
        # a long-running apply() over all sites can still be interrupted.
        print("Error accessing site "+s)
        return False

    if ip in blocked_ips:
        print("Blocked site "+s)
        return False
    return ip

In [None]:
# Resolve every site name; getIP returns False for blocked or unresolvable hosts.
df_sites['IP'] = df_sites['site'].apply(getIP)

In [None]:
import pyasn

# Offline IP -> ASN database (routeviews snapshot read from a local file).
asndb = pyasn.pyasn('data/ipasn_20181212.dat')  # downloaded with pyasn_util_download.py --latest
# Sanity check: print the lookup result for a well-known address.
print(asndb.lookup('8.8.8.8'))

def findASN(ip):
    """Return the AS number that ``asndb`` reports for ``ip``.

    ``ip`` is an address string, or a falsy placeholder (``False`` marks sites
    that getIP could not resolve). Falsy input short-circuits to ``False``;
    otherwise the first element of ``asndb.lookup(ip)`` is returned.
    """
    return asndb.lookup(ip)[0] if ip else False

In [None]:
# Map each resolved IP to its ASN; rows whose IP is False get False here too.
df_sites['ASN'] = df_sites['IP'].apply(findASN)

In [None]:
# Preview the first rows to check the new IP and ASN columns.
df_sites.head()

### Separate unblocked IPs and sites for analysis
- 20 of 500 sites are unreachable or blocked by the ISP
- Querying these sites and urls returns an IP at the edge of the ISP we're connected to
- Most of these sites are porn related or known for adware/malware

In [None]:
# Rows whose IP is the sentinel False were blocked or failed to resolve;
# everything else is kept for the rest of the analysis.
df_valid = df_sites.loc[df_sites['IP'].ne(False)]

print("Number of valid sites for further analysis: %s\n" % (len(df_valid)))

print("List of blocked sites: %s" % list(df_sites.loc[df_sites['IP'].eq(False), 'site']))

In [None]:
# Persist the full lookup table (blocked/unresolved rows included).
df_sites.to_pickle('output/df_sites.pkl')

# Save the site -> IP dictionary for valid sites only.
site_to_IP = (df_valid.set_index('site')['IP']).to_dict()
# Use a context manager so the file is flushed and closed deterministically;
# passing a bare open(...) to json.dump leaves the handle open.
with open('output/site_to_IP.json', 'w') as f:
    json.dump(site_to_IP, f)

## 2. Find CDN
- load data

In [40]:
# Reload the table pickled in section 1.
df_sites = pd.read_pickle('output/df_sites.pkl')
# Keep only sites that resolved; .copy() gives an independent frame so new
# columns can be added without touching df_sites.
df_valid = df_sites[df_sites['IP'] != False].copy()

### Estimate CDN
- CDNnames
- CDNdomains
- compare site to known CDN domains
- compare most popular url that hosts objects on site homepage to known CDN domains
- compare organization name from whois to CDN names
- compare email address from whois to CDN domains

In [20]:
from CDNdomains import cdn_domains, cdn_names
from find_cdn_methods import find_cdn_by_site, find_cdn_by_url, find_cdn_by_counting_url, find_cdn_by_whois_org

#copy CDNnames to CDNdomains.py
#with open('CDNnames.csv', 'r') as f:
#    CDNnames = f.read().splitlines()
    
# Load the site -> IP mapping from disk. A context manager closes the file
# handle, which a bare open(...) passed to json.load would leave open.
with open('output/site_to_IP.json', 'r') as f:
    site_to_IP = json.load(f)

In [21]:
#import logging
#logging.basicConfig(filename='output/log_cdn_finder.log', level=logging.DEBUG, 
#                    format='%(asctime)s - %(levelname)s - %(message)s')
#logging.debug('This is a log message.')

In [95]:
# split find cdn into 2 functions
def cdn_parse_site(site):
    """Detect the CDN of ``site`` from its name and the URLs on its home page.

    The three page-based finders are tried in order (site name, URL match,
    URL counting); the first truthy answer wins and later finders are not
    called. Returns that answer, or ``False`` if every finder came back empty
    (both ``False`` and ``""`` count as "nothing found").
    """
    found = find_cdn_by_site(site)
    if not found:
        found = find_cdn_by_url(site)
    if not found:
        found = find_cdn_by_counting_url(site)

    # Normalise every falsy "no match" value to False.
    return found or False

def cdn_parse_whois(site):
    """Detect the CDN of ``site`` from the organisation reported by whois.

    Returns whatever ``find_cdn_by_whois_org`` finds, or ``False`` when it
    finds nothing (``False`` and ``""`` are both treated as no match).
    A fallback to ``find_cdn_by_whois_site`` is currently disabled.
    """
    return find_cdn_by_whois_org(site) or False

## Get CDN from both methods

In [96]:
# Page-based detection (site name and home-page URLs) for every valid site.
df_valid['cdn_parsed'] = df_valid['site'].apply(cdn_parse_site)
# Whois-based detection (organisation field) for every valid site.
df_valid['cdn_whois'] = df_valid['site'].apply(cdn_parse_whois)
# Placeholder; the final label is computed from the two columns above by estimate_cdn.
df_valid['cdn'] = False

	baidu.com:  | 15: [('', 15)]
	wikipedia.org:  | 307: [('', 306), ('Google', 1)]
	qq.com:  | 67: [('', 67)]
	amazon.com: Amazon AWS | 104: [('Amazon AWS', 71), ('', 33)]
	reddit.com: www.redditstatic.com | 98: [('www.redditstatic.com', 69), ('', 29)]
	tmall.com: Alibaba | 71: [('', 59), ('Alibaba', 12)]
	twitter.com:  | 92: [('', 81), ('Twitter', 11)]
	live.com:  | 6: [('', 4), ('Microsoft', 1), ('Windows Azure', 1)]
	sohu.com: Sohu | 485: [('', 456), ('Sohu', 29)]
	jd.com:  | 135: [('', 135)]
	yandex.ru:  | 135: [('', 135)]
	weibo.com:  | 31: [('', 31)]
	sina.com.cn:  | 1806: [('', 1806)]
	360.cn:  | 273: [('', 272), ('qihucdn.com', 1)]
	login.tmall.com: Alibaba | 12: [('Alibaba', 9), ('', 3)]
	blogspot.com: Google | 83: [('Google', 42), ('', 41)]
	linkedin.com: LinkedIn CDN | 177: [('', 160), ('LinkedIn CDN', 17)]
	netflix.com: Open-Connect (Netflix) | 41: [('', 28), ('Open-Connect (Netflix)', 13)]
	yahoo.co.jp: Yahoo | 113: [('', 62), ('Yahoo', 51)]
	office.com: www.microsoft.com | 

	uol.com.br:  | 207: [('', 206), ('Google', 1)]
	globo.com: s2.glbimg.com | 664: [('', 397), ('s2.glbimg.com', 266), ('tiqcdn.com', 1)]
	flipkart.com: rukminim1.flixcart.com | 67: [('', 47), ('rukminim1.flixcart.com', 17), ('Google', 2), ('Facebook', 1)]
	wetransfer.com: prod-cdn.wetransfer.net | 27: [('', 15), ('prod-cdn.wetransfer.net', 12)]
	mercadolivre.com.br:  | 208: [('', 201), ('Google', 6), ('Facebook', 1)]
	godaddy.com:  | 262: [('', 259), ('Google', 2), ('Facebook', 1)]
	quizlet.com:  | 65: [('', 63), ('Google', 1), ('Facebook', 1)]
	sciencedirect.com: www.elsevier.com | 5: [('www.elsevier.com', 5)]
	mediafire.com:  | 39: [('', 36), ('Google', 1), ('cdn.ravenjs.com', 1), ('Facebook', 1)]
	caijing.com.cn: cdn.caijing.com.cn | 383: [('', 365), ('cdn.caijing.com.cn', 18)]
	gearbest.com: gbtcdn.com | 1167: [('', 1131), ('gbtcdn.com', 34), ('Facebook', 1), ('Google', 1)]
	163.com:  | 106: [('', 106)]
	china.com.cn:  | 586: [('', 586)]
	people.com.cn:  | 51: [('', 51)]
	freepik.co

	fedex.com:  | 250: [('', 248), ('Facebook', 1), ('Google', 1)]
	buzzfeed.com:  | 120: [('', 119), ('Google', 1)]
	gosuslugi.ru: gu-st.ru | 52: [('', 41), ('gu-st.ru', 11)]
	primevideo.com: Amazon AWS | 51: [('Amazon AWS', 34), ('', 17)]
	kinopoisk.ru:  | 12: [('', 12)]
	doublepimp.com: aspnetcdn.com | 9: [('', 5), ('aspnetcdn.com', 3), ('Google', 1)]
	rt.com: cdni.rt.com | 74: [('', 52), ('cdni.rt.com', 12), ('Google', 6), ('Facebook', 4)]
	ifeng.com:  | 1427: [('', 1427)]
	orange.fr: hp5.a.woopic.com | 187: [('', 125), ('hp5.a.woopic.com', 61), ('tiqcdn.com', 1)]
	redd.it: www.redditstatic.com | 140: [('www.redditstatic.com', 118), ('', 21), ('Google', 1)]
	zoho.com:  | 44: [('', 44)]
	fandom.com: fandom.wikia.com | 366: [('fandom.wikia.com', 302), ('Fastly', 22), ('', 21), ('Facebook', 13), ('Google', 8)]
	americanexpress.com:  | 346: [('', 346)]
	namnak.com:  | 97: [('', 95), ('Amazon CloudFront', 1), ('Google', 1)]
	yy.com: yyweb.yystatic.com | 58: [('', 23), ('yyweb.yystatic.com'

	wix.com:  | 132: [('', 124), ('Google', 5), ('Facebook', 2), ('cdn.trackjs.com', 1)]
	siteadvisor.com:  | 8: [('', 8)]
	zippyshare.com:  | 52: [('', 52)]
	icloud.com:  | 16: [('', 16)]
	ndtv.com: gadgets360cdn.com | 676: [('', 646), ('gadgets360cdn.com', 22), ('cdn.ndtv.com', 5), ('Facebook', 2), ('cdn.gadgets360.com', 1)]
	chinadaily.com.cn:  | 475: [('', 474), ('Facebook', 1)]
	factaholics.com:  | 123: [('', 120), ('Google', 1), ('Facebook', 1), ('jsDelivr', 1)]
	olx.ua:  | 6: [('', 6)]
	wiktionary.org:  | 160: [('', 160)]
	livedoor.com:  | 112: [('', 112)]
	pikabu.ru:  | 20: [('', 20)]
	3dmgame.com:  | 785: [('', 785)]
	uidai.gov.in:  | 126: [('', 107), ('Google', 16), ('Facebook', 3)]
	airbnb.com:  | 86: [('', 86)]
	icicibank.com:  | 88: [('', 87), ('Facebook', 1)]
	patreon.com:  | 59: [('', 53), ('Cloudflare', 2), ('Facebook', 2), ('cdn.ravenjs.com', 1), ('Google', 1)]
	shaparak.ir:  | 41: [('', 41)]
	upwork.com:  | 41: [('', 37), ('Google', 3), ('Facebook', 1)]
	leboncoin.fr:  |

	whois ask.com | ['NON-RIPE-NCC-MANAGED-ADDRESS-BLOCK', 'SKYCA-3', 'Fastly (SKYCA-3)', 'Fastly'] |  cdn Fastly
	whois google.com.ua | ['GOOGLE', 'Google LLC (GOGL)', 'Google LLC'] |  cdn Google Cloud
	whois iqiyi.com | ['CHINANET-SH'] | 
	whois etsy.com | ['NON-RIPE-NCC-MANAGED-ADDRESS-BLOCK', 'SKYCA-3', 'Fastly (SKYCA-3)', 'Fastly'] |  cdn Fastly
	whois 1688.com | ['Taobao'] |  cdn Taobao
	whois popads.net | ['TUT-NET-2', 'Total Uptime Technologies, LLC (TUTL-1)', 'Total Uptime Technologies, LLC'] | 
	whois google.com.vn | ['GOOGLE', 'Google LLC (GOGL)', 'Google LLC'] |  cdn Google Cloud
	whois panda.tv | ['ALISOFT'] | 
	whois theguardian.com | ['NON-RIPE-NCC-MANAGED-ADDRESS-BLOCK', 'SKYCA-3', 'Fastly (SKYCA-3)', 'Fastly'] |  cdn Fastly
	whois vimeo.com | ['NON-RIPE-NCC-MANAGED-ADDRESS-BLOCK', 'SKYCA-3', 'Fastly (SKYCA-3)', 'Fastly'] |  cdn Fastly
	whois avito.ru | ['RU-AVITO-SERVERS1'] | 
	whois google.co.za | ['GOOGLE', 'Google LLC (GOGL)', 'Google LLC'] |  cdn Google Cloud
	whois g

	whois chegg.com | ['AT-88-Z', 'Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] |  cdn Amazon Technologies Inc.
	whois wp.pl | ['WPPL'] | 
	whois skype.com | ['MSFT', 'Microsoft Corporation (MSFT)', 'Microsoft Corporation'] |  cdn Microsoft
	whois onet.pl | ['ONET-PL'] | 
	whois gmx.net | ['DE-SCHLUND-20030806', 'ORG-SA12-RIPE', '1&1 Internet SE'] | 
	whois macys.com | ['AKAMAI', 'Akamai Technologies, Inc. (AKAMAI)', 'Akamai Technologies, Inc.'] |  cdn Akamai China CDN
	whois exoclick.com | ['NL-OVH', 'ORG-OB14-RIPE', 'OVH BV'] |  cdn OVH
	whois gismeteo.ru | ['RU-MAPMAKERSGROUP-20160118', 'ORG-GL201-RIPE', '"MapMakers Group" Ltd'] | 
	whois ebay.it | ['EBAY-2', 'eBay, Inc (EBAY)', 'eBay, Inc'] | 
	whois chouftv.ma | ['CLOUDFLARENET', 'Cloudflare, Inc. (CLOUD14)', 'Cloudflare, Inc.'] |  cdn Cloudflare
	whois shopify.com | ['AT-88-Z', 'Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] |  cdn Amazon Technologies Inc.
	whois znanija.com | ['CLOUDFLARENET', 

In [97]:
# Every label that counts as a recognised CDN: the values of the
# domain -> CDN mapping plus the plain list of CDN names.
all_valid_cdn_names = list( cdn_domains.values() ) + cdn_names
# Column-wise record ('site', 'cdn_parsed', 'cdn_whois') of the cases where
# estimate_cdn prefers the whois result over an unrecognised parsed value.
site_cdn_method_map = defaultdict(list)
# Counter for ad-hoc debugging; no active code in this cell reads it.
i = 0

def estimate_cdn(site, cdn_parsed, cdn_whois):
    """Choose the final CDN label for ``site`` from the two detection results.

    Parameters
    ----------
    site : str
        Site name; only used when recording an override.
    cdn_parsed : str or bool
        Result of the page-based detection (falsy when nothing was found).
    cdn_whois : str or bool
        Result of the whois-based detection (falsy when nothing was found).

    Returns
    -------
    str or bool
        * If at most one detector found something: ``cdn_whois or cdn_parsed``.
        * If both found something and ``cdn_parsed`` is listed in
          ``all_valid_cdn_names``: ``cdn_parsed``.
        * Otherwise ``cdn_whois``; the (site, cdn_parsed, cdn_whois) triple is
          appended to the module-level ``site_cdn_method_map`` as a side effect.
    """
    # No conflict to settle: whois takes priority, else fall back to the parsed value.
    if not (cdn_parsed and cdn_whois):
        return cdn_whois or cdn_parsed

    # Both detectors answered. A recognised CDN name from page parsing is kept.
    if cdn_parsed in all_valid_cdn_names:
        return cdn_parsed

    # The parsed value is an unrecognised URL: trust whois, and note the
    # association between that URL and the whois CDN for later inspection.
    for column, value in (('site', site),
                          ('cdn_parsed', cdn_parsed),
                          ('cdn_whois', cdn_whois)):
        site_cdn_method_map[column].append(value)
    return cdn_whois

In [98]:
# Combine the two detection columns row by row into the final 'cdn' label.
# NOTE: estimate_cdn appends to site_cdn_method_map, so re-running this cell
# without first resetting that dict records the same sites again.
df_valid['cdn'] = df_valid.apply(lambda row: estimate_cdn(row['site'], row['cdn_parsed'], row['cdn_whois']), axis=1)

In [99]:
# Tabulate the sites where an unrecognised parsed value was overridden by the
# whois result, and save the table for later inspection.
df_site_cdn_method_map = pd.DataFrame(site_cdn_method_map)
df_site_cdn_method_map.to_pickle('output/df_site_cdn_method_map.pkl')

# Display the table.
df_site_cdn_method_map

Unnamed: 0,site,cdn_parsed,cdn_whois
0,reddit.com,www.redditstatic.com,Fastly
1,office.com,www.microsoft.com,Microsoft
2,twitch.tv,twitchcdn.net,Fastly
3,t.co,twitter.com,Twitter
4,stackoverflow.com,cdn.sstatic.net,Fastly
5,porn555.com,porn555.agat-tech.com,Cloudflare
6,espn.com,espncdn.com,Amazon Technologies Inc.
7,quora.com,quoracdn.net,Amazon Technologies Inc.
8,bbc.com,www.bbc.co.uk,Fastly
9,spotify.com,scdn.co,Google


In [101]:
# Persist the per-site table (rank, site, IP, ASN and the three CDN columns).
df_valid.to_pickle('output/df_asn_cdn.pkl')

# NOTES

- Cotendo may be part of Akamai but we haven't replaced it
- instructure.com probably not on CDN, but whois detects Amazon
- nih.gov is on Amazon AWS even though all links are in fact local AS16509
- nextoptim.com detected as Google since we are redirected to google's home page but is actually a redirect virus
- homedepot.com 35.201.95.83 whois shows google cloud customer but might be Akamai's old customer (like apple?)
- airbnb.com should be using Fastly, but whois shows only Amazon, so detected as Amazon
- bitly.com uses amazon cloudfront probably but whois doesn't show. should have been detected in site url count.
- alipay.com uses alicdn.com but we are unable to grab urls as they are deep inside css style div backgroundimage strings
- office365.com is auth page not on CDN, but once logged in all data will be served by akamai. should this be included? added to CDN_domains.py
- twitch.tv will give twitchcdn.net if page parse first and fastly if whois first. twitchcdn.net belongs to fastly
- office.com will give Microsoft on whois and Akamai on page parse
- cdn whois Amazon Technologies Inc. is in fact an Amazon EC2 instance of the webpage. Amazon EC2 may not use Cloudfront to host content files, though it's recommended if you want a lot of content delivery
- spotify has moved to google cloud
- it seems tokopedia.com was Cloudfront but has moved to Alibaba Cloud (as seen on whois)

- <font color='red'>For analysis "Amazon Technologies Inc.", "Amazon AWS" and "Amazon Cloudfront", "Cloudfront" should all resolve to "Cloudfront"
- tripadvisor.com whois shows MCI Communications Services, Inc. d/b/a Verizon Business (MCICS) and resolves to Verizon as CDN name. In fact it is an entry for Verizon EdgeCast. In future all Verizon should be resolved to "EdgeCast"
</font>

## Case for parsing page
- whois force.com  shows 'Internet Media Network', 'UltraDNS Corp'. UltraDNS is a DNS service probably not cdn. However it should be on Salesforce/Akamai as detected by parser
- whois researchgate.net and roblox.com show 'Cogeco Peer 1'. This CDN has not been detected by parser. Parser shows that it's not a CDN site.
- whois popads.net shows Total Uptime Technologies. It's a cloud balancing service, not a CDN. popads.net should not be on a CDN.
- <font color='red'>whois txxx.com and hclips.com gives cloudflare but page parsing pix-cdn.net is linked to something called 'Advanced Hosters CDN'. This is only copied from a previous file and is unconfirmed. Should we remove it???</font>

## TODO


# NOTES2: Blocked, redirects, and unreachable
- Blocked lookups at ISP
    - pornhub.com, livejasmin.com, xvideos.com, xhamster.com, xnxx.com, chaturbate.com, youporn.com, redtube.com, sex.com, 
    - yts.am : movie torrents
    - ok.ru
    - 1337x.to : movie torrents
    - rutracker.org : torrents
- Blocked malware (403 Forbidden)
    - exosrv.com : malware/adware
    - swiftviz.net : virus redirect/malware
    - i62e2b4mfy.com : malware/adware
    - vinfdv6b4j.com : virus redirect
    - s9kkremkr0.com : virus redirect
    - omumultation.club : virus popups
    - nextoptim.com : adware redirect popups
    - resentaticexhaus.info : possible malware, IP timeouts on ping, not 403
    - exdynsrv.com : adware redirect/malware, not 403
    - hotchedmothe.club: malware popup ads
    - digitaldsp.com : redirect virus, not 403
    - jf71qh5v14.com : virus/adware
    -
- No lookup or redirect
    - banvenez.com > e-bdvcpx.banvenez.com exists but can't be reached automatically
    
    
- Unreachable servers (possible CDNs)
    - microsoftonline.com : port 80 and 443 are probably closed
    - googleusercontent.com :  port 80 and 443 are probably closed
    - twimg.com : Twitter CDN by Twitter (twimg.com) blocked on port 80 and 443
    - cloudfront.net > hosted on Amazon aws as part of Amazon Cloudfront CDN. Blocked port 80 and 443. No DNS lookup.
    - bp.blogspot.com > hosts resources for blogspot. Blocked 80 443. Probably not CDN just a simple direct server.
        - https://2-bp.blogspot.com/2017/11/what-are-bpblogspotcom-links-and-how.html 
        
    
- Redirects
    - wikia.com > wikia.com/fandom
    - zhihu.com > zhihu.com/signup
    - fandom.com > fandom.wikia.com
        - contains fastly-insights.com widget scripts
    - rednet.cn > can't really reach this page on browser either
        - downloads a 200 response page saying redirect
     - ci123.com > same as above
         - 200 response redirect page that is very slow on browser was manually downloaded to ci123.com.html
- No curl (timeouts with curl not with requests)
    - usps.com
    - bestbuy.com
    - udemy.com > robot check
    - momoshop.com.tw > different source when viewed online vs by curl or requests
        - 1 small asia.creativecdn.com in script tag on end of page, most sources on momoshop.com.tw
        - manually download main page from browser as momoshop.com.tw.html
    - leboncoin.fr > blocked 403 when using requests/curl but available on browser
        - manually download main page from browser as leboncoin.fr.html
    - kissanime.ru > Please wait 5 seconds... doesn't work with minibrowsers
        - manually download main page from browser as kissanime.ru.html
    - roblox.com > gaming site that prechecks for cookies and scripts and tells not working with 200 code
        - manually download main page from browser as roblox.com.html
- Others
    - jianshu.com > cdn2.jianshu.com in < link href >
    - shopify.com > cdn.shopify.com in < link href >
    - xfinity.com > cdn.comcast.com
    - investing.com  > link href akamaized.net
    - discordapp.com > has a field CDN_HOST: 'cdn.discordapp.com' but not in static sources
    - bodelen.com > undetected redirect adware virus shows google page
    - netflix.com > video content is on Open-Connect CDN however homepage doesn't load any video content

In [None]:
#site_to_IP = df_valid.set_index('site')['IP'].to_dict()

# after parsing whois info from CDN_BGP_Analysis
#sites_org = pd.read_pickle('output/df_org_email_info.pkl')

#sites_org = df_valid['IP'].apply(lambda s: pd.Series( findOrg(s) ) )
#sites_cdn = df_valid.merge(sites_org, left_index=True, right_index=True )

# 3. Get curl request timings

In [46]:
# Reload the per-site IP / ASN / CDN table saved at the end of section 2.
df_asn_cdn = pd.read_pickle('output/df_asn_cdn.pkl')

In [102]:
# Preview the first rows of the reloaded table.
df_asn_cdn.head()

Unnamed: 0,rank,site,IP,ASN,cdn_parsed,cdn_whois,cdn
0,1,google.com,172.217.160.238,15169,Google,Google,Google
1,2,youtube.com,172.217.31.14,15169,Google,Google,Google
2,3,facebook.com,157.240.25.35,32934,Facebook,Facebook,Facebook
3,4,baidu.com,123.125.115.110,4808,False,False,False
4,5,wikipedia.org,103.102.166.224,14907,False,False,False
