In [234]:
from __future__ import division
%pylab inline
import pandas as pd
import os
import json
from collections import defaultdict, Counter
%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Number of sites per CDN and ASN

In [235]:
df_all = pd.read_pickle('output/df_sites.pkl')
df_asn_cdn = pd.read_pickle('output/df_asn_cdn.pkl')
df_site_cdn_method_map = pd.read_pickle('output/df_site_cdn_method_map.pkl')

In [236]:
df_asn_cdn.head()

Unnamed: 0,rank,site,IP,ASN,cdn_parsed,cdn_whois,cdn
0,1,google.com,172.217.160.238,15169,Google,Google,Google
1,2,youtube.com,172.217.31.14,15169,Google,Google,Google
2,3,facebook.com,157.240.25.35,32934,Facebook,Facebook,Facebook
3,4,baidu.com,123.125.115.110,4808,False,False,False
4,5,wikipedia.org,103.102.166.224,14907,False,False,False


## 1a. ASN Analysis
- Rank ASNs by number of sites

### Domain to IP and ASN
- use socket.gethostaddr() to get IP for domain
- use pyasn to get ASN (there are many methods and libraries to do this, pyasn lets you download routviews data)
- 480 of 500 websites are resolved to an IP address. We will be using these reachable sites in our analysis.

- Unreachable sites:
    - 20 websites are blocked either directly by ISP (porn, adware, etc.)
    - or we can't reach them due to bad DNS resolution/permissions (googleusercontent.com, cloudfront.net, etc.)

In [237]:
print("List of bad/unreachable sites: ",end=""),
print( list( df_all[ ( df_all['IP'] == False ) ]['site'] ) )

List of bad/unreachable sites: ['pornhub.com', 'ok.ru', 'livejasmin.com', 'xvideos.com', 'googleusercontent.com', 'xhamster.com', 'exosrv.com', 'xnxx.com', 'chaturbate.com', 'yts.am', 'youporn.com', '1337x.to', 'cloudfront.net', 'redtube.com', 'rutracker.org', 'banvenez.com', 'bp.blogspot.com', 'exdynsrv.com', 'sex.com', 'wixsite.com']


### Domains per ASN

- There are 190 unique ASNs serving 480 websites of the Alexa top 500.
    - <font color = 'red'>231 websites are served by 8 ASNs: Google (15169: 69 sites), ? (13335: 38 sites), Amazon AWS (16509: 36 sites), ? (54113: 30 sittes), Akamai (16625: 17 sites), Amazon AWS (14618: 17 sites), ? (37963: 13 sites), (4808: 11 sites)</font>
    - 37 ASNs serve 104 websites with each serving less than 10 but more than one.
    - 145 ASNs serve a website each.

- <font color = 'red'>There are x unique AS names</font>
    - TODO



In [238]:
num_sites_per_asn = df_asn_cdn.groupby([ 'ASN'])['rank'].count().sort_values(ascending = False)
many_sites = num_sites_per_asn[ num_sites_per_asn > 1]

In [239]:
print("Number of ASNs serving 480 sites of Alexa top 500 = ", len(num_sites_per_asn))

print("\nNumber of ASNs serving only one site each = ", len( num_sites_per_asn[ (num_sites_per_asn <= 1)] ))

print("\nNumber of ASNs serving more than one site but less than 10 = ", len( many_sites[many_sites < 10] ))
print("Number of sites served by above 37 ASNs == ", sum( many_sites[many_sites < 10] ))
                                                                                       
print("\nNumber of ASNs serving 10 or more websites each = ", len(many_sites[many_sites>=10]))
print("Number of sites served by above top 8 ASNs = ", sum(many_sites[many_sites>=10]))

print("\nTop ASNs catering to 10 or more websites each, and number of websites: ", end="")
print(many_sites[many_sites>10].to_dict())


Number of ASNs serving 480 sites of Alexa top 500 =  190

Number of ASNs serving only one site each =  145

Number of ASNs serving more than one site but less than 10 =  37
Number of sites served by above 37 ASNs ==  104

Number of ASNs serving 10 or more websites each =  8
Number of sites served by above top 8 ASNs =  231

Top ASNs catering to 10 or more websites each, and number of websites: {15169: 69, 13335: 38, 16509: 36, 54113: 30, 16625: 17, 14618: 17, 37963: 13, 4808: 11}


# TODO

In [240]:
# TODO
# import pyasn
# as_name_file = ??
# asndb = pyasn.pyasn('data/...', as_name_file)
# df_asn_cdn['AS_name'] = df_asn_cdn['ASN'].apply(asndb.get_as_name)
# df_asn_cdn.groupby(['AS_name'])['rank'].count().sort_values(ascending = False)

## 1b. CDN Analysis
- Filter the domains served by a CDN
- Rank CDN providers by number of sites

### Domain to CDN
- use a CDN names list from wikipedia and various websites
- use a CDN domains list from github that maps certain urls to the CDN served
- Three methods are used to finally determine the CDN used by the site (code in find_cdn_methods.py) 
    1. Check if site itself is well known CDN by comparing it to CDN names and CDN domains
    2. Counting sources on site:
        - Download and parse the site homepage (using requests and BeautifulSoup) and find the url sources of static objects (images and scripts) as well as links on the webpage.
        - Count objects per source url, and find the url with maximum or substantial traffic.
        - Compare url to well known CDN domains. If it can't be resolved.
    3. Parsing organization information for whois:
        - Most websites have CDN provider in the whois information since their publically resolved IPs belong to the CDN
        - Compare 'Organization' tag in whois to popular CDN names.
- At times, method 2 gives us a certain cdn url with many static object (for ex: twitchcdn.net for twitch.tv) while the whois tells us which CDN the site uses (ex: Fastly for twitch.tv). This allows us to associate the cdn url or domain with the CDN name to populate our CDN_domain dictionary for future testing.

In [293]:
df_asn_cdn[ df_asn_cdn['cdn']=='daumcdn.net' ]

Unnamed: 0,rank,site,IP,ASN,cdn_parsed,cdn_whois,cdn
153,154,daum.net,211.231.99.80,38099,daumcdn.net,False,daumcdn.net
291,292,tistory.com,211.231.108.151,38099,daumcdn.net,False,daumcdn.net


### Domains per CDNs
- 134 of 480 valid sites were not using CDNs.
    - The organization name in the whois record of these sites was either the site name itself or a hosting service that is not commonly known as a CDN provider.
    - Majority of static resources (images and scripts) were either hosted locally (instead of externally on a separate domain), or they were hosted on multiple domains. So we could not confirm a single domain or url carrying most of the traffic when loading the home page.


- 346 sites were using 101 (unconfirmed) CDNs. Of these 34 are confirmed and known CDNs while 66 are suspected CDNs.

- CONFIRMED: 34 CDNs were confirmed resolved to known CDNs from an offline list of CDN names and popular CDN domains. These 34 CDNs catered to 279 of 480 sites.
    - 4 of 34 known CDNs correspond to sites hosted on Amazon services counted individually (based on the returned whois records and urls: ['Amazon Technologies Inc.', 'Amazon AWS', 'Amazon.com, Inc.', 'Amazon CloudFront']). We have assumed that sites using Amazon hosting services are also using Cloudfront as their CDN (this may not be true in reality). Total sites catered by Amazon's 4 CDNs is 54.
    - 5 major CDNs with a large market share are ['Google', 'Cloudflare', 'Alibaba', 'Fastly', 'Akamai']. Total sites catered by these 5 major CDN providers is 180. Including Amazon, 234 sites were confirmed using these 6 major CDNs.
    - Total sites catered by 25 other known CDN providers (including 'MaxCDN', 'EdgeCast', 'Open-Connect', etc.) is 45. Some of these popular CDNs, such as 'Open-Connect', only cater to a single popular domain (netflix.com). Others (like Incapsula) cater to multiple sites (prothomalo.com, kooora.com) that are too low on the Alexa ranking.

- UNCONFIRMED: 66 URLs hosted majority of static objects (images and scripts) on the home page of the site. This behavior may indicate that it is a CDN, or it may be a static address that is not the same as the site we are checking. Of 66, only 'daumcdn.net' catered to two websites ('daum.net', 'tistory.com'), bringing the total to 67 sites.
    - We decided that there is a higher chance of such urls being a CDN if the url itself contains the string 'cdn'. Since this is not a confirmed method, we will continue the timing analysis of all 66 URLs considering them as unknown CDNs.
    - 37 of 66 URLs contained the string 'cdn', catering to 38 websites. For example: 'assets-cdn.github.com', 'cdn2.tstatic.net', 'line-scdn.net', etc.
    - 29 of 66 URLs do not contain the string 'cdn' and have a higher chance of being a static domain that doesn't match the site domain name and doesn't have a well known 'Organization' name in its whois. These 29 CDNs served 29 websites individually. This list contained 'www.googleadservices.com' (cobalten.com) and 'cfl.dropboxstatic.com' (dropbox.com), indicating that a majority of resources on these websites were hosted on the two URLs. While cobalten.com is not on Google Cloud, dropbox.com has a high chance of utilizing CDNs. Since this can't be confirmed we're analyzing all such sites with the assumption that they are on CDNs.


In [324]:
num_sites_per_cdn = df_asn_cdn.groupby('cdn')['rank'].count().sort_values(ascending=False)

print("Number of sites estimated not using CDNs = ", num_sites_per_cdn[False])
print("Total number of CDNs (unconfirmed) = ", len(num_sites_per_cdn))
print("Number of sites estimated using CDNs = ", (sum(num_sites_per_cdn) - num_sites_per_cdn[False]) )

Number of sites estimated not using CDNs =  134
Total number of CDNs (unconfirmed) =  101
Number of sites estimated using CDNs =  346


In [325]:
known_cdn = {}
unknown_cdn = {}
sum_dict = lambda v: sum([x for x in v.values() ])

for n, i in num_sites_per_cdn.items():
    if n:
        if n.lower() != n:
            # if some letters are capital, it should be resolved CDN.
            known_cdn[n] = i
        else:
            # very simple check for url: all letters are not capitalized.
            unknown_cdn[n] = i

print( "Number of known and resolved CDNs (based on whois and well known names): %s" %len(known_cdn) )
print( "Number of websites served by known and resolved CDNs: %s" % (sum_dict(known_cdn)) )

print("Number of unresolved or unknown URLs that seem like CDNs: %s" % (len(unknown_cdn)) )
print("Number of websites served by unresolved or unknown URLs that seem like CDNs: %s" % (sum_dict(unknown_cdn)) )

print("\nList of known CDNs and number of websites:\n\n", known_cdn)

print("\n\nList of unknown URLs that may or may not be CDNs:\n\n", list( unknown_cdn.keys()) )


Number of known and resolved CDNs (based on whois and well known names): 34
Number of websites served by known and resolved CDNs: 279
Number of unresolved or unknown URLs that seem like CDNs: 66
Number of websites served by unresolved or unknown URLs that seem like CDNs: 67

List of known CDNs and number of websites:

 {'Google': 70, 'Cloudflare': 39, 'Amazon Technologies Inc.': 32, 'Fastly': 30, 'Akamai': 29, 'Alibaba': 12, 'Amazon AWS': 12, 'Amazon.com, Inc.': 6, 'Microsoft': 5, 'Yahoo': 5, 'OVH': 4, 'Facebook': 4, 'Amazon CloudFront': 4, 'Twitter': 3, 'Incapsula': 2, 'LinkedIn CDN': 2, 'SoftLayer': 2, 'ChinaNetCenter': 2, 'Open-Connect (Netflix)': 1, 'WordPress': 1, 'Taobao': 1, 'StackPath': 1, 'Sohu': 1, 'Reflected Networks': 1, 'Orange': 1, 'Level 3': 1, 'Internap': 1, 'Instart Logic': 1, 'Hola': 1, 'HiNet': 1, 'Edgecast': 1, 'Comcast': 1, 'ChinaCache': 1, 'jsDelivr': 1}


List of unknown URLs that may or may not be CDNs:

 ['daumcdn.net', 'c5.rgstatic.net', 'awscdn.detik.net.id',

In [327]:
# simplistic logic to separate unknown sites
unknown_cdn_cnt = 0
unknown_url_cnt = 0

for n in unknown_cdn.keys():
    if 'cdn' in n:
        unknown_cdn_cnt += 1
    #elif n in ['aws.amazon.com', 'www.googleadservices.com']:
    #    known_cdn_cnt += 1
    else:
        unknown_url_cnt += 1
        #print (n)
        
print("Unknown CDNs based on url (contain string 'cdn'): %s\nUnknown urls that might not be CDNs: %s" \
      % (unknown_cdn_cnt, unknown_url_cnt))

df_asn_cdn[ df_asn_cdn['cdn'].isin( ['cfl.dropboxstatic.com', 'www.googleadservices.com'] ) ]

Unknown CDNs based on url (contain string 'cdn'): 37
Unknown urls that might not be CDNs: 29


Unnamed: 0,rank,site,IP,ASN,cdn_parsed,cdn_whois,cdn
80,81,dropbox.com,162.125.248.1,19679,cfl.dropboxstatic.com,False,cfl.dropboxstatic.com
150,151,cobalten.com,188.72.213.176,35415,www.googleadservices.com,False,www.googleadservices.com


In [317]:
only_one_site = num_sites_per_cdn[ num_sites_per_cdn <= 1]
many_sites = num_sites_per_cdn[ num_sites_per_cdn > 1]

print("Number of known CDNs (resolved) or URLs catering to majority static objects on one site's homepage:",
      len(only_one_site))

print("\nNumber of sites confirmed using CDNs (multiple sites per CDN) = ", (sum(many_sites) - many_sites[False]))
print("Number of sites using 1 CDN each (only one site per known CDN + 1 url that may be CDNs) = ", sum(only_one_site))

print("\nCDN providers (more than 1 site each) and number of sites:")
print(many_sites[ many_sites.index!=False ])

Number of known CDNs (resolved) or URLs catering to majority static objects on one site's homepage: 81

Number of sites confirmed using CDNs (multiple sites per CDN) =  265
Number of sites using 1 CDN each (only one site per known CDN + 1 url that may be CDNs) =  81

CDN providers (more than 1 site each) and number of sites:
cdn
Google                      70
Cloudflare                  39
Amazon Technologies Inc.    32
Fastly                      30
Akamai                      29
Alibaba                     12
Amazon AWS                  12
Amazon.com, Inc.             6
Microsoft                    5
Yahoo                        5
OVH                          4
Facebook                     4
Amazon CloudFront            4
Twitter                      3
Incapsula                    2
LinkedIn CDN                 2
SoftLayer                    2
daumcdn.net                  2
ChinaNetCenter               2
Name: rank, dtype: int64


In [342]:
# Count sites using 6 major well known CDNs that cater to most of the market

major_known_cdns = ['Google', 'Cloudflare', 'Alibaba', 'Fastly', 'Akamai']
amazon_cdns = ['Amazon Technologies Inc.', 'Amazon AWS', 'Amazon.com, Inc.', 'Amazon CloudFront']

major_cdn_site_cnt = 0
amazon_cdn_site_cnt = 0
other_cdn_cnt = 0
other_cdn_site_cnt = 0
for n, i in known_cdn.items():
    if n in major_known_cdns:
        major_cdn_site_cnt += i
    elif n in amazon_cdns:
        amazon_cdn_site_cnt += i
    else:
        other_cdn_cnt += 1
        other_cdn_site_cnt += i

print("List of Amazon CDN providers as resolved by whois: \n", amazon_cdns)
print("List of major CDN providers with most customer sites (excluding Amazon): \n", major_known_cdns, "\n")

print("Total sites catered by Amazon's %s CDNs is %s" % (len(amazon_cdns), amazon_cdn_site_cnt))
print("Total sites catered by %s other major CDN providers is %s" % (len(major_known_cdns), major_cdn_site_cnt))
print("Total sites catered by %s other known CDN providers is %s" % (other_cdn_cnt, other_cdn_site_cnt))

List of Amazon CDN providers as resolved by whois: 
 ['Amazon Technologies Inc.', 'Amazon AWS', 'Amazon.com, Inc.', 'Amazon CloudFront']
List of major CDN providers with most customer sites (excluding Amazon): 
 ['Google', 'Cloudflare', 'Alibaba', 'Fastly', 'Akamai'] 

Total sites catered by Amazon's 4 CDNs is 54
Total sites catered by 5 other major CDN providers is 180
Total sites catered by 25 other known CDN providers is 45


In [344]:
df_asn_cdn[ df_asn_cdn['cdn'].isin( ['Yahoo', 'Facebook', 'Incapsula'] ) ]

Unnamed: 0,rank,site,IP,ASN,cdn_parsed,cdn_whois,cdn
2,3,facebook.com,157.240.25.35,32934,Facebook,Facebook,Facebook
6,7,yahoo.com,72.30.35.10,26101,Yahoo,Yahoo,Yahoo
18,19,instagram.com,52.45.71.129,14618,Facebook,Amazon Technologies Inc.,Facebook
28,29,yahoo.co.jp,183.79.135.206,24572,Yahoo,Yahoo,Yahoo
79,80,fbcdn.net,157.240.13.35,32934,Facebook,Facebook,Facebook
230,231,messenger.com,157.240.13.14,32934,False,Facebook,Facebook
350,351,aol.com,106.10.218.150,56173,aolcdn.com,Yahoo,Yahoo
403,404,flickr.com,69.147.88.7,36088,Yahoo,Yahoo,Yahoo
445,446,prothomalo.com,107.154.248.36,19551,paloimages.prothom-alo.com,Incapsula,Incapsula
481,482,kooora.com,107.154.102.19,19551,False,Incapsula,Incapsula


# 1c. Association between CDN and ASN/AS name

In [203]:
# TODO

# 2. Site timings per CDN