In [1]:
from __future__ import division
%pylab inline
import pandas as pd
import os
import json
from collections import defaultdict, Counter
%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib


# DATA
1. Get host IP and ASN for Alexa top 500 websites: use socket and pyasn (offline routeviews) libraries
- Find CDN of each website: use list of common CDNs and their related domains
    - Compare site to well known CDN domains (such as googleusercontent.com, cloudfront.net, etc.)
    - Parse homepage 
        - Parsing static objects on homepage to find number of resources per well known CDN domain
    - Parse whois
        - 'Organization' field might have the CDN name
        - site contact email servers used might be related to certain CDN domains
- Use curl to get timings for accessing sites: save response time for at least 20 requests per site
    - -v and --trace-time flags allow us to read timing information in real time
    - curl -w flag allows us to write time since the request was issued for name lookup, connection, SSL negotiation, and data reception
    - Calculate times:
        - t_dns = time for DNS resolution (no redirects) = time_namelookup - time_redirect  
        - t_tcp = time for TCP connection (SYN/SYNACK) = time_connect - time_namelookup  
        - t_ssl = time for SSL handshake (only if https) = time_appconnect - time_connect  
        - t_fbyte = time_starttransfer
        - t_wait = time between issuing GET request and first byte received = time_starttransfer - time_pretransfer  
        - t_rx = time to receive data from first to last byte = time_total - time_starttransfer

# Analysis
- 

In [2]:
# The ranking file has no header row; keep only the first 500 (rank, site) rows.
df_sites = pd.read_csv(
    'top-1m-new.csv',
    header=None,
    names=['rank', 'site'],
    nrows=500,
)

## 1. Get IP and ASNs
- Get IP using socket.gethostbyname method
- Get ASN using pyasn (can also be done using whois, downloading routeviews data, etc.)

In [3]:
import socket

def getIP(s, blocked_ips=('49.207.46.6', '49.207.46.24', '49.207.46.34')):
    """Resolve hostname `s` to an IPv4 address string.

    Parameters
    ----------
    s : str
        Host name to resolve (e.g. 'google.com').
    blocked_ips : iterable of str, optional
        Addresses that the local ISP returns for domains it blocks. A lookup
        that resolves to one of these is treated as "blocked". The default
        holds the three addresses observed on the ISP used for this study.

    Returns
    -------
    str or False
        The resolved IP, or False when the site is blocked or the lookup
        fails. A message is printed in both failure cases.
    """
    try:
        IP = socket.gethostbyname(s)
    except Exception:
        # Best effort: any lookup failure marks the site as unreachable.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
        # propagate so a long batch of lookups can still be interrupted.
        print("Error accessing site "+s)
        return False

    # ISP blocked domains resolve to a fixed set of ISP-owned addresses
    if IP in blocked_ips:
        print("Blocked site "+s)
        return False
    return IP

In [4]:
# Resolve every site; getIP stores False for blocked or unresolvable hosts.
df_sites['IP'] = df_sites['site'].apply(getIP)

Blocked site pornhub.com
Blocked site ok.ru
Blocked site livejasmin.com
Blocked site xvideos.com
Error accessing site googleusercontent.com
Blocked site xhamster.com
Error accessing site exosrv.com
Blocked site xnxx.com
Blocked site chaturbate.com
Blocked site yts.am
Blocked site youporn.com
Blocked site 1337x.to
Error accessing site cloudfront.net
Blocked site redtube.com
Blocked site rutracker.org
Error accessing site banvenez.com
Error accessing site bp.blogspot.com
Error accessing site exdynsrv.com
Blocked site sex.com
Error accessing site wixsite.com


In [5]:
import pyasn

# Load the offline IP -> ASN database used by findASN below.
asndb = pyasn.pyasn('output/ipasn_20181212.dat')  #downloaded pyasn_util_download.py --latest
# Sanity check on a well-known address; the recorded output is an (ASN, prefix) tuple.
print(asndb.lookup('8.8.8.8'))

def findASN(ip):
    """Look up the AS number announcing `ip` in the module-level `asndb`.

    Falsy input (the False that getIP stores for blocked/unreachable sites)
    is passed through as False without touching the database. Otherwise the
    first element of `asndb.lookup(ip)` is returned.
    """
    if not ip:
        return False
    match = asndb.lookup(ip)
    return match[0]

(15169, '8.8.8.0/24')


In [6]:
# Map each resolved IP to its ASN; rows without an IP get False (see findASN).
df_sites['ASN'] = df_sites['IP'].apply(findASN)

In [7]:
# Preview the first rows to check the new IP and ASN columns.
df_sites.head()

Unnamed: 0,rank,site,IP,ASN
0,1,google.com,172.217.160.238,15169
1,2,youtube.com,172.217.31.14,15169
2,3,facebook.com,157.240.25.35,32934
3,4,baidu.com,123.125.115.110,4808
4,5,wikipedia.org,103.102.166.224,14907


### Separate unblocked IPs and sites for analysis
- 20 of 500 sites are unreachable or blocked by the ISP
- Querying these sites and urls returns an IP at the edge of the ISP we're connected to
- Most of these sites are porn related or known for adware/malware

In [8]:
# getIP stores False in 'IP' for sites that were blocked or could not be resolved.
ip_missing = df_sites['IP'] == False
df_valid = df_sites[~ip_missing]

print("Number of valid sites for further analysis: %s\n" % (len(df_valid)))

# Note: this list contains both ISP-blocked sites and sites whose lookup failed.
print("List of blocked sites: %s" % df_sites.loc[ip_missing, 'site'].tolist())

Number of valid sites for further analysis: 480

List of blocked sites: ['pornhub.com', 'ok.ru', 'livejasmin.com', 'xvideos.com', 'googleusercontent.com', 'xhamster.com', 'exosrv.com', 'xnxx.com', 'chaturbate.com', 'yts.am', 'youporn.com', '1337x.to', 'cloudfront.net', 'redtube.com', 'rutracker.org', 'banvenez.com', 'bp.blogspot.com', 'exdynsrv.com', 'sex.com', 'wixsite.com']


In [9]:
# Persist the full table (blocked/unreachable rows included) for later sections.
df_sites.to_pickle('output/df_sites.pkl')

# Save the site -> IP dictionary for valid sites only.
site_to_IP = (df_valid.set_index('site')['IP']).to_dict()
# Use a context manager so the file is flushed and closed deterministically.
with open('output/site_to_IP.json', 'w') as f:
    json.dump(site_to_IP, f)

## 2. Find CDN
- load data

In [6]:
# Reload the table pickled in section 1 so this section can start from saved data.
# The pickle is produced by this notebook; do not point this at untrusted files.
df_sites = pd.read_pickle('output/df_sites.pkl')

### Estimate CDN
- CDNnames
- CDNdomains
- compare site to known CDN domains
- compare most popular url that hosts objects on site homepage to known CDN domains
- compare organization name from whois to CDN names
- compare email address from whois to CDN domains

In [27]:
from CDNdomains import cdn_domains, cdn_names
from find_cdn_methods import find_cdn_by_site, find_cdn_by_url, find_cdn_by_counting_url, find_cdn_by_whois_org

#copy CDNnames to CDNdomains.py
#with open('CDNnames.csv', 'r') as f:
#    CDNnames = f.read().splitlines()
    
# Reload the site -> IP mapping written in section 1; close the file when done.
with open('output/site_to_IP.json', 'r') as f:
    site_to_IP = json.load(f)

In [28]:
#import logging
#logging.basicConfig(filename='output/log_cdn_finder.log', level=logging.DEBUG, 
#                    format='%(asctime)s - %(levelname)s - %(message)s')
#logging.debug('This is a log message.')

In [29]:
# split find cdn into 2 functions
def cdn_parse_site(site):
    """Guess a site's CDN from its name and the URLs on its home page.

    Tries find_cdn_by_site, find_cdn_by_url and find_cdn_by_counting_url in
    that order and returns the first truthy result. Returns False when every
    matcher yields a falsy value (False or "").
    """
    matchers = (find_cdn_by_site, find_cdn_by_url, find_cdn_by_counting_url)
    for matcher in matchers:
        hit = matcher(site)
        if hit:
            # Stop at the first match; later matchers are not called.
            return hit
    return False

def cdn_parse_whois(site):
    """Guess a site's CDN from the whois organisation lookup.

    Returns whatever find_cdn_by_whois_org reports when it is truthy, and
    False otherwise (False and "" are both normalised to False).
    """
    org_match = find_cdn_by_whois_org(site)
    # A find_cdn_by_whois_site fallback is currently disabled; only the
    # organisation-based lookup is used.
    return org_match if org_match else False

def find_cdn(cdn_parsed, cdn_whois):
    """Pick the final CDN label from the page-parse and whois candidates.

    The whois answer wins when it is one of a few providers listed below;
    in every other case the page-parse result is preferred and whois is
    only the fallback.
    """
    # google cloud customers have 'Google LLC' in whois
    # NOTE(review): confirm the whois helper really returns the label
    # 'Google LLC' (and not a shortened 'Google'), otherwise this entry never matches.
    whois_wins = ('Fastly', 'Cloudflare', 'Akamai', 'Alibaba', 'Google LLC')
    if cdn_whois in whois_wins:
        return cdn_whois
    # Default priority: parsed result first, whois second.
    return cdn_parsed or cdn_whois

In [19]:
# Map every valid site to its most probable CDN label (False when none found).
probable_cdn = {}

for site, IP in site_to_IP.items():
    # find_cdn expects the two candidate labels (page-parse, whois), not the
    # site name; passing only `site` raises TypeError with the current signature.
    cdn_parsed = cdn_parse_site(site)
    cdn_whois = cdn_parse_whois(site)
    probable_cdn[site] = find_cdn(cdn_parsed, cdn_whois)

    print(site, IP, probable_cdn[site])

google.com 172.217.160.238 Google
youtube.com 172.217.31.14 Google
facebook.com 157.240.25.35 Facebook
	baidu.com:  | 15: [('', 15)]
	whois baidu.com | [] | 
baidu.com 123.125.115.110 False
	wikipedia.org:  | 307: [('', 306), ('Google', 1)]
	whois wikipedia.org | ['ORG-WFI1-AP', 'Wikimedia Foundation, Inc.'] | 
wikipedia.org 103.102.166.224 False
	qq.com:  | 67: [('', 67)]
	whois qq.com | [] | 
qq.com 111.161.64.48 False
yahoo.com 72.30.35.10 Yahoo
	amazon.com: Amazon AWS | 104: [('Amazon AWS', 71), ('', 33)]
	whois amazon.com | [] | 
amazon.com 176.32.98.166 Amazon AWS
taobao.com 140.205.94.189 Taobao
	reddit.com:  | 29: [('', 29)]
	whois reddit.com | ['Fastly (SKYCA-3)', 'Fastly'] |  cdn Fastly
reddit.com 151.101.129.140 Fastly
	tmall.com: Alibaba | 71: [('', 59), ('Alibaba', 12)]
	whois tmall.com | [] | 
tmall.com 140.205.130.99 Alibaba
google.co.in 172.217.166.195 Google
	twitter.com:  | 92: [('', 81), ('Twitter', 11)]
	whois twitter.com | ['Twitter Inc. (TWITT)', 'Twitter Inc.'] |

	espn.com: espncdn.com | 209: [('espncdn.com', 105), ('', 96), ('cdn.espn.com', 2), ('Facebook', 2), ('Google', 2), ('cdn.optimizely.com', 1), ('cdn.registerdisney.go.com', 1)]
	whois espn.com | ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] |  cdn Amazon Technologies Inc.
espn.com 54.76.191.85 espncdn.com
	amazon.in: Amazon AWS | 98: [('Amazon AWS', 75), ('', 23)]
	whois amazon.in | ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] |  cdn Amazon Technologies Inc.
amazon.in 52.95.120.67 Amazon AWS
	wikia.com: Fastly | 383: [('', 340), ('Fastly', 22), ('Facebook', 13), ('Google', 8)]
	whois wikia.com | ['Fastly (SKYCA-3)', 'Fastly'] |  cdn Fastly
wikia.com 151.101.64.194 Fastly
	detail.tmall.com: Alibaba | 10: [('Alibaba', 7), ('', 3)]
	whois detail.tmall.com | [] | 
detail.tmall.com 121.42.17.239 Alibaba
	xinhuanet.com:  | 423: [('', 423)]
	whois xinhuanet.com | [] | 
xinhuanet.com 202.108.119.193 False
	quora.com: quoracdn.net | 28: [('', 24), ('qu

	walmart.com:  | 217: [('', 210), ('Google', 4), ('Facebook', 2), ('Twitter', 1)]
	whois walmart.com | ['Wal-Mart Stores, Inc. (WALMAR-Z)', 'Wal-Mart Stores, Inc.', 'Wal-Mart Stores, Inc. (WALMAR)'] | 
walmart.com 161.170.239.170 False
	dailymotion.com: dmcdn.net | 61: [('dmcdn.net', 31), ('', 29), ('Google', 1)]
	whois dailymotion.com | ['ORG-DS36-RIPE', 'Dailymotion S.A.', 'ORG-DM5-RIPE', 'Dailymotion'] | 
dailymotion.com 195.8.215.136 dmcdn.net
	nicovideo.jp: cdn.nimg.jp | 247: [('', 229), ('cdn.nimg.jp', 15), ('secure-dcdn.cdn.nimg.jp', 2), ('Facebook', 1)]
	whois nicovideo.jp | [] | 
nicovideo.jp 202.248.110.184 cdn.nimg.jp
	indeed.com:  | 14: [('', 13), ('Facebook', 1)]
	whois indeed.com | ['RIPE Network Coordination Centre (RIPE)', 'RIPE Network Coordination Centre'] | 
indeed.com 169.44.162.72 False
	craigslist.org:  | 256: [('', 255), ('Google', 1)]
	whois craigslist.org | ['Craigslist, Inc. (CRAIGS-5)', 'Craigslist, Inc.'] | 
craigslist.org 208.82.237.226 False
	cobalten.com:

	cnet.com:  | 110: [('', 107), ('Facebook', 2), ('Google', 1)]
	whois cnet.com | ['Sportsline.com (SPTL)', 'Sportsline.com'] | 
cnet.com 64.30.228.118 False
	patria.org.ve: cdn.patria.org.ve | 95: [('cdn.patria.org.ve', 52), ('', 33), ('Google', 9), ('Facebook', 1)]
	whois patria.org.ve | [] | 
patria.org.ve 190.205.112.68 cdn.patria.org.ve
	indiatimes.com:  | 396: [('', 389), ('Facebook', 6), ('cdn.onthe.io', 1)]
	whois indiatimes.com | [] | 
indiatimes.com 223.165.27.146 False
	tvbs.com.tw:  | 177: [('', 139), ('Facebook', 22), ('Google', 15), ('Amazon CloudFront', 1)]
	whois tvbs.com.tw | ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] |  cdn Amazon Technologies Inc.
tvbs.com.tw 13.35.130.97 Amazon Technologies Inc.
	jianshu.com: cdn2.jianshu.io | 55: [('', 32), ('cdn2.jianshu.io', 23)]
	whois jianshu.com | ['Zenlayer Inc (ZENLA-7)', 'Zenlayer Inc', 'UCloud (UCLOU)', 'UCloud'] | 
jianshu.com 107.150.101.156 cdn2.jianshu.io
	sogou.com: sogoucdn.com | 55: [('', 49),

doubleclick.net 172.217.166.206 Google
	reverso.net: cdn.reverso.net | 120: [('', 100), ('cdn.reverso.net', 17), ('Facebook', 3)]
	whois reverso.net | [] | 
reverso.net 89.107.171.183 cdn.reverso.net
	mega.nz:  | 17: [('', 16), ('Google', 1)]
	whois mega.nz | [] | 
mega.nz 31.216.148.10 False
	ltn.com.tw:  | 582: [('', 574), ('Google', 3), ('Facebook', 2), ('cdn.aralego.net', 1), ('Amazon CloudFront', 1), ('Cloudflare', 1)]
	whois ltn.com.tw | [] | 
ltn.com.tw 218.211.33.75 False
	yelp.com: yelpcdn.com | 123: [('', 73), ('yelpcdn.com', 44), ('Cloudflare', 4), ('Google', 2)]
	whois yelp.com | ['Fastly (SKYCA-3)', 'Fastly'] |  cdn Fastly
yelp.com 151.101.76.116 Fastly
	rednet.cn:  | 11: [('', 11)]
	whois rednet.cn | [] | 
rednet.cn 115.159.159.215 False
	ci123.com:  | 883: [('', 883)]
	whois ci123.com | [] | 
ci123.com 202.102.90.109 False
	ups.com:  | 294: [('', 293), ('tiqcdn.com', 1)]
	whois ups.com | ['UNITED PARCEL SERVICE (UPS-9)', 'UNITED PARCEL SERVICE'] | 
ups.com 153.2.224.50 F

	kinopoisk.ru:  | 12: [('', 12)]
	whois kinopoisk.ru | [] | 
kinopoisk.ru 213.180.193.105 False
	doublepimp.com: aspnetcdn.com | 9: [('', 5), ('aspnetcdn.com', 3), ('Google', 1)]
	whois doublepimp.com | ['NV Next LLC (NNL-32)'] | 
doublepimp.com 69.89.69.121 aspnetcdn.com
	rt.com: cdni.rt.com | 74: [('', 52), ('cdni.rt.com', 12), ('Google', 6), ('Facebook', 4)]
	whois rt.com | ['ORG-KLAL1-AP', 'Kaspersky Labs Asia Ltd'] |  cdn SKY
rt.com 103.5.149.90 cdni.rt.com
	ifeng.com:  | 1427: [('', 1427)]
	whois ifeng.com | [] | 
ifeng.com 123.103.122.24 False
	orange.fr:  | 126: [('', 125), ('tiqcdn.com', 1)]
	whois orange.fr | [] | 
orange.fr 193.252.133.34 False
	redd.it:  | 22: [('', 21), ('Google', 1)]
	whois redd.it | ['Fastly (SKYCA-3)', 'Fastly'] |  cdn Fastly
redd.it 151.101.193.140 Fastly
	zoho.com:  | 44: [('', 44)]
	whois zoho.com | ['Level 3 Parent, LLC (LPL-141)', 'Level 3 Parent, LLC', 'ZOHO (ZOHOC)', 'ZOHO'] |  cdn Level 3
zoho.com 8.40.222.155 Level 3
	whois nextoptim.com | ['Go

	aol.com: aolcdn.com | 406: [('', 331), ('aolcdn.com', 69), ('Yahoo', 3), ('Google', 2), ('Facebook', 1)]
	whois aol.com | ['Yahoo! (YAOO)', 'Yahoo!'] |  cdn Yahoo
aol.com 106.10.218.150 aolcdn.com
	outbrain.com:  | 62: [('', 60), ('Facebook', 1), ('Google', 1)]
	whois outbrain.com | ['Internap Corporation (IC-1425)', 'Internap Corporation'] |  cdn Internap
outbrain.com 64.74.232.52 Internap
	kakaku.com:  | 22: [('', 22)]
	whois kakaku.com | [] | 
kakaku.com 210.129.151.129 False
	youku.com: Alibaba | 883: [('', 868), ('Alibaba', 15)]
	whois youku.com | [] | 
youku.com 106.11.186.25 Alibaba
	evernote.com:  | 73: [('', 70), ('cdn.optimizely.com', 1), ('Facebook', 1), ('Google', 1)]
	whois evernote.com | ['Google LLC (GOOGL-2)', 'Google LLC'] |  cdn Google
evernote.com 35.190.29.187 Google
	behance.net:  | 1: [('', 1)]
	whois behance.net | ['Fastly (SKYCA-3)', 'Fastly'] |  cdn Fastly
behance.net 151.101.129.197 Fastly
	17ok.com:  | 360: [('', 360)]
	whois 17ok.com | [] | 
17ok.com 119.25

	idntimes.com: cdn.idntimes.com | 542: [('', 441), ('cdn.idntimes.com', 95), ('Facebook', 3), ('Google', 2), ('Amazon CloudFront', 1)]
	whois idntimes.com | [] | 
idntimes.com 103.5.51.123 cdn.idntimes.com
	files.wordpress.com: WordPress | 39: [('', 23), ('WordPress', 13), ('Google', 2), ('Facebook', 1)]
	whois files.wordpress.com | ['Automattic, Inc (AUTOM-93)', 'Automattic, Inc'] | 
files.wordpress.com 192.0.72.3 WordPress
	fiverr.com:  | 9: [('', 8), ('Google', 1)]
	whois fiverr.com | ['Cloudflare, Inc. (CLOUD14)', 'Cloudflare, Inc.'] |  cdn Cloudflare
fiverr.com 104.16.52.215 Cloudflare
	dkn.tv:  | 593: [('', 585), ('Google', 5), ('Facebook', 3)]
	whois dkn.tv | ['Cloudflare, Inc. (CLOUD14)', 'Cloudflare, Inc.'] |  cdn Cloudflare
dkn.tv 104.17.197.68 Cloudflare
	wix.com:  | 132: [('', 124), ('Google', 5), ('Facebook', 2), ('cdn.trackjs.com', 1)]
	whois wix.com | [] | 
wix.com 185.230.60.164 False
	siteadvisor.com:  | 8: [('', 8)]
	whois siteadvisor.com | ['McAfee, Inc. (MCAFE-2)', 

	yandex.kz:  | 138: [('', 138)]
	whois yandex.kz | [] | 
yandex.kz 5.255.255.5 False
	seasonvar.ru: cdn.seasonvar.ru | 78: [('', 71), ('cdn.seasonvar.ru', 7)]
	whois seasonvar.ru | [] | 
seasonvar.ru 178.236.137.57 cdn.seasonvar.ru
	ouedkniss.com:  | 151: [('', 148), ('Google', 2), ('Amazon CloudFront', 1)]
	whois ouedkniss.com | [] | 
ouedkniss.com 37.187.137.67 False
	goo.ne.jp:  | 386: [('', 381), ('adcdn.goo.ne.jp', 2), ('Google', 2), ('Facebook', 1)]
	whois goo.ne.jp | [] | 
goo.ne.jp 153.254.147.65 False
	marca.com: uecdn.es | 56: [('', 33), ('uecdn.es', 22), ('Google', 1)]
	whois marca.com | ['ORG-UE2-RIPE', 'ORG-GATI1-RIPE', 'Unidad Editorial S.A.'] | 
marca.com 193.110.128.82 uecdn.es
	smallpdf.com:  | 40: [('', 38), ('Amazon CloudFront', 1), ('Google', 1)]
	whois smallpdf.com | ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] |  cdn Amazon Technologies Inc.
smallpdf.com 143.204.254.204 Amazon Technologies Inc.
	bitly.com:  | 26: [('', 21), ('Amazon CloudFron

In [35]:
# Display the site -> CDN mapping as a Series for easier scanning.
pd.Series(probable_cdn)

google.com                            Google
youtube.com                           Google
facebook.com                        Facebook
baidu.com                              False
wikipedia.org                          False
qq.com                                 False
yahoo.com                              Yahoo
amazon.com                        Amazon AWS
taobao.com                            Taobao
reddit.com                            Fastly
tmall.com                            Alibaba
google.co.in                          Google
twitter.com                          Twitter
live.com                           Microsoft
sohu.com                                Sohu
jd.com                                 False
yandex.ru                              False
google.co.jp                          Google
instagram.com                       Facebook
weibo.com                              False
sina.com.cn                            False
360.cn                                 False
login.tmal

# NOTES

- Cotendo is part of Akamai but we haven't replaced it
- netflix content actually uses Open Connect, but the whois itself has all IPs on Amazon.
- instructure.com probably not on CDN, but whois detects Amazon
- nih.gov is on Amazon AWS even though all links are in fact local AS16509
- nextoptim.com detected as Google since we are redirected to google's home page but is actually a redirect virus
- homedepot.com 35.201.95.83 whois shows google cloud customer but might be Akamai's old customer (like apple?)
- airbnb.com should be using Fastly, but whois shows only Amazon, so detected as Amazon
- bitly.com uses amazon cloudfront probably but whois doesn't show. should have been detected in site url count.
- alipay.com uses alicdn.com but we are unable to grab urls as they are deep inside css style div backgroundimage strings
- office365.com is auth page not on CDN, but once logged in all data will be served by akamai. should this be included? added to CDN_domains.py
- twitch.tv will give twitchcdn.net if page parse first and fastly if whois first. twitchcdn.net belongs to fastly
- office.com will give Microsoft on whois and Akamai on pageparse
- cdn whois Amazon Technologies Inc. is in fact an Amazon EC2 instance hosting the webpage. Amazon EC2 may not use Cloudfront to host content files, though it is recommended if you want a lot of content delivery
- <font color='red'>For analysis "Amazon Technologies Inc.", "Amazon AWS" and "Amazon Cloudfront", "Cloudfront" should all resolve to "Cloudfront"</font>
- whois site in fact doesn't get used at all
- roblox.com: rbxcdn.com is actually Cogeco Peer 1 CDN network that was not in the list
- spotify has moved to google cloud

### TODO
- swap whois first and url count second to see results
- try to remove whois org2
- consider matching_cdn1 from parser and matching_cdn2 from whois - then select best option

# NOTES2: Blocked, redirects, and unreachable
- Blocked lookups at ISP
    - pornhub.com, livejasmin.com, xvideos.com, xhamster.com, xnxx.com, chaturbate.com, youporn.com, redtube.com, sex.com, 
    - yts.am : movie torrents
    - ok.ru
    - 1337x.to : movie torrents
    - rutracker.org : torrents
- Blocked malware (403 Forbidden)
    - exosrv.com : malware/adware
    - swiftviz.net : virus redirect/malware
    - i62e2b4mfy.com : malware/adware
    - vinfdv6b4j.com : virus redirect
    - s9kkremkr0.com : virus redirect
    - omumultation.club : virus popups
    - nextoptim.com : adware redirect popups
    - resentaticexhaus.info : possible malware, IP timeouts on ping, not 403
    - exdynsrv.com : adware redirect/malware, not 403
    - hotchedmothe.club: malware popup ads
    - digitaldsp.com : redirect virus, not 403
    - jf71qh5v14.com : virus/adware
    -
- No lookup or redirect
    - banvenez.com > e-bdvcpx.banvenez.com exists but can't be reached automatically
    
    
- Unreachable servers (possible CDNs)
    - microsoftonline.com : port 80 and 443 are probably closed
    - googleusercontent.com :  port 80 and 443 are probably closed
    - twimg.com : Twitter CDN by Twitter (twimg.com) blocked on port 80 and 443
    - cloudfront.net > hosted on Amazon aws as part of Amazon Cloudfront CDN. Blocked port 80 and 443. No DNS lookup.
    - bp.blogspot.com > hosts resources for blogspot. Blocked 80 443. Probably not CDN just a simple direct server.
        - https://2-bp.blogspot.com/2017/11/what-are-bpblogspotcom-links-and-how.html 
        
    
- Redirects
    - wikia.com > wikia.com/fandom
    - zhihu.com > zhihu.com/signup
    - fandom.com > fandom.wikia.com
        - contains fastly-insights.com widget scripts
    - rednet.cn > can't really reach this page on browser either
        - downloads a 200 response page saying redirect
     - ci123.com > same as above
         - 200 response redirect page that is very slow on browser was manually downloaded to ci123.com.html
- No curl (timeouts with curl not with requests)
    - usps.com
    - bestbuy.com
    - udemy.com > robot check
    - momoshop.com.tw > different source when viewed online vs by curl or requests
        - 1 small asia.creativecdn.com in script tag on end of page, most sources on momoshop.com.tw
        - manually download main page from browser as momoshop.com.tw.html
    - leboncoin.fr > blocked 403 when using requests/curl but available on browser
        - manually download main page from browser as leboncoin.fr.html
    - kissanime.ru > Please wait 5 seconds... doesn't work with minibrowsers
        - manually download main page from browser as kissanime.ru.html
    - roblox.com > gaming site that pre-checks for cookies and scripts and reports "not working" while still returning a 200 status code
        - manually download main page from browser as roblox.com.html
- Others
    - jianshu.com > cdn2.jianshu.com in < link href >
    - shopify.com > cdn.shopify.com in < link href >
    - xfinity.com > cdn.comcast.com
    - investing.com  > link href akamaized.net
    - discordapp.com > has a field CDN_HOST: 'cdn.discordapp.com' but not in static sources
    - bodelen.com > undetected redirect adware virus shows google page
    - netflix.com > video content is on Open-Connect CDN however homepage doesn't load any video content

In [114]:
#site_to_IP = df_valid.set_index('site')['IP'].to_dict()

# after parsing whois info from CDN_BGP_Analysis
#sites_org = pd.read_pickle('output/df_org_email_info.pkl')

#sites_org = df_valid['IP'].apply(lambda s: pd.Series( findOrg(s) ) )
#sites_cdn = df_valid.merge(sites_org, left_index=True, right_index=True )

# 3. Get curl request timings