In [4]:
from __future__ import division
%pylab inline
import pandas as pd
import subprocess
import os, sys, re
from collections import defaultdict
#import matplotlib
#matplotlib.use('Agg')
#%matplotlib nbagg
#import matplotlib.pyplot as plt

Populating the interactive namespace from numpy and matplotlib


# Challenge

Write a Python program (OSX or Linux preferred) that queries to the top 500 Alexa domains/sites:
- http://www.alexa.com/topsites
- https://www.dropbox.com/s/pqsimknj77ywqbn/top-1m.csv.tar.gz?dl=0

### CDN analysis

- Filter the domains served by a CDN (e.g., Akamai)
- Rank CDN providers by number of sites
- Calculate the average response time for the index page of each site (aka Time To First Byte) per CDN provider, and rank them by speed (separate DNS resolution, TCP connection, SSL negotiation and receive time ideally);

### BGP analysis

- Determine the ASN that each of the 500 sites maps to (based on hosting IP address)
- Rank ASNs by number of sites

In [3]:
def getCDF(data):
    xdata = np.sort(data)
    ydata = [i/len(xdata) for i in range(len(xdata))]
    return xdata, ydata

# CDN Analysis

## Load Data
load top 500 alexa websites from csv

In [236]:
#sites = pd.read_csv('top-1m.csv', nrows=500, header = None, names = ['rank', 'url'])
sites = pd.read_csv('top-1m-new.csv', nrows=500, header = None, names = ['rank', 'site'])

## Get CDN for website
- using whois on IP address (via nslookup for domain)
- extract fields from DNS headers
- checking IP from multiple server locations
- downloading all objects on the website homepage

In [5]:
site = sites['site'][4]
site

'wikipedia.org'

### get IP address using host -t a <site\>

In [6]:
def getIP(site):
    # use host command to get IP for site (domain url netloc)
    # use timeout of 5.0 s for sites that are firewalled and can't be looked up
    try:
        out = subprocess.check_output(['host', '-t', 'a', site], stderr=subprocess.STDOUT, timeout=5.0).decode('UTF-8')
        # set site IP as first IP address returned by host command - using split is more efficient than regex
        site_IP = out.split('\n')[0].split(" ")[-1]
    except:
        print('host '+site+' process ran too long')
        site_IP = ''
    return site_IP

In [7]:
getIP('google.com')

'172.217.167.46'

In [8]:
sites['IP_addr'] = sites['site'].apply(getIP)

host pornhub.com process ran too long
host ok.ru process ran too long
host livejasmin.com process ran too long
host xvideos.com process ran too long
host xhamster.com process ran too long
host xnxx.com process ran too long
host chaturbate.com process ran too long
host yts.am process ran too long
host youporn.com process ran too long
host 1337x.to process ran too long
host redtube.com process ran too long
host rutracker.org process ran too long
host sex.com process ran too long


### ensure IP is valid 

In [9]:
import ipaddress

def validateIP(site_IP):
    try:
        ipaddress.ip_address(site_IP)
        return True
    except:
        return False

In [10]:
sites['isValidIP'] = sites['IP_addr'].apply(validateIP)

### find organization and email servers using whois <IPADDR\>

In [18]:
import subprocess

def findOrg(site_IP):
    # use whois <IPADDR> to find and parse the organization and email server of site_IP
    # return dict {Org, OrgName, Email}
    
    Org = []
    OrgName = []
    EmailServer = []
    exclude_orgs = ['AFRINIC','APNIC','ARIN','LACNIC','IANA', 'Asia Pacific Network Information Centre']
    
    try:
        out2 = subprocess.check_output(['whois', site_IP], stderr=subprocess.STDOUT, timeout=5.0).decode('UTF-8')

        for line in out2.split('\n'):
            lower_line = line.lower()

            if 'organisation:' in lower_line or 'org:' in lower_line or 'organization:' in lower_line:
                if not any([excl_org in line for excl_org in exclude_orgs]):
                    Org.append(line.split(':')[1].strip())

            if 'org-name:' in lower_line or 'orgname:' in lower_line:
                OrgName.append(line.split(':')[1].strip())

            if 'email:' in lower_line or 'e-mail:' in lower_line:
                EmailServer.append(line.split('@')[1].strip())
                #print(line)
    
    except:
        print('whois '+site_IP+' process error/ran too long')
        
    #output = {'Org': list(set(Org))[0],
    #        'OrgName': list(set(OrgName))[0],
    #        'EmailServer': list(set(EmailServer))[0] }
    output = {'Org': list(set(Org)),
            'OrgName': list(set(OrgName)),
            'EmailServer': list(set(EmailServer)) }
    
    return output

In [19]:
findOrg('8.8.8.8')

{'Org': ['Level 3 Parent, LLC (LPL-141)', 'Google LLC (GOGL)'],
 'OrgName': ['Level 3 Parent, LLC', 'Google LLC'],
 'EmailServer': ['level3.com', 'google.com']}

NameError: name 'site_IP' is not defined

### separate valid IP websites and find organizations for these

In [17]:
sites_valid = sites[ sites['isValidIP'] ]
#sites_valid

In [18]:
sites_org = sites_valid['IP_addr'].apply(lambda s: pd.Series( findOrg(s) ) )

whois 186.192.90.5 process error/ran too long
whois 212.129.33.171 process error/ran too long
whois 82.165.229.87 process error/ran too long


In [19]:
sites_cdn = sites_valid.merge(sites_org, left_index=True, right_index=True )

In [35]:
sites_cdn.head(10)

Unnamed: 0,rank,url,IP_addr,isValidIP,Org,OrgName,EmailServer
0,1,google.com,172.217.167.46,True,[Google LLC (GOGL)],[Google LLC],[google.com]
1,2,youtube.com,172.217.167.46,True,[Google LLC (GOGL)],[Google LLC],[google.com]
2,3,facebook.com,157.240.25.35,True,"[Facebook, Inc. (THEFA-3)]","[Facebook, Inc.]",[fb.com]
3,4,baidu.com,220.181.57.216,True,[],[],"[bjtelecom.net, ns.chinanet.cn.net]"
4,5,wikipedia.org,103.102.166.224,True,[ORG-WFI1-AP],"[Wikimedia Foundation, Inc.]",[wikimedia.org]
5,6,qq.com,111.161.64.48,True,[],[],[chinaunicom.cn]
6,7,yahoo.com,98.137.246.7,True,[Yahoo! Inc. (YHOO)],[Yahoo! Inc.],"[cc.yahoo-inc.com, yahoo-inc.com]"
7,8,amazon.com,205.251.242.103,True,"[Amazon.com, Inc. (AMAZON-4)]","[Amazon.com, Inc.]","[amazonaws.com, amazon.com]"
8,9,taobao.com,140.205.94.189,True,[],[Asia Pacific Network Information Centre],"[apnic.net, cnnic.cn, alibaba-inc.com, list.al..."
9,10,reddit.com,151.101.1.140,True,"[Fastly (SKYCA-3), Administered by RIPE NCC]",[Fastly],[fastly.com]


### check organization, orgname, email against list of well known CDN names

In [4]:
# cdn names from wikipedia, www.cdnlist.com

CDN_names = sorted( list( pd.read_csv('CDNnames.csv', sep = '\n', header = None, names = ['CDN'])['CDN'].unique() ))

In [5]:
#TODO
#check org with cdnnames

In [10]:
with open('CDNnames.csv', 'r') as f:
    CDN_names = f.readline().strip()
CDN_names

"'AAPT',"

## Rank number of websites per CDN
- sort dataframe
- bar chart

# SAVE whois data

In [24]:
import socket
import subprocess

def getIP(s):
    try:
        IP = socket.gethostbyname(s)
        # blocked domains return IP 49.207.46.6, 49.207.46.24, 49.207.46.34 from ISP
        if IP in ['49.207.46.6', '49.207.46.24', '49.207.46.34']:
            print("Invalid site "+s)
            return False
        else:
            return IP
    except:
        print("Error processing site "+s)
        return False

df_sites = pd.read_csv('top-1m-new.csv', nrows=500, header = None, names = ['rank', 'site'])
df_sites['IP'] = df_sites['site'].apply(getIP)

Invalid site pornhub.com
Invalid site ok.ru
Invalid site livejasmin.com
Invalid site xvideos.com
Error processing site googleusercontent.com
Invalid site xhamster.com
Error processing site exosrv.com
Invalid site xnxx.com
Invalid site chaturbate.com
Invalid site yts.am
Invalid site youporn.com
Invalid site 1337x.to
Error processing site cloudfront.net
Invalid site redtube.com
Invalid site rutracker.org
Error processing site banvenez.com
Error processing site bp.blogspot.com
Error processing site exdynsrv.com
Invalid site sex.com
Error processing site wixsite.com


In [26]:
df_valid = df_sites[df_sites['IP'] != False]
len(df_valid)

480

In [46]:
site_to_IP = df_valid.set_index('site')['IP'].to_dict()

In [60]:
def saveWhoIs(site, IP):
    try:
        out2 = subprocess.check_output(['whois', IP], stderr=subprocess.STDOUT,
                                           timeout=10.0).decode('UTF-8', 'ignore')
        fout = open('output/whois/'+site, 'w')
        fout.write(out2)
        fout.close()
        print ("done whois for site %s IP %s" % (site, IP))
    except:
        print("error running whois for site %s IP %s" % (site, IP))
    return

In [56]:
for k,v in site_to_IP.items():
    saveWhoIs(k,k)

done whois for site google.com IP google.com
done whois for site youtube.com IP youtube.com
done whois for site facebook.com IP facebook.com
done whois for site baidu.com IP baidu.com
done whois for site wikipedia.org IP wikipedia.org
done whois for site qq.com IP qq.com
done whois for site yahoo.com IP yahoo.com
done whois for site amazon.com IP amazon.com
done whois for site taobao.com IP taobao.com
done whois for site reddit.com IP reddit.com
done whois for site tmall.com IP tmall.com
done whois for site google.co.in IP google.co.in
done whois for site twitter.com IP twitter.com
done whois for site live.com IP live.com
done whois for site sohu.com IP sohu.com
done whois for site jd.com IP jd.com
done whois for site yandex.ru IP yandex.ru
done whois for site google.co.jp IP google.co.jp
done whois for site instagram.com IP instagram.com
done whois for site weibo.com IP weibo.com
done whois for site sina.com.cn IP sina.com.cn
done whois for site 360.cn IP 360.cn
done whois for site lo

done whois for site china.com.cn IP china.com.cn
done whois for site google.nl IP google.nl
error running whois for site google.com.pe IP google.com.pe
done whois for site people.com.cn IP people.com.cn
done whois for site google.com.ph IP google.com.ph
done whois for site freepik.com IP freepik.com
done whois for site aliyun.com IP aliyun.com
done whois for site discordapp.com IP discordapp.com
done whois for site shutterstock.com IP shutterstock.com
done whois for site i62e2b4mfy.com IP i62e2b4mfy.com
done whois for site canva.com IP canva.com
done whois for site douban.com IP douban.com
done whois for site scribd.com IP scribd.com
done whois for site cnet.com IP cnet.com
done whois for site patria.org.ve IP patria.org.ve
done whois for site indiatimes.com IP indiatimes.com
done whois for site tvbs.com.tw IP tvbs.com.tw
done whois for site jianshu.com IP jianshu.com
done whois for site sogou.com IP sogou.com
done whois for site softonic.com IP softonic.com
done whois for site google.

done whois for site hola.com IP hola.com
done whois for site yao.tmall.com IP yao.tmall.com
done whois for site youdao.com IP youdao.com
done whois for site elpais.com IP elpais.com
done whois for site hotstar.com IP hotstar.com
done whois for site sourceforge.net IP sourceforge.net
done whois for site wordpress.org IP wordpress.org
done whois for site slickdeals.net IP slickdeals.net
done whois for site namu.wiki IP namu.wiki
done whois for site pixabay.com IP pixabay.com
done whois for site mercadolibre.com.mx IP mercadolibre.com.mx
done whois for site kaskus.co.id IP kaskus.co.id
done whois for site googlevideo.com IP googlevideo.com
done whois for site hclips.com IP hclips.com
done whois for site infourok.ru IP infourok.ru
done whois for site irctc.co.in IP irctc.co.in
done whois for site wease.im IP wease.im
done whois for site glassdoor.com IP glassdoor.com
done whois for site homedepot.com IP homedepot.com
done whois for site businessinsider.com IP businessinsider.com
done whois

In [53]:
for k,v in site_to_IP.items():
    saveWhoIs(k,v)

done whois for site google.com IP 216.58.221.46
done whois for site youtube.com IP 172.217.167.46
done whois for site facebook.com IP 157.240.25.35
done whois for site baidu.com IP 123.125.115.110
done whois for site wikipedia.org IP 103.102.166.224
done whois for site qq.com IP 111.161.64.40
done whois for site yahoo.com IP 98.137.246.8
done whois for site amazon.com IP 176.32.103.205
done whois for site taobao.com IP 140.205.220.96
done whois for site reddit.com IP 151.101.1.140
done whois for site tmall.com IP 140.205.130.99
done whois for site google.co.in IP 172.217.161.3
error running whois for site twitter.com IP 104.244.42.1
done whois for site live.com IP 204.79.197.212
done whois for site sohu.com IP 123.125.116.28
done whois for site jd.com IP 120.52.148.118
done whois for site yandex.ru IP 5.255.255.70
done whois for site google.co.jp IP 172.217.166.227
done whois for site instagram.com IP 34.228.147.246
done whois for site weibo.com IP 114.134.80.162
done whois for site si

error running whois for site uol.com.br IP 200.147.67.142
error running whois for site globo.com IP 186.192.90.5
error running whois for site flipkart.com IP 163.53.78.128
error running whois for site wetransfer.com IP 54.77.241.182
error running whois for site swiftviz.net IP 104.18.35.36
error running whois for site mercadolivre.com.br IP 52.84.100.212
error running whois for site google.se IP 172.217.167.3
error running whois for site godaddy.com IP 208.109.192.70
error running whois for site google.gr IP 172.217.160.227
error running whois for site quizlet.com IP 104.16.18.221
error running whois for site sciencedirect.com IP 18.214.105.86
error running whois for site mediafire.com IP 104.19.194.29
error running whois for site google.com.co IP 172.217.160.227
error running whois for site caijing.com.cn IP 124.243.192.30
error running whois for site gearbest.com IP 104.70.96.176
error running whois for site 163.com IP 123.58.180.8
error running whois for site china.com.cn IP 202.130

error running whois for site namnak.com IP 185.49.84.250
error running whois for site yy.com IP 221.228.79.225
error running whois for site hp.com IP 15.72.164.74
error running whois for site google.no IP 172.217.166.195
error running whois for site ladbible.com IP 104.16.4.248
error running whois for site myway.com IP 74.113.233.77
error running whois for site oracle.com IP 137.254.120.50
error running whois for site okta.com IP 23.21.220.245
error running whois for site foxnews.com IP 23.46.60.36
error running whois for site tripadvisor.com IP 192.229.189.15
error running whois for site telegram.org IP 149.154.167.99
error running whois for site abs-cbn.com IP 104.16.117.25
error running whois for site varzesh3.com IP 94.182.163.52
error running whois for site google.co.il IP 172.217.161.3
error running whois for site zoom.us IP 52.202.62.235
error running whois for site academia.edu IP 46.137.210.247
error running whois for site gfycat.com IP 52.84.107.37
error running whois for sit

error running whois for site marca.com IP 193.110.128.82
error running whois for site smallpdf.com IP 52.84.101.221
error running whois for site bitly.com IP 67.199.248.14
error running whois for site nike.com IP 146.197.184.71
error running whois for site bloomberg.com IP 69.191.252.148
error running whois for site cnbc.com IP 144.121.138.34
error running whois for site libero.it IP 213.209.17.209
error running whois for site duba.com IP 119.29.42.130
error running whois for site egy.best IP 104.25.237.22
error running whois for site surveymonkey.com IP 64.191.16.50
error running whois for site playstation.com IP 209.200.152.198
error running whois for site 9gag.com IP 151.101.194.133
error running whois for site google.com.kw IP 172.217.166.227
error running whois for site crptentry.com IP 93.93.53.190
error running whois for site usatoday.com IP 159.54.242.176
error running whois for site theverge.com IP 151.101.1.52
error running whois for site seznam.cz IP 77.75.77.39
error runnin

# Search whois data for Org and Emails

In [89]:
import os

def loadwhoisIP(site):
    file = 'output/whoisIP/'+site
    if os.path.exists(file):
        f = open(file, 'r')
        whoisIP = f.read()
        f.close()
        return whoisIP
    else:
        return None

def loadwhoissite(site):
    file = 'output/whoissite/'+site
    if os.path.exists(file):
        f = open(file, 'r')
        whoissite = f.read()
        f.close()
        return whoissite
    else:
        return None

In [94]:
# search orgs for both whois IP and whois site
# search email only for whois IP

def searchOrg(whoisdata):
    
    Org = []
    OrgKeywords = ['organisation:', 'org:', 'organization:', 'org-name:', 'orgname:']
    exclude_orgs = [e.lower() for e in ['AFRINIC','APNIC','ARIN','LACNIC','IANA',
                'Asia Pacific Network Information Centre', 'Administered by RIPE NCC', 'RIPE NCC', 
                                    'Registration Association', 'VeriSign Global Registry Services'] ]
    if whoisdata is not None:
        for line in whoisdata.split('\n'):
            lower_line = line.lower()

            for keyword in OrgKeywords:
                if keyword in lower_line:
                    if not any([excl_org in lower_line for excl_org in exclude_orgs]):
                        org = line.split(':')[1].strip()
                        if not org in Org:
                            Org.append(org)
    return Org

def searchEmail(whoisdata):
    Email = []
    EmailKeywords = ['@', 'email:', 'mailbox:', 'e-mail:']
    #exclude_emails = [e.lower() for e in ['verisign-grs.com', 'verisigninc.com', 'Registrar'] ]

    if whoisdata is not None:
        for line in whoisdata.split('\n'):
            lower_line = line.lower()

            for keyword in EmailKeywords:
                if keyword in lower_line:
                    email = line.split('@')[1].strip()
                    if not email in Email:
                        Email.append(email)
                    
    return Email

In [95]:
for site,IP in site_to_IP.items():
    
    whois1 = loadwhoisIP(site)
    Org1 = searchOrg(whois1)
    Email = searchEmail(whois1)
    
    whois2 = loadwhoissite(site)
    Org2 = searchOrg(whois2)
    
    print(site, IP, Org1, Org2, Email)

google.com 216.58.221.46 ['Google LLC (GOGL)', 'Google LLC'] ['Google LLC'] ['google.com']
youtube.com 172.217.167.46 ['Google LLC (GOGL)', 'Google LLC'] ['Google LLC'] ['google.com']
facebook.com 157.240.25.35 ['Facebook, Inc. (THEFA-3)', 'Facebook, Inc.'] ['Facebook, Inc.'] ['facebook.com']
baidu.com 123.125.115.110 [] ['Beijing Baidu Netcom Science Technology Co., Ltd.'] ["chinaunicom.cn'", 'baidu.com.cn']
wikipedia.org 103.102.166.224 ['ORG-WFI1-AP', 'Wikimedia Foundation, Inc.'] ['Public Interest Registry (PIR)', 'Afilias', 'Wikimedia Foundation, Inc.'] ["wikimedia.org'", 'wikimedia.org']
qq.com 111.161.64.40 [] ['Shenzhen Tencent Computer Systems CO.,Ltd'] ["chinaunicom.cn'", 'chinaunicom.cn']
yahoo.com 98.137.246.8 ['Yahoo! Inc. (YHOO)', 'Yahoo! Inc.'] ['Oath Inc.'] ['yahoo-inc.com', 'cc.yahoo-inc.com']
amazon.com 176.32.103.205 [] ['Amazon Technologies, Inc.'] ["amazon.com'"]
taobao.com 140.205.220.96 [] ['Zhejiang Taobao Network Limited (浙江淘宝网络有限公司)'] ['apnic.net', "cnnic.cn'"

i62e2b4mfy.com 188.42.139.92 ['ORG-SB387-RIPE', 'Servers.com B.V.'] ['REDACTED FOR PRIVACY'] ["servers.com'", 'servers.com"']
canva.com 104.16.81.22 ['Cloudflare, Inc. (CLOUD14)', 'Cloudflare, Inc.'] ['Canva'] ['cloudflare.com']
douban.com 154.8.131.172 ['ORG-TCCC1-AP', 'Tencent Cloud Computing (Beijing) Co., Ltd'] [] ["tencent.com'", 'tencent.com']
scribd.com 151.101.2.152 ['Fastly (SKYCA-3)', 'Fastly'] ['Scribd, Inc.'] ['fastly.com']
cnet.com 64.30.228.118 ['Sportsline.com (SPTL)', 'Sportsline.com'] ['CBS Interactive Inc.', 'CBS Technical Contact'] ['cbsinteractive.com']
patria.org.ve 190.205.112.68 [] ['Comisión Nacional de Telecomunicaciones (CONATEL)'] ['CANTV.NET']
indiatimes.com 223.165.27.146 [] ['REDACTED FOR PRIVACY'] ["indiatimes.co.in'", 'indiatimes.co.in']
tvbs.com.tw 52.84.110.192 ['Amazon Technologies Inc. (AT-88-Z)', 'Amazon Technologies Inc.'] ['Taiwan Network Information Center (TWNIC)'] ['amazonaws.com', 'amazon.com']
jianshu.com 107.150.101.156 ['Zenlayer Inc (ZENLA

googlevideo.com 172.217.161.4 ['Google LLC (GOGL)', 'Google LLC'] ['Google LLC'] ['google.com']
hclips.com 104.23.104.173 ['Cloudflare, Inc. (CLOUD14)', 'Cloudflare, Inc.'] ['Moniker Privacy Services'] ['cloudflare.com']
infourok.ru 85.119.149.18 ['ORG-SL223-RIPE'] ['Coordination Center for TLD RU', 'Technical Center of Internet', 'Infourok, LLC'] ["selectel.ru'"]
irctc.co.in 103.252.142.21 [] ['National Internet Exchange of India', 'Indian Railway Catering and Tourism Corporation Ltd'] ["cris.org.in'", 'cris.org.in']
wease.im 212.129.33.171 ['ORG-ONLI1-RIPE', 'ONLINE SAS'] ['Isle of Man Government', 'Information Systems Division,', 'Domicilium (IoM) Ltd'] ["online.net'", 'iliad-entreprises.fr']
glassdoor.com 104.16.234.151 ['Cloudflare, Inc. (CLOUD14)', 'Cloudflare, Inc.'] ['Glassdoor, Inc.'] ['cloudflare.com']
homedepot.com 35.201.95.83 ['Google LLC (GOOGL-2)', 'Google LLC'] ['Home Depot Product Authority, LLC'] ['google.com)', 'google.com']
businessinsider.com 151.101.65.171 ['Fastl

In [68]:
def compareOrg(org_list):
    # TODO if cdnname in org_list
    return cdn

In [69]:
def compareEmail(email_list):
    # TODO ???
    return cdn

# SCRAPING site and counting elements

In [8]:
import requests
from bs4 import BeautifulSoup
import re
import urllib

site = 'reddit.com'
url = 'https://www.'+site

In [184]:
response = requests.get(url)

In [185]:
header = response.headers
data = response.text

In [186]:
header

{'Cache-control': 'private, s-maxage=0, max-age=0, must-revalidate', 'Content-Encoding': 'gzip', 'Content-Type': 'text/html; charset=utf-8', 'Set-Cookie': 'loid=00000000002rstfzs1.2.1544637114465.Z0FBQUFBQmNFVXE2U3BHa1JsSlF3ZVowOG5rMlBuaS14ZFdfa2t6S1h0TTY1WDM1d0V5RmFCdVhBMHFtV1lKVDNZZk9zeXhoTnVabF8zU1Y2OElZV25Vc0U2TDJ2U3lfM0JrQmh3UDBCU0YtcEZEclc0YUREVEZtVkN6dlM5VHNYdmhQdnFrbXhmakU; path=/; expires=Sat, 12 Dec 2020 17:51:54 GMT; domain=.reddit.com; secure, session_tracker=hAnQzrY5AohumUWupz.0.1544637114466.Z0FBQUFBQmNFVXE2NU1VeHlGY3FFZ3piRDF5VlZoY1JsaHhEaC1yQzRCRDhOeml2R2VZR09WdVZsMjBhN3FSXzNhZFd5cFpaSWhWcHo2RTZ0N2pYUGdUNExsckRaMjRySFVuamlFRTBrNjNWaFc3endXVkZBQXg0eVlFdDBmYlp0N1JVZUpqQjZlYWY; path=/; domain=.reddit.com; secure, rseor3=; path=/; expires=Fri, 11 Jan 2019 17:51:55 GMT; domain=.reddit.com;, rabt=; path=/; expires=Fri, 11 Jan 2019 17:51:55 GMT; domain=.reddit.com;, edgebucket=N96TeAGH5ak70DS5Fl; Domain=reddit.com; Max-Age=63071999; Path=/;  secure', 'X-Frame-Options': 'SAMEOR

In [187]:
soup = BeautifulSoup(data)

In [188]:
soup

<!DOCTYPE html>
<html lang="en"><head><script>
          var __SUPPORTS_TIMING_API = typeof performance === 'object' && !!performance.mark && !! performance.measure && !!performance.getEntriesByType;
          function __perfMark(name) { __SUPPORTS_TIMING_API && performance.mark(name); };
          var __firstLoaded = false;
          function __markFirstPostVisible() {
            if (__firstLoaded) { return; }
            __firstLoaded = true;
            __perfMark("first_post_title_image_loaded");
          }
        </script><script>
          __perfMark('head_tag_start');
        </script><title>reddit: the front page of the internet</title><meta charset="utf-8"/><meta content="noindex,nofollow" name="robots"/><meta content="width=device-width, initial-scale=1" name="viewport"/><style>
  /* http://meyerweb.com/eric/tools/css/reset/
    v2.0 | 20110126
    License: none (public domain)
  */

  html, body, div, span, applet, object, iframe,
  h1, h2, h3, h4, h5, h6, p, blockquote, 

In [203]:
urllib.parse.urlparse('http://google.com').netloc

'google.com'

In [190]:
links = []
 
for link in soup.findAll('a', attrs={'href': re.compile("^https://")}):
    links.append(link.get('href'))
 
print(links)

['https://www.reddit.com/login?dest=https%3A%2F%2Fwww.reddit.com%2F', 'https://www.reddit.com/register?dest=https%3A%2F%2Fwww.reddit.com%2F', 'https://old.reddit.com/', 'https://about.reddit.com', 'https://about.reddit.com/careers/', 'https://about.reddit.com/press/', 'https://about.reddit.com/advertise/', 'https://www.reddithelp.com', 'https://www.reddit.com/mobile/download', 'https://www.reddit.com/coins', 'https://www.reddit.com/premium', 'https://www.reddit.com/help/contentpolicy', 'https://www.reddit.com/help/privacypolicy', 'https://www.reddit.com/help/useragreement', 'https://www.reddit.com/help/healthycommunities/']


In [191]:
tags = soup.findAll('img')
for tag in tags:
    print(tag.get('src'))

In [192]:
tags = soup.findAll('script')
for tag in tags:
    print(tag.get('src'))

None
None
None
None
https://www.redditstatic.com/desktop2x/js/ads.js
None
None
None
https://www.redditstatic.com/desktop2x/Legacy~runtime~Reddit.d9a640c15c0683d03615.js
https://www.redditstatic.com/desktop2x/Legacy~RedesignFonts.fea81f6921728cd9abf4.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~Client~Gifts~GovStandalonePoll~Reddit~RedesignChat.bba15f317236ad691696.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~Client~Gifts~Governance~Reddit.5c6af9634805c6ac543e.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Chat~Client~Governance~Reddit.e0017fa18113d5422f34.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Client~Governance~Reddit.9b413dfba08533351cf6.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Gifts~GovStandalonePoll~Reddit.706dcd356b4052f2fe3a.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Governance~Reddit.09ddea80624da2102063.js
https://www.redditstatic.com/desktop2x/Legacy~vendors~Reddit.633c3a6a56136b343b05.j

In [193]:
tags = soup.findAll('link')
for tag in tags:
    print(tag.get('href'))

https://www.reddit.com
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-57x57.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-60x60.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-72x72.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-76x76.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-114x114.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-120x120.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-144x144.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-152x152.png
https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-180x180.png
https://www.redditstatic.com/desktop2x/img/favicon/android-icon-192x192.png
https://www.redditstatic.com/desktop2x/img/favicon/favicon-32x32.png
https://www.redditstatic.com/desktop2x/img/favicon/favicon-96x96.png
https://www.redditstatic.com/desktop2x/img/favicon/favicon-16x16.png
https://www.redditstatic.com/desktop

In [194]:
tags = soup.findAll('a')
for tag in tags:
    print(tag.get('href'))

/
/
/r/all
/original
https://www.reddit.com/login?dest=https%3A%2F%2Fwww.reddit.com%2F
https://www.reddit.com/register?dest=https%3A%2F%2Fwww.reddit.com%2F
https://old.reddit.com/
/hot/
/new/
/controversial/
/top/
/rising/
/submit
https://about.reddit.com
https://about.reddit.com/careers/
https://about.reddit.com/press/
https://about.reddit.com/advertise/
http://www.redditblog.com/
https://www.reddithelp.com
https://www.reddit.com/mobile/download
https://www.reddit.com/coins
https://www.reddit.com/premium
http://redditgifts.com/
https://www.reddit.com/help/contentpolicy
https://www.reddit.com/help/privacypolicy
https://www.reddit.com/help/useragreement
https://www.reddit.com/help/healthycommunities/


In [195]:
tags = soup.findAll('meta')
for tag in tags:
    print(tag.get('content'))

None
noindex,nofollow
width=device-width, initial-scale=1
600
reddit
@reddit
summary
reddit
reddit
https://www.redditstatic.com/icon.png
website
https://www.reddit.com/
https://www.redditstatic.com/icon.png
256
256
#ffffff
https://www.redditstatic.com/desktop2x/img/favicon/ms-icon-144x144.png
#ffffff
None


### Count (static) objects
- img src
- script src
- meta content
- a href
- use urlparser to find netloc and count netloc before comparison

In [215]:
netlocs_static = []
netlocs_all = []

def getnetloc(loc):
    if loc is None or loc is '':
        return None
    return urllib.parse.urlparse(loc).netloc
    
# static
tags = soup.findAll('script')
for tag in tags:
    loc = tag.get('src')
    netlocs_static.append( getnetloc(loc) )
    
tags = soup.findAll('img')
for tag in tags:
    loc = tag.get('src')
    netlocs_static.append( getnetloc(loc) )
    
tags = soup.findAll('a')
for tag in tags:
    loc = tag.get('href')
    netlocs_static.append( getnetloc(loc) )
    
# all
tags = soup.findAll('link')
for tag in tags:
    loc = tag.get('href')
    netlocs_all.append( getnetloc(loc) )
    
tags = soup.findAll('meta')
for tag in tags:
    loc = tag.get('content')
    netlocs_all.append( getnetloc(loc) )

In [219]:
from collections import Counter

Counter(netlocs_static)

Counter({None: 7, 'www.redditstatic.com': 29})

In [230]:
Counter(netlocs_all)

Counter({'www.reddit.com': 11,
         'www.redditstatic.com': 40,
         '': 23,
         'old.reddit.com': 1,
         'about.reddit.com': 4,
         'www.redditblog.com': 1,
         'www.reddithelp.com': 1,
         'redditgifts.com': 1,
         None: 2})

In [232]:
Counter(netlocs_all).most_common(1)

[('www.redditstatic.com', 40)]

In [41]:
from bs4 import BeautifulSoup
from collections import Counter

def getnetloc(loc):
        if loc is None or loc is "":  # empty src
            return None
        return urllib.parse.urlparse(loc).netloc

def count_netlocs(data):
    soup = BeautifulSoup(data)
    
    netlocs_static = []
    #netlocs_all = []

    # static
    tags = soup.findAll('script')
    for tag in tags:
        loc = tag.get('src')
        netlocs_static.append( getnetloc(loc) )

    tags = soup.findAll('img')
    for tag in tags:
        loc = tag.get('src')
        netlocs_static.append( getnetloc(loc) )
        #loc = tag.get('srcset')
        #netlocs_static.append( getnetloc(loc) )
        #loc = tag.get('data-srcset')
        #netlocs_static.append( getnetloc(loc) )
        
    tags = soup.findAll('a')
    for tag in tags:
        loc = tag.get('href')
        netlocs_static.append( getnetloc(loc) )
    
    """
    # all
    tags = soup.findAll('link')
    for tag in tags:
        loc = tag.get('href')
        netlocs_all.append( getnetloc(loc)

    tags = soup.findAll('meta')
    for tag in tags:
        loc = tag.get('content')
        netlocs_all.append( getnetloc(loc) )
    """  
    return Counter(netlocs_static)
    

def fetch_url(site):
    #url = 'http://'+site
    url = 'https://www.'+site+'/'
    try:
        response = requests.get(url, headers={'Accept-Encoding': 'identity'}, timeout = 10.0)
        
        if response.ok:
            print('Successfully fetched '+site)
            data = response.text
            return data
        else:
            print('Problem fetching %s with response code %r' %(site, response.status_code) )
            return False
    
    except Exception as e:
        print("Error fetching %r: Exception %s" % (site, e))
        return False

In [42]:
#sites_list = list(sites['site'])
counter = {}
bad_site = []
sites_list = ['princeton.edu']

for site in sites_list:
    
    file = 'output/homepage/'+site
    if os.path.exists(file):
        with open(file, 'r') as f:
            data = f.read()
            cnt = count_netlocs( data )
            del cnt[None]  # remove None element
            counter[site] = cnt.most_common()  # sort Counter
    else:
        data = fetch_url(site)
        if data:
            fout = open('output/homepage/'+site, 'w')
            fout.write(data)
            fout.close()
            cnt = count_netlocs( data )
            del cnt[None]  # remove None element
            counter[site] = cnt.most_common()  # sort Counter
        else:
            bad_site.append(site)

In [43]:
counter

{'princeton.edu': [('', 120),
  ('admission.princeton.edu', 3),
  ('registrar.princeton.edu', 2),
  ('giving.princeton.edu', 2),
  ('socialmedia.princeton.edu', 2),
  ('assets.juicer.io', 1),
  ('finaid.princeton.edu', 1),
  ('alumni.princeton.edu', 1),
  ('dof.princeton.edu', 1),
  ('careers.princeton.edu', 1),
  ('accessibility.princeton.edu', 1),
  ('www.goprincetontigers.com', 1),
  ('library.princeton.edu', 1),
  ('www.facebook.com', 1),
  ('twitter.com', 1),
  ('www.instagram.com', 1),
  ('go.snapchat.com', 1),
  ('www.linkedin.com', 1),
  ('www.youtube.com', 1),
  ('www.princeton.edu', 1)]}

# <font color='red'> TODO in order </font>
1. regex: domain vs cdn_domains
    - eg fbcdn.net, googleusercontent.com, etc
2. regex: (count>1) OR (count.top3) vs cdn_domains
    - https://github.com/Hossein-Doroud/cdn-detector/blob/master/cdnDetector.py
    - https://github.com/WPO-Foundation/webpagetest/blob/master/agent/wpthook/cdn.h
3. whois Org vs cdn_names



## Blocked, redirects, and unreachable
- Blocked lookups at ISP
    - pornhub.com, livejasmin.com, xvideos.com, xhamster.com, xnxx.com, chaturbate.com, youporn.com, redtube.com, sex.com, 
    - yts.am : movie torrents
    - ok.ru
    - 1337x.to : movie torrents
    - rutracker.org : torrents
- Blocked malware (403 Forbidden)
    - exosrv.com : malware/adware
    - swiftviz.net : virus redirect/malware
    - i62e2b4mfy.com : malware/adware
    - vinfdv6b4j.com : virus redirect
    - s9kkremkr0.com : virus redirect
    - omumultation.club : virus popups
    - nextoptim.com : adware redirect popups
    - resentaticexhaus.info : possible malware, IP timeouts on ping, not 403
    - exdynsrv.com : adware redirect/malware, not 403
    - hotchedmothe.club: malware popup ads
    - digitaldsp.com : redirect virus, not 403
    - jf71qh5v14.com : virus/adware
    -
- No lookup or redirect
    - banvenez.com > e-bdvcpx.banvenez.com exists but can't be reached automatically
    
    
- Unreachable servers (possible CDNs)
    - microsoftonline.com : port 80 and 443 are probably closed
    - googleusercontent.com :  port 80 and 443 are probably closed
    - twimg.com : Twitter CDN by Twitter (twimg.com) blocked on port 80 and 443
    - cloudfront.net > hosted on Amazon aws as part of Amazon Cloudfront CDN. Blocked port 80 and 443. No DNS lookup.
    - bp.blogspot.com > hosts resources for blogspot. Blocked 80 443. Probably not CDN just a simple direct server.
        - https://2-bp.blogspot.com/2017/11/what-are-bpblogspotcom-links-and-how.html 
        
    
- Redirects
    - wikia.com > wikia.com/fandom
    - zhihu.com > zhihu.com/signup
    - fandom.com > fandom.wikia.com
        - contains fastly-insights.com widget scripts
    - rednet.cn > can't really reach this page on browser either
        - downloads a 200 response page saying redirect
     - ci123.com > same as above
         - 200 response redirect page that is very slow on browser was manually downloaded to ci123.com.html
- No curl (timeouts with curl not with requests)
    - usps.com
    - bestbuy.com
    - udemy.com > robot check
    - momoshop.com.tw > different source when viewed online vs by curl or requests
        - 1 small asia.creativecdn.com in script tag on end of page, most sources on momoshop.com.tw
        - manually download main page from browser as momoshop.com.tw.html
    - leboncoin.fr > blocked 403 when using requests/curl but available on browser
        - manually download main page from browser as leboncoin.fr.html
    - kissanime.ru > Please wait 5 seconds... doesn't work with minibrowsers
        - manually download main page from browser as kissanime.ru.html
    - roblox.com > gaming site that prechecks for cookies and scripts and tells not working with 200 code
        - manually download main page from browser as roblox.com.html
- Others
    - jianshu.com > cdn2.jianshu.com in < link href >
    - shopify.com > cdn.shopify.com in < link href >
    - xfinity.com > cdn.comcast.com
    - investing.com  > link href akamaized.net
    - discordapp.cdn > has a field CDN_HOST: 'cdn.discordapp.com' but not in static sources
    - bodelen.com > undetected redirect adware virus shows google page
    

In [327]:
bad_site


['reddit.com',
 'microsoftonline.com',
 'pornhub.com',
 'ok.ru',
 'livejasmin.com',
 'xvideos.com',
 'googleusercontent.com',
 'xhamster.com',
 'wikia.com',
 'zhihu.com',
 'exosrv.com',
 'xnxx.com',
 'swiftviz.net',
 'godaddy.com',
 'chaturbate.com',
 'i62e2b4mfy.com',
 'jianshu.com',
 'twimg.com',
 'vinfdv6b4j.com',
 'yts.am',
 's9kkremkr0.com',
 'omumultation.club',
 'usps.com',
 'youporn.com',
 'myshopify.com',
 'bestbuy.com',
 'udemy.com',
 '1337x.to',
 'momoshop.com.tw',
 'resentaticexhaus.info',
 'cloudfront.net',
 'nextoptim.com',
 'redtube.com',
 'fandom.com',
 'americanexpress.com',
 'namnak.com',
 'oracle.com',
 'rutracker.org',
 'banvenez.com',
 'bp.blogspot.com',
 'xfinity.com',
 'asos.com',
 'glassdoor.com',
 'homedepot.com',
 'exdynsrv.com',
 'sex.com',
 'ign.com',
 'gamespot.com',
 'leboncoin.fr',
 'chegg.com',
 'macys.com',
 'gismeteo.ru',
 'investing.com',
 'huffingtonpost.com',
 'hotchedmothe.club',
 'playstation.com',
 'kissanime.ru',
 'digitaldsp.com',
 'jf71qh5v14.

In [44]:
def try_bad_site(site):
    #site = 'souq.com'
    url = 'https://www.'+site
    
    s = requests.Session()
    s.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
    #s.headers['User-Agent'] = 'abc'
    res = s.get(url, headers={'Accept-Encoding': 'identity'}, timeout = 10.0)
    s.close()
    
    print(res.status_code)
    
    data = res.text
    
    return data

In [45]:
data = try_bad_site('tribunnews.com')

200


In [46]:
fout = open('output/homepage/'+site, 'w')
fout.write(data)
fout.close()