# ASN and CDN lookup
- Use the `pyasn` library: download the latest RouteViews data and look up the ASN for a particular IP.
- Other options include Team Cymru whois queries (no Python dependency; run from the command line).

In [6]:
from __future__ import division
%pylab inline
import pandas as pd
import numpy as np
import subprocess
import os, sys, re
from collections import defaultdict
import socket

Populating the interactive namespace from numpy and matplotlib


In [120]:
# Load the first 500 rows of the site list. header=None means the first line
# of the file is treated as data, so the two columns are named explicitly.
df_sites = pd.read_csv('top-1m-new.csv', nrows=500, header = None, names = ['rank', 'site'])

In [122]:
def getIP(s):
    """Resolve hostname ``s`` to an IPv4 address string.

    Parameters:
        s: hostname (str) to resolve with ``socket.gethostbyname``.

    Returns:
        The resolved address as a string, or ``False`` when the lookup fails
        or the answer is one of the known ISP block-page addresses. A message
        is printed in both failure cases.
    """
    # blocked domains return IP 49.207.46.6, 49.207.46.24, 49.207.46.34 from ISP
    blocked_ips = ('49.207.46.6', '49.207.46.24', '49.207.46.34')

    try:
        IP = socket.gethostbyname(s)
    except (OSError, ValueError):
        # OSError covers the resolver failures (socket.gaierror / herror /
        # timeout); ValueError covers the UnicodeError raised when the name
        # cannot be IDNA-encoded (e.g. a label longer than 63 characters).
        # Unlike a bare ``except``, this lets KeyboardInterrupt through so a
        # long batch of lookups can still be cancelled.
        print("Error accessing site "+s)
        return False

    if IP in blocked_ips:
        print("Blocked site "+s)
        return False
    return IP

# Resolve every site name; each entry becomes an IP string, or False when
# getIP reported the site as blocked or unresolvable.
df_sites['IP'] = df_sites['site'].apply(getIP)

Blocked site pornhub.com
Blocked site ok.ru
Blocked site livejasmin.com
Blocked site xvideos.com
Error accessing site googleusercontent.com
Blocked site xhamster.com
Error accessing site exosrv.com
Blocked site xnxx.com
Blocked site chaturbate.com
Blocked site yts.am
Blocked site youporn.com
Blocked site 1337x.to
Error accessing site cloudfront.net
Blocked site redtube.com
Blocked site rutracker.org
Error accessing site banvenez.com
Error accessing site bp.blogspot.com
Error accessing site exdynsrv.com
Blocked site sex.com
Error accessing site wixsite.com


In [125]:
df_sites.head()

Unnamed: 0,rank,site,IP
0,1,google.com,216.58.221.46
1,2,youtube.com,172.217.167.46
2,3,facebook.com,157.240.25.35
3,4,baidu.com,123.125.115.110
4,5,wikipedia.org,103.102.166.224


In [126]:
import pyasn

# Open the IP-to-ASN database file used by every lookup below.
asndb = pyasn.pyasn('output/ipasn_20181212.dat')  #downloaded pyasn_util_download.py --latest
# Sanity check on a well-known address; the result is an (ASN, prefix) pair.
asndb.lookup('8.8.8.8')

(15169, '8.8.8.0/24')

In [127]:
def findASN(ip):
    """Return the ASN for ``ip`` using the module-level ``asndb`` database.

    A falsy ``ip`` (such as the ``False`` stored for unresolved sites) is not
    looked up; ``False`` is returned instead.
    """
    if not ip:
        return False
    # lookup() yields an (ASN, prefix) pair; only the ASN is wanted here.
    match = asndb.lookup(ip)
    return match[0]
    
# Map each resolved IP to its ASN; rows whose IP is False get ASN False.
df_sites['ASN'] = df_sites['IP'].apply(findASN)

In [128]:
df_sites.head()

Unnamed: 0,rank,site,IP,ASN
0,1,google.com,216.58.221.46,15169
1,2,youtube.com,172.217.167.46,15169
2,3,facebook.com,157.240.25.35,32934
3,4,baidu.com,123.125.115.110,4808
4,5,wikipedia.org,103.102.166.224,14907


In [129]:
# Keep only the sites that resolved: getIP stores False for blocked or
# unresolvable hosts, so those rows are dropped here.
df_valid = df_sites[df_sites['IP'] != False]

In [130]:
len(df_valid)

480

## CDN finder
- Use `whois` queries on each IP to find organization names and email domains, since these often indicate which CDN serves the site.


In [116]:
# Captures the domain that follows an '@'. The final character must be
# alphanumeric, which drops trailing punctuation such as "'" or "." that
# whois formatting leaves behind (and that used to create near-duplicates).
_EMAIL_DOMAIN_RE = re.compile(r'@([A-Za-z0-9](?:[A-Za-z0-9.-]*[A-Za-z0-9])?)')


def _parseWhois(text):
    """Extract (organizations, email domains) from raw whois output text."""
    org_keywords = ('organisation:', 'org:', 'organization:', 'org-name:', 'orgname:')
    exclude_orgs = ('AFRINIC', 'APNIC', 'ARIN', 'LACNIC', 'IANA',
                    'Asia Pacific Network Information Centre',
                    'Administered by RIPE NCC', 'RIPE NCC')
    orgs = []
    domains = []

    for line in text.split('\n'):
        lower_line = line.lower()

        # Organization lines: keywords are matched case-insensitively, the
        # registry exclusions case-sensitively against the original line.
        if any(keyword in lower_line for keyword in org_keywords):
            if not any(excl_org in line for excl_org in exclude_orgs):
                # The value is the text between the first and second colon.
                org = line.split(':')[1].strip()
                if org not in orgs:
                    orgs.append(org)

        # Email lines: only an '@' can yield a domain, so lines such as
        # "e-mail: not available" are simply skipped instead of raising.
        for domain in _EMAIL_DOMAIN_RE.findall(line):
            if domain not in domains:
                domains.append(domain)

    return orgs, domains


def findOrg(site_IP, whois_text=None):
    """Find the organizations and email domains that whois reports for an IP.

    Parameters:
        site_IP: IP address string passed to the ``whois`` command.
        whois_text: optional whois output that was already fetched; when
            given, it is parsed as-is and no subprocess is started.

    Returns:
        dict ``{'Org': [...], 'Email': [...]}`` with de-duplicated values in
        order of first appearance. Both lists are empty when the ``whois``
        command fails, exits non-zero, or exceeds the 10 second timeout (a
        message is printed in that case).
    """
    if whois_text is None:
        try:
            whois_text = subprocess.check_output(
                ['whois', site_IP], stderr=subprocess.STDOUT,
                timeout=10.0).decode('UTF-8', 'ignore')
        except (subprocess.SubprocessError, OSError, ValueError):
            # SubprocessError: timeout or non-zero exit; OSError: the whois
            # binary is missing or cannot run; ValueError: an argument that
            # cannot be passed to the process.
            print('whois '+site_IP+' process error/ran too long')
            return {'Org': [], 'Email': []}

    Org, Email = _parseWhois(whois_text)
    return {'Org': Org, 'Email': Email}

In [117]:
#sites_org = df_valid['IP'].apply(lambda s: pd.Series( findOrg(s) ) )
#sites_org.to_pickle('output/df_org_email_info.pkl')

In [58]:
# Reload the whois results saved earlier (the commented-out cell above shows
# how the file was produced) instead of re-running the whois queries.
# NOTE: unpickling can execute arbitrary code; only load files you created.
sites_org = pd.read_pickle('output/df_org_email_info.pkl')

### Merge CDN org info with ASN and IP info

In [131]:
# Join the whois-derived columns onto the resolvable sites by row index.
# No `how` is given, so pandas' default (inner) join applies.
sites_cdn = df_valid.merge(sites_org, left_index=True, right_index=True )

In [154]:
sites_cdn.head()

Unnamed: 0,rank,site,IP,ASN,Org,Email,numOrg,numEmail
0,1,google.com,216.58.221.46,15169,"[Google LLC (GOGL), Google LLC]",[google.com],2,1
1,2,youtube.com,172.217.167.46,15169,"[Google LLC (GOGL), Google LLC]",[google.com],2,1
2,3,facebook.com,157.240.25.35,32934,"[Facebook, Inc. (THEFA-3), Facebook, Inc.]",[fb.com],2,1
3,4,baidu.com,123.125.115.110,4808,[],"[ns.chinanet.cn.net', ns.chinanet.cn.net, bjte...",0,3
4,5,wikipedia.org,103.102.166.224,14907,"[ORG-WFI1-AP, Wikimedia Foundation, Inc.]","[wikimedia.org', wikimedia.org]",2,2


In [133]:
# Count how many organization names and email domains each site has.
sites_cdn['numOrg'] = sites_cdn['Org'].apply(len)
sites_cdn['numEmail'] = sites_cdn['Email'].apply(len)

#### Check whether any valid IPs have no data in the email-server and organization-name fields

In [143]:
# Rows for which no email domain was collected.
# NOTE(review): only numEmail is tested here; rows with numOrg == 0 are not.
sites_cdn[sites_cdn['numEmail']==0]

Unnamed: 0,rank,site,IP,ASN,Org,Email,numOrg,numEmail


In [144]:
sites_cdn

Unnamed: 0,rank,site,IP,ASN,Org,Email,numOrg,numEmail
0,1,google.com,216.58.221.46,15169,"[Google LLC (GOGL), Google LLC]",[google.com],2,1
1,2,youtube.com,172.217.167.46,15169,"[Google LLC (GOGL), Google LLC]",[google.com],2,1
2,3,facebook.com,157.240.25.35,32934,"[Facebook, Inc. (THEFA-3), Facebook, Inc.]",[fb.com],2,1
3,4,baidu.com,123.125.115.110,4808,[],"[ns.chinanet.cn.net', ns.chinanet.cn.net, bjte...",0,3
4,5,wikipedia.org,103.102.166.224,14907,"[ORG-WFI1-AP, Wikimedia Foundation, Inc.]","[wikimedia.org', wikimedia.org]",2,2
5,6,qq.com,111.161.64.40,4837,[],"[chinaunicom.cn', chinaunicom.cn]",0,2
6,7,yahoo.com,98.137.246.8,36647,"[Yahoo! Inc. (YHOO), Yahoo! Inc.]","[yahoo-inc.com, cc.yahoo-inc.com]",2,2
7,8,amazon.com,176.32.103.205,16509,"[Amazon.com, Inc. (AMAZON-4), Amazon.com, Inc.]","[amazon.com, amazonaws.com]",2,2
8,9,taobao.com,140.205.220.96,37963,[],"[apnic.net, cnnic.cn', cnnic.cn, alibaba-inc.c...",0,6
9,10,reddit.com,151.101.1.140,54113,"[Fastly (SKYCA-3), Fastly]",[fastly.com],2,1


## Match CDN name with sites_cdn['Org']

In [155]:
# Read the list of known CDN names, one list entry per line of the file.
with open('CDNnames.csv', 'r') as f:
    CDN_names = f.read().splitlines()

In [176]:
def matchCDN(org_list, cdn_names=None):
    """Return the first known CDN name found inside any organization string.

    Parameters:
        org_list: list of organization strings for one site.
        cdn_names: optional iterable of CDN names to match against; defaults
            to the module-level ``CDN_names`` list.

    Returns:
        The matching entry exactly as it appears in the name list, or ``None``
        when ``org_list`` is empty or nothing matches. Organizations are tried
        in order, and for each one the names are tried in list order.

    Matching is a case-insensitive substring test on the whitespace-stripped
    name. Blank names are ignored because an empty string is a substring of
    every organization and would otherwise always "match".
    NOTE(review): being a plain substring test, short names such as 'BT' can
    also match inside unrelated words.
    """
    if not org_list:
        return None
    if cdn_names is None:
        cdn_names = CDN_names

    # Normalize each name once, keeping the original entry as the return value.
    candidates = [(cdn, cdn.strip().lower()) for cdn in cdn_names]
    candidates = [(cdn, key) for cdn, key in candidates if key]

    for org in org_list:
        org_lower = org.lower()
        for cdn, key in candidates:
            if key in org_lower:
                return cdn
    return None
    

In [177]:
# Label each site with the first CDN name matched in its organization list
# (None when nothing matches or the list is empty).
sites_cdn['CDN'] = sites_cdn['Org'].apply(matchCDN)

In [178]:
# Show the sites for which no CDN name was matched.
sites_cdn[(sites_cdn['CDN'].isnull())]

Unnamed: 0,rank,site,IP,ASN,Org,Email,numOrg,numEmail,CDN
2,3,facebook.com,157.240.25.35,32934,"[Facebook, Inc. (THEFA-3), Facebook, Inc.]",[fb.com],2,1,
3,4,baidu.com,123.125.115.110,4808,[],"[ns.chinanet.cn.net', ns.chinanet.cn.net, bjte...",0,3,
4,5,wikipedia.org,103.102.166.224,14907,"[ORG-WFI1-AP, Wikimedia Foundation, Inc.]","[wikimedia.org', wikimedia.org]",2,2,
5,6,qq.com,111.161.64.40,4837,[],"[chinaunicom.cn', chinaunicom.cn]",0,2,
6,7,yahoo.com,98.137.246.8,36647,"[Yahoo! Inc. (YHOO), Yahoo! Inc.]","[yahoo-inc.com, cc.yahoo-inc.com]",2,2,
8,9,taobao.com,140.205.220.96,37963,[],"[apnic.net, cnnic.cn', cnnic.cn, alibaba-inc.c...",0,6,
10,11,tmall.com,140.205.130.99,37963,[],"[apnic.net, cnnic.cn', cnnic.cn, alibaba-inc.c...",0,6,
12,13,twitter.com,104.244.42.1,13414,"[Twitter Inc. (TWITT), Twitter Inc.]",[twitter.com],2,1,
13,14,live.com,204.79.197.212,8068,"[Microsoft Corporation (MSFT), Microsoft Corpo...","[microsoft.com., microsoft.com]",2,2,
14,15,sohu.com,123.125.116.28,4808,[],"[chinaunicom.cn', chinaunicom.cn, publicf.bta....",0,3,


In [52]:
#sites_cdn.to_pickle('output/df_IP_ASN_CDN.pkl')

Unnamed: 0,rank,site,IP,ASN,Org,OrgName,EmailServer
0,1,google.com,216.58.221.46,15169,[Google LLC (GOGL)],[Google LLC],[google.com]
1,2,youtube.com,172.217.167.46,15169,[Google LLC (GOGL)],[Google LLC],[google.com]
2,3,facebook.com,157.240.25.35,32934,"[Facebook, Inc. (THEFA-3)]","[Facebook, Inc.]",[facebook.com]
3,4,baidu.com,220.181.57.216,23724,[],[],"[ns.chinanet.cn.net, bjtelecom.net]"
4,5,wikipedia.org,103.102.166.224,14907,[ORG-WFI1-AP],"[Wikimedia Foundation, Inc.]",[wikimedia.org]
5,6,qq.com,111.161.64.40,4837,[],[],[chinaunicom.cn]
6,7,yahoo.com,98.138.219.232,36646,[Yahoo! Inc. (YHOO)],[Yahoo! Inc.],"[cc.yahoo-inc.com, yahoo-inc.com]"
7,8,amazon.com,205.251.242.103,16509,"[Amazon.com, Inc. (AMAZON-4)]","[Amazon.com, Inc.]","[amazonaws.com, amazon.com]"
8,9,taobao.com,140.205.94.189,37963,[],[Asia Pacific Network Information Centre],"[alibaba-inc.com, cnnic.cn, aliyun-inc.com, li..."
9,10,reddit.com,151.101.129.140,54113,"[Fastly (SKYCA-3), Administered by RIPE NCC]",[Fastly],[fastly.com]


In [153]:
CDN_names

['AAPT',
 'ARA Networks',
 'AT&T',
 'AT&T Inc.',
 'Akamai',
 'Akamai Technologies',
 'Alibaba',
 'Allot Communications',
 'Amazon',
 'Amazon CloudFront',
 'Aryaka',
 'Azure CDN',
 'BT',
 'BT Group',
 'BTI Systems',
 'Bell',
 'Bell Canada',
 'BelugaCDN',
 'Bharti Airtel',
 'BitTorrent, Inc.',
 'Blue Coat',
 'BootstrapCDN',
 'Broadmedia',
 'Broadpeak',
 'CDN77',
 'CDNetworks',
 'CacheFly',
 'Cedexis',
 'CenterServ',
 'CenturyLink',
 'China Telecom',
 'ChinaCache',
 'ChinaNetCenter',
 'Cisco',
 'Cloudflare',
 'Comcast',
 'Concentric',
 'Concurrent',
 'Conversant',
 'Conviva',
 'Coral Content Distribution Network',
 'Cotendo',
 'Deutsche Telekom',
 'EdgeCast Networks',
 'Edgeware',
 'Ericsson',
 'Fastly',
 'Fortinet',
 'Google Cloud',
 'HP Cloud Services',
 'HiNet',
 'Hibernia Networks',
 'Highwinds',
 'Highwinds Network Group',
 'Hola',
 'Huawei',
 'Incapsula',
 'IneoQuest',
 'Instart',
 'Instart Logic',
 'Interferex',
 'Internap',
 'Interoute',
 'JSDelivr',
 'Jetstream',
 'Juniper ',
 'K