In [1]:
# Script to generate the new EU-Cookies project crawl list. 
# version 20180421 (based on version 2017-12-23 / 2017-10-24 / earlier)
# The new crawl list is (1) for approx 20 countries (>2.5m BB subs)  (was 30 earlier)
#                       (2) is chosen based on top sites per TLD 
#                       (3) as ranked by the Majestic million list (before: alexa, Cisco Umbrella) 

# output on: 2018-04-21

# update: 2018-05-24: if tld-extract flags it as TLD, don't use it. won't resolve. majestic error.

import pandas as pd
from collections import Counter
import tldextract
import json
from six.moves.urllib.parse import urlparse
import re
from pprint import pprint


In [2]:
countries = pd.read_excel('../indata/Cookie-Countries-201804.xlsx', index_col='CC')
n0 = len(countries)
countries = countries[countries['In-Study-18']==1]
n2 = len(countries[countries.EU_DP_status.isin(['EU','EEA'])&(countries.Broadband_2016>=1000000)]) 
print(n0, '=>', len(countries), '..', n2)  # 5 non-eu ; really small already removed

47 => 20 .. 15


In [10]:
len(countries)

20

In [7]:
top1m = pd.read_csv('../indata/majestic_million-20180421.csv', index_col='GlobalRank')

df = pd.DataFrame(columns=['tld', 'global_rank', 'rank', 'domain', 'cat'])


# for simplicity just .com/.org now
# - top tlds: .com, .org,! (plus .net, .io, ru, .br, .cn., as well as .tv .co .me, lastly .gov, .edu, .mobi, .biz...)
# - .NET make's lettile sense -- often not user-facing or have .com alts. 
# - beheavior of others might be too specfic
interesting_ggtlds = ('COM', 'ORG')

# collecting 200, although i'll be using 100
cctld_limit = 200
tld_collected = {}
skipped = []
# btw: with current config, a bunch countries had 199 at most
# stats from before (ubmbrella list)                
# - interesting, these small countries were at the end... ('EE': 76, 'LV': 88, 'SI': 78)
# - (I thought of removing re unbalanced and two no CG VPNs. but rob thought slovenia rules interesting)


print('majestic domains:', len(top1m))

for ix, row in top1m.iterrows():
    if ix % 200000 == 0:
        print(ix)
        
    tld, tld_rank, domain = row['TLD'].upper(), row['TldRank'], row['Domain']    
    if tld in countries.index or tld in interesting_ggtlds:
        limit = cctld_limit  if tld not in interesting_ggtlds else 2 * cctld_limit
        if len(tld_collected.setdefault(tld, [])) < limit:
            # majestic's full domain anmes are often better than what 'tldextract' returns
            # - e.g. plus.google.com vs google.com
            # - in four cases I'd say 'tldextract' is even wrong: gc.ca, waw.pl, ...
            # - in a few cases (e.g. blogspot) it is a question mark. but let's be consistent
            # - (if row.Domain != : print...)
            
            # if domain == 'gc.ca' or domain == 'waw.pl': raise
                
            if not tldextract.extract(domain).registered_domain:
                #print('skipping ', domain)
                skipped.append(domain)
                continue
            
            assert not any(df.domain == domain)  # sanity check no duplicates            
            df = df.append({'tld':tld, 'global_rank':ix, 'rank':tld_rank, 'domain':domain}, ignore_index=True)
            tld_collected[tld].append(domain)
            if len(tld_collected[tld]) == limit:
                print('Reached %d for %s on rank #%d site %s' % (limit, tld, ix, domain))

         
print('skipped: ', len(skipped), '\n\n', skipped, '\n\n')        
        
for t in set(df.tld):
    l = len(df[df.tld==t])  
    if l < cctld_limit: 
        print('not reached', t, l)
        
print('=> selected domains', len(df))

majestic domains: 1000000
Reached 400 for COM on rank #677 site prestashop.com
Reached 400 for ORG on rank #3622 site dreamwidth.org
Reached 200 for UK on rank #7141 site rmg.co.uk
Reached 200 for DE on rank #8861 site kicker.de
Reached 200 for JP on rank #12427 site skr.jp
Reached 200 for IT on rank #17465 site unical.it
Reached 200 for FR on rank #18057 site beteavone.fr
Reached 200 for AU on rank #23546 site alphalink.com.au
Reached 200 for CA on rank #25247 site mta.ca
Reached 200 for ES on rank #26210 site anadelgado.es
Reached 200 for PL on rank #28290 site blogx.pl
Reached 200 for NL on rank #34914 site koninklijkhuis.nl
Reached 200 for US on rank #43469 site campl.us
Reached 200 for SE on rank #67598 site yogamamas.se
Reached 200 for CH on rank #71868 site cash.ch
Reached 200 for CZ on rank #80357 site army.cz
Reached 200 for AT on rank #83100 site fjsoft.at
Reached 200 for GR on rank #84511 site viva.gr
Reached 200 for BE on rank #84844 site privacycommission.be
Reached 200 fo

In [6]:
tldextract.extract(domain).registered_domain

''

In [11]:
#df[df.tld=='SE']
#df2 = df.copy()
#ix

In [12]:
# let's load zack-w's cateogrization data
zj = json.load(open('../indata/zackw_urls_with_topics_20160223_data.json'))
zj = zj['urls']   # keys: 'clab_categories', 'sources', 'urls'
zu = {}
#dbg_map = {}
for u in zj:
    uu = urlparse(u).netloc.lower()  # get domain
    uu = re.sub('^www.', '', uu)  # remove starting www.
    # tldextract.extract(uu).registered_domain  # no: let's no confuse google.nl and books.google.nl        
    # if uu.endswith('google.nl'): DBG_google.add(uu)    
    #dbg_map[u] = uu
    l = zu.setdefault(uu, list())
    for dt, cats in zj[u]['access_results_usa'].items():
        l.append(tuple(cats))
print('zackw urls:', len(zu))

cnt_cat = Counter()
for u,l in zu.items():
    if type(l[0]) is tuple:
        top = Counter()
        for cats in l:
            for c in cats:
                top[c] += 1
        top = sorted(top.items(), key=lambda x:x[1], reverse=True)  # sort by category count
        for top1 in top:
            if not top1[0].startswith('error:') and not top1[0].startswith('junk:'):
                # get first, except if it's error/junk (error: 403, 404, 503, TLS error, crawler failure, ...)
                break  
        l.insert(0, top1[0])
        l.insert(1, top)
        cnt_cat[top1[0].split(':')[0]] += 1         
# pprint(cnt_cat)  # huh, not the best. 24k junk, 50k error; 51k porn :)
# for d in DBG_google: print('\n', '********', d, '********', '\n', zu[d])


('zackw urls:', 330358)


In [14]:
# now combine zack with our data

# now website type is sth else and might force me to use alexa-top-1m
for ix in df.index:
    s, tld = df.loc[ix, 'domain'], df.loc[ix, 'tld']    
    if s in zu:
        # if tld=='NL':
        #    pprint(s, zu[s][1])
        # so news + education are fine; 
        # but: travel under entertainment? commerce subs? hosting under sw? esp. platforms (e.g. isps) can be bad
        df.loc[ix, 'cat'] = zu[s][0].split(':')[0]

# debug code:
# for d in ['netflix.com', 'instagram.com', 'microsoft.com', 'hola.org', 'amazonaws.com', 'office365.com', 'w.org',
#           'geenstijl.nl', 'gettyimages.nl', 'volkskrant.nl', 'daskapital.nl']:
#     print('*****', d, '*****')  
#     for u, uu in dbg_map.items():
#          if uu == d:
#              pprint(zj[u])
#     top, top_p = Counter(), Counter()
#     for cats in zu[d][1:]:
#         for c in cats:
#             top[c] += 1
#             top_p[c.split(':')[0]] += 1
#     top = sorted(top.items(), key=lambda x:x[1], reverse=True)  
#     top_p = sorted(top_p.items(), key=lambda x:x[1], reverse=True)  
#     pprint(top, top_p)
        
# STATS: I guess this is ok and what we have. more than half has no TLD
for tld in set(df.tld):
    n = len(df[df.tld==tld])
    n1 = len(df[(df.tld==tld)&~df.cat.isnull()])
    if n1 / n < 0.25 or len(tld) == 3:
        print(tld, round(n1*100/n), '%')                
        # lowest: SI 13, LV 16, HR 18, but also a few 20-30. majority under 50%. org & com around 50%.
for r in range(10, 101, 20):
    n = len(df[df['rank']<=r])
    n1 = len(df[(df['rank']<=r)&(~df.cat.isnull())])
    print(r, round(n1*100/n), '%')  # so after 10, or 100, it's always about 40%

# CONCLUSIONS:
#  - categories generally ok; cuont with weights unnecssary; 
#  - TODO: major problem is when two top ones have same count; (get two?)
#  - TODO: i should check his paper to see what's up with the classification order, how authors use it
#  - (re overlap) let's just use this TLD list; the ones that are empty, are empty. sth to deal with later        
        

('BE', 27.0, '%')
('FR', 66.0, '%')
('DE', 75.0, '%')
('JP', 80.0, '%')
('HU', 28.0, '%')
('COM', 95.0, '%')
('NL', 37.0, '%')
('PT', 19.0, '%')
('RO', 23.0, '%')
('PL', 34.0, '%')
('CH', 35.0, '%')
('GR', 22.0, '%')
('CA', 46.0, '%')
('IT', 41.0, '%')
('CZ', 31.0, '%')
('AU', 59.0, '%')
('AT', 28.0, '%')
('ES', 53.0, '%')
('US', 16.0, '%')
('UK', 69.0, '%')
('ORG', 80.0, '%')
('SE', 30.0, '%')
(10, 84.0, '%')
(30, 73.0, '%')
(50, 68.0, '%')
(70, 65.0, '%')
(90, 61.0, '%')


In [8]:
# export!
dft = df.set_index(['tld', 'rank']).sort_index()
print(len(set(df.tld)), len(df))
print(dft.head())
dft.to_csv('../indata/scanlist_20180423b-tmp.csv')

# TODO: this list is mapped SOOO wrongly that I wonder if I am mapping/loading wrongly

22 4800
         global_rank        domain  cat
tld rank                               
AT  3           1435  univie.ac.at  NaN
    4           1438     kriesi.at  NaN
    5           2231  tuwien.ac.at  NaN
    6           2259        orf.at  NaN
    7           2332     google.at  NaN


In [31]:
# what's overlap with old list non loadables
df_ping = pd.read_csv("../indata/scanlist_20180421-ping.csv", index_col=['tld', 'rank'])
df_ping = df_ping.join(dft, lsuffix='__p')

df_ping[~df_ping.ping_ip.isnull()&df_ping.domain.isnull()]  
df_ping.reset_index(inplace=True)
    # great, so indeed majority of ping/look fails are TLD extraction erros by majestic

In [39]:
#dft.head()
# btw, how many in large scan 20180422 expected to be errors?

lim = ["CH","BE","RO","PL","NL","CA","ES","IT","FR","DE","US","UK","COM"]
print(len(df_ping[df_ping.tld.isin(lim)&df_ping.ping_ip.isnull()&(df_ping['rank']<=50)]))  # 111

# [hadi@econsec06 CookieLaws18]$ grep Finished nohup.out*
# nohup.out.be: Finished crawl (time: 19006s)  > 128 ERROR
# nohup.out.ch: Finished crawl (time: 20605s) > 126 ERROR
# nohup.out.de: Finished crawl (time: 19912s) > 124 ERROR
# nohup.out.fr: Finished crawl (time: 18631s) > 124 ERROR
# nohup.out.nl: Finished crawl (time: 18680s) > 125 ERROR
# nohup.out.ro: Finished crawl (time: 19441s) > 128 ERROR
# nohup.out.us: Finished crawl (time: 20462s) > 118 ERROR

# nohup.out.ca: Finished crawl (time: 19523s) > 122 ERROR
# nohup.out.es: Finished crawl (time: 19871s) > 125 ERROR
# nohup.out.it: Finished crawl (time: 19552s) > 123 ERROR
# nohup.out.pl: Finished crawl (time: 19404s) > 127 ERROR
# nohup.out.uk: Finished crawl (time: 13501s) >> TWO CPUS >> 154 ERROR >> clearly more ! :/



111