In [1]:
import json
import os.path
import sys
import glob
import re
from urllib import parse as up

from bs4 import BeautifulSoup


In [8]:
DATA_LOCATION="../data/1/"

def mkfilepath(name):
    return os.path.join(DATA_LOCATION, name+".har")

def load_har(name):
    filename = mkfilepath(name)
    return json.load(open(filename))

def enumerate_dataset_1():
    for fname in glob.glob(DATA_LOCATION+"*.har"):
        yield fname
    

def about(obj):
    if isinstance(obj, list):
        return "<list of length {0}>".format(len(obj))
    elif isinstance(obj, dict):
        r = "{"
        is_first = True
        for k in obj.keys():
            if is_first:
                is_first = False
            else:
                r += ", "
            r +=  k + ": ..."
        r += "}"
        return r
    
def get_first_entry_content(obj):
    probe_entries = 5
    entry_total = len(obj['log']['entries'])
    for i in range(max(probe_entries, entry_total)):
        if obj['log']['entries'][i]['response']['status'] in (301, 302):
            continue
        cnt = obj['log']['entries'][i]['response']['content']
        if cnt['mimeType'] == 'text/html':
            return cnt['text']
                   
def get_soup(obj):
    contents = get_first_entry_content(obj)
    soup = BeautifulSoup(contents, 'html.parser')              
    return soup

def count_links_on_top(soup):
    head = soup.head 
    return len(head.find_all('link')) + len(head.find_all('script', src=re.compile(".")))

def purename(domain_name):
    domain_parts = domain_name.split('.')
    if len( domain_parts ) == 2:
        return domain_parts[1]
    elif len(domain_parts ) == 3:
        return domain_parts[1]
    elif len(domain_parts) == 1:
        return domain_parts[0]
    else:
        raise ValueError("Domain {0} makes no sense to purename".format(domain_name))

def count_body_links(soup, origin_domain):
    result = 0
    pn = purename(origin_domain)
    body = soup.body
    for (tagname, dattr) in [ ("img", "src"), ("link", "href"), ('script', 'src')]:
        v = body.find_all(tagname, **{dattr :re.compile('.')})
        
        for item in v:
            href = item[dattr]
            parsed_tp = up.urlparse(href)
            #print("pair: " , pn, parsed_tp.netloc, "a: ", href)
            if parsed_tp.netloc in ('', None) or pn in parsed_tp.netloc:
                result += 1
        
    return result

In [9]:
parsing_problems = 0
for fname in enumerate_dataset_1():
    try:
        obj = json.load(open(fname))
    except ValueError:
        parsing_problems += 1
        continue
    dr, domain_name = os.path.split(fname)
    domain_name = domain_name.replace('.har', '')
    # print(domain_name)
    soup = get_soup( obj )
    ln_top = count_links_on_top(soup)
    try:
        ln_body = count_body_links(soup, domain_name)
    except ValueError:
        parsing_problems += 1
        continue
    print(ln_top, ln_body)
print(parsing_problems)

20 7
5 34
10 1
1 3
8 12
2 182
2 1
24 35
3 6
13 45
9 2
12 11
16 6
3 1
17 0
30 6
4 2
17 16
20 0
1 1
10 2
3 1
0 0
45 35
11 38
12 30
11 2
15 17
11 2
10 2
5 0
6 6
3 123
7 271
17 22
13 13
11 77
16 1
10 2
3 0
27 15
5 15
27 1
12 40
5 18
3 11
12 0
25 33
5 7
17 2
10 20
2 10
4 8
51 30
34 58
7 10
38 8
25 1
5 11
18 9
10 15
10 0
11 1
32 16
13 39
16 7
21 20
6 1
58 16
10 107
61 5
28 11
18 12
11 97
2 0
7 101
9 42
5 19
23 0
23 20
19 14
8 3
16 1
5 16
10 51
19 2
3 2
14 41
14 8
7 20
14 0
11 32
14 22
11 11
8 11
8 24
14 31
24 41
6 3
6 2
8 25
9 1
55 0
31 7
8 49
23 24
16 33
27 36
5 61
6 3
19 13
3 0
11


In [10]:
mkfilepath("www.sciencedirect.com")

'../data/1/www.sciencedirect.com.har'

In [4]:
x = load_har("www.sciencedirect.com")

In [5]:
about(x)

'{log: ...}'

In [6]:
about(x['log'])

'{pages: ..., creator: ..., version: ..., entries: ...}'

In [7]:
about(x['log']['entries'])

'<list of length 32>'

In [8]:
about(x['log']['entries'][0])

'{pageref: ..., time: ..., request: ..., timings: ..., startedDateTime: ..., serverIPAddress: ..., response: ..., cache: ..., connection: ...}'

In [9]:
about(x['log']['entries'][0]['response'])

'{headers: ..., status: ..., _transferSize: ..., cookies: ..., headersSize: ..., httpVersion: ..., bodySize: ..., redirectURL: ..., statusText: ..., content: ...}'

In [10]:
about(x['log']['entries'][0]['response']['content'])

'{mimeType: ..., compression: ..., size: ...}'

In [11]:
x['log']['entries'][0]['response']['content']['compression']

0

In [12]:
about(x['log']['entries'][0]['response']['content'])

'{mimeType: ..., compression: ..., size: ...}'

In [13]:
x['log']['entries'][0]['response']['status']

301

In [14]:
get_first_entry_content(x)[:100]

'<!DOCTYPE html>\n<html class="no-js no-svg" lang="en-US">\n<head>\n    <title>ScienceDirect.com | Scien'

In [15]:
cnt = get_first_entry_content(x)

In [16]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(cnt, 'html.parser')

In [17]:
len( soup.find_all('script') ), len( soup.head.find_all('script') )

(14, 4)

In [18]:
count_links_on_top(soup)

5

In [22]:
count_body_links(soup, 'www.sciencedirect.com')

sciencedirect sdfestaticassets-us-east-1.sciencedirectassets.com
sciencedirect cdn.els-cdn.com
sciencedirect sdfestaticassets-us-east-1.sciencedirectassets.com
sciencedirect assets.adobedtm.com


2