In [1]:
import json
from tld import get_fld

In [2]:
def read_har_file(filepath: str) -> list[dict]:
    with open(filepath, 'r') as har_file:
        return json.load(har_file)['log']['entries']

accept_list = read_har_file('zalando.nl_accept.har')
reject_list = read_har_file('zalando.nl_reject.har')

In [28]:
def entry_has_header(entry: dict, entry_component: str, header_name: str) -> bool:
    valid_entry_components = ('request', 'response')
    if entry_component not in valid_entry_components:
        raise RuntimeError(f'attr \'entry_component\' must be one of {valid_entry_components}')
    
    for header in entry[entry_component]['headers']:
        if header.get('name') == header_name:
            return True
    return False

def is_third_party(first_party_domain: str, entry: dict) -> bool:
    return first_party_domain != get_fld(entry['request'].get('url'))


In [37]:
def produce_dict(har_content: list[dict], domain: str) -> dict:
    result_dict = {}
    result_dict['num_reqs'] = len(har_content)
    result_dict['num_requests_w_cookies'] = len(list(filter(lambda entry: entry_has_header(entry, 'request', 'cookie'), har_content)))
    result_dict['num_responses_w_cookies'] = len(list(filter(lambda entry: entry_has_header(entry, 'response', 'set-cookie'), har_content)))
    result_dict['third_party_domains'] = list(
        set(map(lambda entry: get_fld(entry['request'].get('url')),
            filter(lambda entry: is_third_party(domain, entry), har_content))))
    return result_dict


In [40]:
produce_dict(accept_list, 'zalando.nl')['third_party_domains']

['usabilla.com',
 'googleadservices.com',
 'facebook.com',
 'facebook.net',
 'd6tizftlrpuof.cloudfront.net',
 'doubleclick.net',
 'googletagmanager.com',
 'google-analytics.com',
 'ztat.net',
 'usercentrics.eu',
 'google.com',
 'google.nl']