In [2]:
import json
from tld import get_fld

In [3]:
def read_har_file(filepath: str) -> list[dict]:
    with open(filepath, 'r') as har_file:
        return json.load(har_file)['log']['entries']

accept_list = read_har_file('zalando.nl_accept.har')
reject_list = read_har_file('zalando.nl_reject.har')

In [4]:
def entry_has_header(entry: dict, entry_component: str, header_name: str) -> bool:
    valid_entry_components = ('request', 'response')
    if entry_component not in valid_entry_components:
        raise RuntimeError(f'attr \'entry_component\' must be one of {valid_entry_components}')
    
    for header in entry[entry_component]['headers']:
        if header.get('name') == header_name:
            return True
    return False

def is_third_party(first_party_domain: str, entry: dict) -> bool:
    return first_party_domain != get_fld(entry['request'].get('url'))

def is_age_greater_than(cookie: str, min_age: int) -> bool:
    cookie_attrs = cookie.split(';')

def has_tracking_cookies(entry: dict):
    for header in entry['responde']['headers']:
        if header.get('name') == 'set-cookie' and 'SameSite=None' in header.get('value') and is_age_greater_than(header.get('value', 60)):
            return True
    return False

In [5]:
def produce_dict(har_content: list[dict], domain: str) -> dict:
    result_dict = {}
    result_dict['num_reqs'] = len(har_content)
    result_dict['num_requests_w_cookies'] = len(list(filter(lambda entry: entry_has_header(entry, 'request', 'cookie'), har_content)))
    result_dict['num_responses_w_cookies'] = len(list(filter(lambda entry: entry_has_header(entry, 'response', 'set-cookie'), har_content)))
    result_dict['third_party_domains'] = list(
        set(map(lambda entry: get_fld(entry['request'].get('url')),
            filter(lambda entry: is_third_party(domain, entry), har_content))))
    return result_dict


In [6]:
produce_dict(accept_list, 'zalando.nl')

{'num_reqs': 556,
 'num_requests_w_cookies': 124,
 'num_responses_w_cookies': 98,
 'third_party_domains': ['usercentrics.eu',
  'googleadservices.com',
  'd6tizftlrpuof.cloudfront.net',
  'google.com',
  'doubleclick.net',
  'google.nl',
  'ztat.net',
  'facebook.net',
  'googletagmanager.com',
  'usabilla.com',
  'facebook.com',
  'google-analytics.com']}

In [11]:
list(map(lambda x: x.strip(),"fvgs_ml=mosaic; Path=/; Domain=zalando.nl; Expires=Fri, 08 Mar 2024 14:00:54 GMT; Max-Age=777600; Secure".split(';')))

['fvgs_ml=mosaic',
 'Path=/',
 'Domain=zalando.nl',
 'Expires=Fri, 08 Mar 2024 14:00:54 GMT',
 'Max-Age=777600',
 'Secure']