In [1]:
import sys
import os
from BreakageClassifier.code.crawl.crawl import Crawler
from dotenv import load_dotenv, find_dotenv
from pathlib import Path
from BreakageClassifier.code.graph.database import Database

import pandas as pd

# Load the .env file, then record where it was found: the directory that
# contains it is the anchor used to locate the saliency model.
load_dotenv()
DOTENV_PATH = find_dotenv()
DIR = Path(DOTENV_PATH).parent

# Input dataset directory and crawl output directory (relative paths).
IN_PATH, OUT_PATH = Path("forums-out/easylist"), Path("crawl-out/easylist")

# Saliency model file, resolved relative to the .env directory.
_SALIENCY_MODEL_RELPATH = "../WebModelGen/block_classifier/pretrained-models/model-0.joblib"
SALIENCY_MODEL_PATH = (DIR / _SALIENCY_MODEL_RELPATH).resolve()


We need to run the crawling experiment from a console (preferably a detached one). 

```
python ../BreakageClassifier/code/crawl/crawl.py \
                --n <NUMBER OF ISSUES> \
                --issues forums-out/easylist/easylist-data.csv \
                --filters forums-out/easylist/filterlists \
                --scp ../WebModelGen/block_classifier/pretrained-models/model-0.joblib \
                --out crawl-out/easylist/datadir

```

In [3]:
# Crawler writing into a dedicated test data directory under OUT_PATH and
# using the saliency model path from the configuration cell.
_test_data_dir = OUT_PATH / 'datadir-test-saliency'
crawler = Crawler(
    saliency_classifier_path=SALIENCY_MODEL_PATH,
    data_dir=str(_test_data_dir),
)

In [None]:
# Run the crawl over the issue CSV and filter lists found under IN_PATH.
# num=50 presumably caps the number of issues (cf. `--n` on the CLI) — confirm.
_issues_csv = IN_PATH / 'easylist-data.csv'
_filterlists_dir = IN_PATH / "filterlists"
crawler.crawl_from_dataset(_issues_csv, _filterlists_dir, num=50)

# Experiments after the crawl

In [3]:
# Directory of the crawl whose database and derived CSVs the cells below read and write.
CRAWL_DIR = Path("crawl-out/adguard-full/")

## Saliency Interaction Experiments
This experiment shows how saliency limits the number of interactions
and thereby keeps the crawl time reasonable (i.e., efficiency).
For each page we measure:
- number of salient elements per page
- number of salient elements per page that we interact with
- number of possible elements to interact with per page
- average time per interaction

In [10]:
from tqdm import tqdm

# Node names counted as interaction candidates; constant across issues, so
# it is defined once instead of on every loop iteration.
interactable_nodes = ["button", "iframe", "video", "img", "submit", "radio", "checkbox", "input"]

# One record per issue that passes the error filter below.
stats = []

with Database(CRAWL_DIR / 'crawl-data.sqlite', CRAWL_DIR / 'experiments.csv') as db:

    site_visits = db.sites_visits()

    for _, issue in tqdm(site_visits.iterrows(), total=len(site_visits)):

        # Skip every issue whose `error` field is not False.
        if issue.error != False:
            continue

        # All measurements are taken on the `visit_id_a` visit of the issue.
        visit_id = issue.visit_id_a

        df_dom = db.get_dom_from_visit_id(visit_id)
        df_responses = db.get_http_responses(visit_id)
        df_javascript = db.get_javascript_events(visit_id)

        # Returns the interactions plus updated JS / response frames, which
        # carry the `interaction` column read below.
        interactions, df_javascript, df_responses = db.get_interaction_logs_all(
            visit_id, df_javascript, df_responses
        )

        commands = db.get_commands(visit_id)

        is_salient = df_dom.saliency > 0
        is_candidate = df_dom.nodeName.isin(interactable_nodes)
        interact_commands = commands[commands.command == "SalientRandomInteractCommand"]

        record = {
            "issue": issue.issue_id,
            "n_salient": len(df_dom[is_salient]),
            "n_nodes": len(df_dom),
            "n_interactions": len(interactions),
            "n_candidates": len(df_dom[is_candidate]),
            # presumably duration is stored in milliseconds, hence / 1000 — confirm
            "average_interaction_duration": interact_commands.duration.mean() / 1000,
            "n_js_events": df_javascript.interaction.notnull().sum(),
            "n_requests": df_responses.interaction.notnull().sum(),
        }
        stats.append(record)


# Persist the per-issue statistics so later cells can reload them without
# re-querying the crawl database.
pd.DataFrame(stats).to_csv(CRAWL_DIR / "saliency-interaction-stats.csv", index=False)
    
    

100%|██████████| 543/543 [49:11<00:00,  5.44s/it]


In [4]:
# Reload the per-issue statistics from the CSV under CRAWL_DIR.
saliency_interaction_stats = pd.read_csv(CRAWL_DIR / "saliency-interaction-stats.csv")

In [5]:
# Summary statistics (count, mean, std, quartiles) for every numeric column.
saliency_interaction_stats.describe()

Unnamed: 0,issue,n_salient,n_nodes,n_interactions,n_candidates,average_interaction_duration,n_js_events,n_requests
count,543.0,543.0,543.0,543.0,543.0,543.0,543.0,543.0
mean,136399.530387,28.600368,774.502762,0.883978,40.707182,9.703912,215.456722,99.867403
std,36509.402275,28.42165,752.973723,1.014371,56.275752,11.86777,389.546152,152.169153
min,51619.0,0.0,7.0,0.0,0.0,0.156,0.0,0.0
25%,140897.0,9.0,284.5,0.0,13.0,2.004,0.0,0.0
50%,151053.0,21.0,573.0,1.0,26.0,6.573,22.0,39.0
75%,156999.5,39.0,1029.0,1.0,48.5,12.8175,253.5,146.0
max,163156.0,210.0,6038.0,3.0,770.0,91.235,3148.0,1079.0


In [9]:
# Share of DOM nodes flagged salient, per issue. Computed column-wise instead
# of with a row-wise `apply`; rows where n_nodes is not > 0 get 0 rather than
# the result of a division by zero.
_n_nodes = saliency_interaction_stats["n_nodes"]
saliency_interaction_stats['r_salient'] = (
    saliency_interaction_stats["n_salient"].div(_n_nodes).where(_n_nodes > 0, 0)
)
                                                                           

In [10]:
# Distribution of the salient-node ratio across issues.
saliency_interaction_stats['r_salient'].describe()

count    543.000000
mean       0.059029
std        0.068396
min        0.000000
25%        0.014146
50%        0.040541
75%        0.081199
max        0.600000
Name: r_salient, dtype: float64

In [12]:
# Boolean masks over issues, shared by the three counts below.
_no_salient = saliency_interaction_stats.n_salient == 0
_no_interaction = saliency_interaction_stats.n_interactions == 0

# NOTE(review): the second and third labels say "interactable nodes", but the
# mask tests `n_interactions == 0` (no recorded interactions) — confirm wording.
print("Count without salient nodes: ", int(_no_salient.sum()))
print("Count without interactable nodes: ", int(_no_interaction.sum()))
print("Count without salient and interactable nodes: ", int((_no_salient & _no_interaction).sum()))

Count without salient nodes:  59
Count without interactable nodes:  250
Count without salient and interactable nodes:  59


# Debug

In [2]:
from BreakageClassifier.code.crawl.ablockers import adguard, ublock
from BreakageClassifier.code.crawl.crawl import CrawlerConfig, Crawler
from Saliency.classify import SaliencyClassifierConfig
from Saliency.segment.vips.vips import Vips
from Saliency.utils import saliency_score
from pathlib import Path

# Issue id replayed by this debug run.
DEBUG_ISSUE = 165863

# Directory holding the issue CSV and the filter lists used for the replay.
_DEBUG_FORUM_DIR = Path("../BreakageClassifier/code/forums/adguard/adguard-neg")
_DEBUG_ISSUES_CSV = _DEBUG_FORUM_DIR / "adguard-data.csv"
_DEBUG_FILTERLISTS_DIR = _DEBUG_FORUM_DIR / "filterlists"

# Configuration

# Saliency classifier: model path, VIPS segmentation, and pre-scoring with a
# threshold of 0.
_saliency_model_path = Path("../Saliency/models/rf").resolve()
saliency_conf = SaliencyClassifierConfig(
    fp=_saliency_model_path,
    segment=Vips,
    pre_scoring=saliency_score,
    pre_scoring_threshold=0,
)

# Crawler settings: AdGuard as the ad blocker, screenshots and debug logging
# enabled, headless browser.
# NOTE(review): the timeouts look like seconds (3 and 4 minutes) — confirm units.
conf = CrawlerConfig(
    saliency=saliency_conf,
    screenshots=True,
    adblocker=adguard,
    dom_dump_timeout=3 * 60,
    filterlist_load_timeout=4 * 60,
    log_debug=True,
    headless=True
)

# Single-browser crawler writing into a separate debug data directory.
_debug_data_dir = Path("crawl-out/adguard/adguard-debug-neg")
crawler = Crawler(
    num_browsers=1,
    data_dir=_debug_data_dir,
    conf=conf,
    forced=True,
)

crawler.debug_issue(_DEBUG_ISSUES_CSV, _DEBUG_FILTERLISTS_DIR, DEBUG_ISSUE)

browser_manager      - INFO     - BROWSER 742193888: Launching browser...
browser_manager      - DEBUG    - BROWSER 742193888: Spawn attempt 0 
storage_controller   - DEBUG    - Initializing new handler
deploy_firefox       - DEBUG    - BROWSER 742193888: Saved extension config file to: /tmp/firefox_profile_zg6engpf/browser_params.json
deploy_firefox       - DEBUG    - BROWSER 742193888: OpenWPM Firefox extension loaded
FirefoxExtension     - DEBUG    - Navigation instrumentation enabled
storage_controller   - DEBUG    - Initializing new handler
FirefoxExtension     - DEBUG    - Cookie instrumentation enabled
browser_manager      - DEBUG    - BROWSER 742193888: Looking for extension port information in /tmp/firefox_profile_zg6engpf
FirefoxExtension     - DEBUG    - Javascript instrumentation enabled
browser_manager      - DEBUG    - BROWSER 742193888: Connecting to extension on port 35341
FirefoxExtension     - DEBUG    - HTTP Instrumentation enabled
FirefoxExtension     - DEBUG    - C

Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.163 seconds


storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids


Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.112 seconds
Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.118 seconds


browser_manager      - INFO     - BROWSER 742193888: EXECUTING COMMAND: WaitCommand(30)
addon                - INFO     - Waiting 30 seconds


Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.126 seconds


storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids


Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.135 seconds


storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
browser_manager      - INFO     - BROWSER 742193888: EXECUTING COMMAND: TryEvadeCookiesBannerCommand
cookies              - INFO     - Unable to evade cookies banner or it doesn't exist...
browser_manager      - INFO     - BROWSER 742193888: EXECUTING COMMAND: SaveScreenshotCommand(after-base)
storage_controller   - DEBUG    - Initializing new handler
storage_controller   - INFO     - Terminating handler, because the underlying socket closed
browser_manager      - INFO     - BROWSER 742193888: EXECUTING COMMAND: SalientDomDumpCommand
dom                  - DEBUG    - DOM dump received in 0.056164 seconds
classify             - DEBUG    - Encoding DOM

Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.127 seconds
Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.132 seconds


storage_controller   - DEBUG    - StorageController status: There are currently 2 scheduled tasks for 1 visit_ids
browser_manager      - INFO     - BROWSER 742193888: EXECUTING COMMAND: WaitCommand(30)
addon                - INFO     - Waiting 30 seconds


Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.137 seconds
Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.201 seconds


storage_controller   - DEBUG    - StorageController status: There are currently 51 scheduled tasks for 1 visit_ids


Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.147 seconds


storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
browser_manager      - INFO     - BROWSER 742193888: EXECUTING COMMAND: TryEvadeCookiesBannerCommand
cookies              - INFO     - Unable to evade cookies banner or it doesn't exist...
browser_manager      - INFO     - BROWSER 742193888: EXECUTING COMMAND: SaveScreenshotCommand(before-base)
storage_controller   - DEBUG    - Initializing new handler
storage_controller   - INFO     - Terminating handler, because the underlyi

Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.118 seconds


storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids


Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.152 seconds


storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids


Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.138 seconds


browser_manager      - INFO     - BROWSER 742193888: EXECUTING COMMAND: WaitCommand(30)
addon                - INFO     - Waiting 30 seconds
storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids


Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.136 seconds


storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids


Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.176 seconds


storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids


Executing <Task pending name='Task-12' coro=<StorageController._handler() running at /home/saiid/OpenWPM/openwpm/storage/storage_controller.py:95> wait_for=<Future pending cb=[Task.task_wakeup()] created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/base_events.py:429> created at /home/saiid/miniconda3/envs/openwpm/lib/python3.10/asyncio/streams.py:244> took 0.192 seconds


browser_manager      - INFO     - BROWSER 742193888: EXECUTING COMMAND: TryEvadeCookiesBannerCommand
cookies              - INFO     - Unable to evade cookies banner or it doesn't exist...
browser_manager      - INFO     - BROWSER 742193888: EXECUTING COMMAND: SaveScreenshotCommand(None-base)
storage_controller   - DEBUG    - Initializing new handler
storage_controller   - INFO     - Terminating handler, because the underlying socket closed
storage_controller   - DEBUG    - StorageController status: There are currently 0 scheduled tasks for 1 visit_ids
browser_manager      - INFO     - BROWSER 742193888: EXECUTING COMMAND: DomDumpCommand
storage_controller   - DEBUG    - Initializing new handler
browser_manager      - INFO     - BROWSER 742193888: EXECUTING COMMAND: SalientRepeatInteractCommand
interact             - INFO     - 3 possible interactions
storage_controller   - INFO     - Terminating handler, because the underlying socket closed
interact             - INFO     - <class 'Br

# Stats

## Running time

In [2]:
CRAWL_DIR = Path("crawl-out/adguard-full/")

# Read the site-visit table and the command runtimes within a single
# database session.
_sqlite_path = CRAWL_DIR / 'crawl-data.sqlite'
_experiments_path = CRAWL_DIR / 'experiments.csv'

with Database(_sqlite_path, _experiments_path) as db:
    site_visits = db.sites_visits()
    command_durations = db.get_runtime()

In [7]:
from numpy import result_type


# Commands whose mean runtime is reported for each visit. These strings are
# also the labels of the resulting Series (and therefore the column names of
# any table built from it).
commands = [
    'ClearCookiesCommand',
    'FilterListLoadCommand',
    'GetCommand',
    'TryEvadeCookieBannerCommand',
    'SalientDomDumpCommand',
    'SalientRandomInteractCommand',
    'DOMDumpCommand',
    'SalientRepeatInteractCommand',
]

# The crawler log in this notebook prints two of the commands above under a
# different spelling. Rows recorded under either spelling are counted under
# the label listed in `commands`, so the output labels stay unchanged.
# NOTE(review): confirm which spelling the runtime table actually stores.
_COMMAND_ALIASES = {
    'TryEvadeCookieBannerCommand': ['TryEvadeCookiesBannerCommand'],
    'DOMDumpCommand': ['DomDumpCommand'],
}


def get_visit_command_runtimes(df_visit):
    """Summarise the command runtimes of a single visit.

    Args:
        df_visit: DataFrame with one row per executed command, providing a
            `command` column (command name) and a numeric `duration` column.

    Returns:
        pd.Series indexed by every entry of `commands` plus `'total'`. Each
        command entry is the mean `duration` of the matching rows divided by
        1000, or 0 when no row matches; `'total'` is the sum of all
        `duration` values divided by 1000.
    """
    out = {}

    for command in commands:
        # Accept the listed name as well as any known alternate spelling.
        spellings = [command, *_COMMAND_ALIASES.get(command, [])]
        _match = df_visit[df_visit.command.isin(spellings)]
        out[command] = _match.duration.mean() / 1000 if len(_match) > 0 else 0

    # The total covers every row of the visit, not only the tracked commands.
    out['total'] = df_visit.duration.sum() / 1000

    return pd.Series(out)

    
# One row per visit_id: the per-command runtime summary produced by
# get_visit_command_runtimes, with visit_id restored as a regular column.
_per_visit_runtimes = command_durations.groupby('visit_id').apply(get_visit_command_runtimes)
visit_durations = _per_visit_runtimes.reset_index()

In [24]:
# Collect, for every site, the runtime rows of its three visits and tag each
# with the issue id and the visit's position in (visit_id_a, visit_id_b, visit_id_u).
_duration_frames = []

for _, site in site_visits.iterrows():
    ordered_visit_ids = (site.visit_id_a, site.visit_id_b, site.visit_id_u)

    for order, vid in enumerate(ordered_visit_ids):
        tagged = visit_durations.loc[visit_durations.visit_id == vid].assign(
            issue=site.issue_id,
            visit_order=order,
        )
        _duration_frames.append(tagged)

site_durations = pd.concat(_duration_frames)

In [25]:
# Mean of the per-visit `total` runtime (sum of durations / 1000) over all visits.
site_durations.total.mean()

48.11472007366483

In [26]:
# Persist the tagged per-visit runtimes under CRAWL_DIR, without the index column.
site_durations.to_csv(CRAWL_DIR / "site-durations.csv", index=False)

In [40]:
# NOTE(review): visit_order == 2 selects the third visit id of each site
# (visit_id_u), despite the `first_crawls` name — confirm which crawl is meant.
_is_order_two = site_durations["visit_order"] == 2
first_crawls = site_durations.loc[_is_order_two]
first_crawls["total"].std()

17.893149708373574

In [36]:
# Sum of all visit totals divided by 60 twice — hours, assuming `total` is in seconds (confirm).
site_durations.total.sum() / 60 / 60

21.771910833333333