In [None]:
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from playwright.async_api import async_playwright
from deep_translator import GoogleTranslator

In [None]:
# remove dead urls
# Reads the full URL list and the dead URL list (one URL per line), and writes every
# URL that is not marked dead to ALIVE_URLS_PATH, one per line.

ALL_URLS_PATH = "data/blocklist/all_urls_pos.txt"
DEAD_URLS_PATH = "data/blocklist/dead_urls_pos.txt"
ALIVE_URLS_PATH = "data/blocklist/alive_urls_pos.txt"

# Line terminators / surrounding whitespace are stripped so that a last line without a
# trailing newline still compares equal to the same URL in the other file.
# Blank lines are skipped and therefore not counted as URLs.
with open(ALL_URLS_PATH, "r") as input_file:
    all_urls = [line.strip() for line in input_file if line.strip()]

with open(DEAD_URLS_PATH, "r") as dead_file:
    dead_urls = [line.strip() for line in dead_file if line.strip()]

all_urls_set = set(all_urls)
dead_urls_set = set(dead_urls)  # set for fast membership tests

# dict.fromkeys() removes duplicates while keeping first-seen order, so the output file
# is identical between runs (iterating a set of strings is not order-stable).
alive_urls = [url for url in dict.fromkeys(all_urls) if url not in dead_urls_set]

# Terminate every URL explicitly; the entries no longer carry their own newline.
with open(ALIVE_URLS_PATH, "w") as out_file:
    out_file.write("".join(url + "\n" for url in alive_urls))

print("all urls:", len(all_urls))
print("dead urls:", len(dead_urls))
print("alive urls:", len(alive_urls))

In [None]:
# Clean the raw website texts (one website per line): normalise characters and
# whitespace, drop duplicates and drop websites with too few words.

# POSITIVES
DATA1_PATH = 'data/blocklist/pos_raw.txt'
DATA2_PATH = 'data/gov/pos_raw.txt'
DATA_CLEAN_PATH = 'data/pos_clean.txt'

# NEGATIVES
# NOTE(review): this alternative config does not redefine DATA2_PATH, so when it is
# enabled the second read below would still load the positive gov data - confirm/adjust.
# DATA1_PATH = 'data/neg_raw.txt'
# DATA_CLEAN_PATH = 'data/neg_clean.txt'

# minimum #words a website should have
# websites with small #words are often ones that only display a js/cookie-warning and should be discarded
WORD_THRES = 200

with open(DATA1_PATH, 'r') as file:
    data = file.readlines()

with open(DATA2_PATH, 'r') as file:
    data = data + file.readlines()

print("num data before:", len(data))

# remove exact duplicate lines; dict.fromkeys() keeps first-seen order so the result
# (and the written file) is the same on every run, unlike list(set(...))
data = list(dict.fromkeys(data))

data_clean = []
data_removed = []
seen_texts = set()  # normalised texts already handled
for d in data:
    d = re.sub(r'[^\w .,;:!?"\'-]', ' ', d, flags=re.UNICODE) # discard invalid characters
    d = re.sub(r'\s+', ' ', d) # compress multiple spaces into one
    d = d.strip() # remove spaces/newlines in front and end
    # Two raw lines can differ only in whitespace / discarded characters (or a missing
    # final newline) and become identical after normalisation - keep only the first.
    if d in seen_texts:
        continue
    seen_texts.add(d)
    if len(d.split()) <= WORD_THRES:
        data_removed.append(d)
        continue
    data_clean.append(d)

with open(DATA_CLEAN_PATH, 'w') as file:
    file.write('\n'.join(data_clean))

print("num data after:", len(data_clean))
print("data removed due to length:", len(data_removed))

In [None]:
# investigate what data was removed due to length
# here it should be validated that the discarded content is mostly short js/cookie-warnings

print("data removed due to length (in descending order wrt. words):")
print("-------------------------------------------------------------------------")
for removed_text in sorted(data_removed, key=lambda text: len(text.split()), reverse=True):
    # Entries in data_removed are whitespace-stripped when they are collected, so a
    # blank entry is '' (never '\n'); skip those instead of printing empty lines.
    if removed_text:
        print(removed_text)