In [1]:
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from playwright.async_api import async_playwright

In [None]:
# scrape website content from url-list

URLS_PATH = 'data/blocklist/all_urls_pos.txt'
SCRAPE_OUTPUT_PATH = 'data/blocklist/pos_raw.txt'
DEAD_URLS_PATH = 'data/blocklist/dead_urls_pos.txt'

# store urls with connectivity issues (i.e. dead urls) in own file
def write_dead(url):
    with open(DEAD_URLS_PATH, "a") as out_file:
        out_file.write(url+"\n")

TIMEOUT = 10 # seconds
async def run():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context()
        page = await context.new_page()
        await page.set_extra_http_headers({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
            'Accept-Language': 'en-US,en;q=0.5' # set preferred language to english
        })

        with open(URLS_PATH, 'r') as file:
            urls = file.readlines()
        
        for url in tqdm(urls, desc="Fetching URLs"):
            url = url.strip() # remove whitespaces and newlines from beginning/end

            try:
                await page.goto("https://"+url, timeout=TIMEOUT*1000)
                content = await page.content()
                soup = BeautifulSoup(content, "html.parser")
                content = soup.get_text(strip=True) # extract visible text only and strip it wrt. spaces
            except Exception as e:
                write_dead(url) # something went wrong with this url
                print(f"Error fetching {url}: {e}")
                continue
            
            # the preprocessing here is only done for readability of the raw data and easier handling
            if content: # some valid text was obtained
                content = re.sub(r'[^\w .,;:!?"\'-]', ' ', content, flags=re.UNICODE) # only retain valid characters
                content = re.sub(r'\s+', ' ', content) # compress multiple spaces into one
            
                with open(SCRAPE_OUTPUT_PATH, "a") as out_file:
                    out_file.write(f"{content}\n")
            else: # empty content was obained
                write_dead(url)

        await context.close()
        await browser.close()

await run()

In [3]:
# remove dead urls

ALL_URLS_PATH = "data/blocklist/all_urls_pos.txt"
DEAD_URLS_PATH = "data/blocklist/dead_urls_pos.txt"
ALIVE_URLS_PATH = "data/blocklist/alive_urls_pos.txt"

with open(ALL_URLS_PATH, "r") as input_file:
    all_urls = input_file.readlines()

with open(DEAD_URLS_PATH, "r") as dead_file:
    dead_urls = dead_file.readlines()

all_urls_set = set(all_urls) # converted to set to get 'difference' function
dead_urls_set = set(dead_urls) # converted to set to get 'difference' function
alive_urls = list(all_urls_set.difference(dead_urls_set))

with open(ALIVE_URLS_PATH, "w") as out_file:
    out_file.write("".join(alive_urls))

print("all urls:", len(all_urls))
print("dead urls:", len(dead_urls))
print("alive urls:", len(alive_urls))

all urls: 2608
dead urls: 13
alive urls: 2595


In [4]:
# POSITIVES
DATA1_PATH = 'data/blocklist/pos_raw.txt'
DATA2_PATH = 'data/gov/pos_raw.txt'
DATA_CLEAN_PATH = 'data/pos_clean.txt'

# NEGATIVES
# DATA1_PATH = 'data/neg_raw.txt'
# DATA_CLEAN_PATH = 'data/neg_clean.txt'

# minimum #words a website should have
# websites with small #words are often ones that only display a js/cookie-warning and should be discarded
WORD_THRES = 200

with open(DATA1_PATH, 'r') as file:
    data = file.readlines()

with open(DATA2_PATH, 'r') as file:
    data = data + file.readlines()

print("num data before:", len(data))

data = list(set(data)) # remove duplicate website content

data_clean = []
data_removed = []
for d in data:
    d = re.sub(r'[^\w .,;:!?"\'-]', ' ', d, flags=re.UNICODE) # discard invalid characters
    d = re.sub(r'\s+', ' ', d) # compress multiple spaces into one
    d = d.strip() # remove spaces/newlines in front and end
    if len(d.split()) <= WORD_THRES:
        data_removed.append(d)
        continue
    data_clean.append(d)

with open(DATA_CLEAN_PATH, 'w') as file:
    file.write('\n'.join(data_clean))

print("num data after:", len(data_clean))
print("data removed due to length:", len(data_removed))

num data before: 524
num data after: 150
data removed due to length: 41


In [5]:
# investigate what data was removed due to length
# here it should be validated that the discarded content is mostly short js/cookie-warnings

print("data removed due to length (in descending order wrt. words):")
print("-------------------------------------------------------------------------")
for p in sorted(data_removed, key=lambda x: len(x.split()), reverse=True):
    if p != "\n":
        print(p)

data removed due to length (in descending order wrt. words):
-------------------------------------------------------------------------
parimatchwinPokies OnlineCasino gamesSlotsCardRouletteVideo PokerPromotionsTournamentsLotteryQuestMissionVIPPayments Wheel of FortuneLoginSign UpDepositWELCOME PACK UP TO 10,000 325 FSMoreSAFARI RAGE LOTTO: 4,000 FS TO GET WEEKLYWIN MY PART!40,000 FSHELLO PRIZE DRAWGET MY PRIZE!LIVE WELCOME BONUS 100 TO 1,000Get bonus!Get bonus!DepositPARIMATCH RACE WIN 150 150 FSGET STARTED!POPULARAllNEWAllHOLD WINAllHOLD THE JACKPOTAllCard Table GamesAllGo to all gamesWHY PLAY ATPARIMATCH CASINOGENEROUS BONUSESSee All PromotionsBITCOIN FRIENDLYLearn About BTC24 7 ONLINE SUPPORTVIP PROGRAM WITH UNIQUE OFFERSWE SUPPORT RESPONSIBLE GAMBLINGLearn MoreSupportTerms and ConditionsBonus TermsPrivacy PolicyAbout BTCFAQResponsible GamingAffiliatesPersonality QuizCrypto WalletInstall AppCopyright 2021 - 2023 Parimatchwin.com is owned and operated by Hollycorn N.V., a company reg