In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from playwright.async_api import async_playwright

import nltk
from nltk.corpus import stopwords
from nltk.corpus import words as english_words
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
URLS_PATH = 'data/blocklist/all_urls_pos.txt'
SCRAPE_OUTPUT_PATH = 'data/blocklist/pos_raw.txt'
DEAD_URLS_PATH = 'data/blocklist/dead_urls_pos.txt'

# store urls with connectivity issues (i.e. dead urls) in own file
def write_dead(url):
    with open(DEAD_URLS_PATH, "a") as out_file:
        out_file.write(url+"\n")

TIMEOUT = 10 # seconds
async def run():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context()
        page = await context.new_page()
        await page.set_extra_http_headers({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
            'Accept-Language': 'en-US,en;q=0.5' # set preferred language to english
        })

        with open(URLS_PATH, 'r') as file:
            urls = file.readlines()
        
        for url in tqdm(urls, desc="Fetching URLs"):
            url = url.strip() # remove whitespaces and newlines from beginning/end

            # fetch content 
            content = ""
            try:
                await page.goto("https://"+url, timeout=TIMEOUT*1000)
                content = await page.content()
                soup = BeautifulSoup(content, "html.parser")
                content = soup.get_text(strip=True)
            except Exception as e:
                write_dead(url)
                print(f"Error fetching {url}: {e}")
                continue
            
            if content:
                content = re.sub(r'[^\w .,;:!?"\'-]', ' ', content, flags=re.UNICODE)
                content = re.sub(r'\s+', ' ', content) # compress multiple spaces into one
                content = content.strip()
            
                with open(SCRAPE_OUTPUT_PATH, "a") as out_file:
                    out_file.write(f"\n\n{content}\n\n")
            else:
                write_dead(url)


        await context.close()
        await browser.close()

await run()

In [3]:
# remove dead urls
INPUT_ALL = "data/blocklist/all_urls_pos.txt"
INPUT_DEAD = "data/blocklist/dead_urls_pos.txt"
OUTPUT = "data/blocklist/alive_urls_pos.txt"


with open(INPUT_ALL, "r") as input_file:
    all_urls = input_file.readlines()

with open(INPUT_DEAD, "r") as dead_file:
    dead_urls = dead_file.readlines()



alive = list(set(all_urls).difference(set(dead_urls)))

print(len(all_urls))
print(len(dead_urls))
print(len(alive))

with open(OUTPUT, "w") as out_file:
    out_file.write("".join(alive))


2608
1734
1045


In [53]:
# POSITIVES
DATA1_PATH = 'data/blocklist/pos_raw.txt'
DATA2_PATH = 'data/gov/pos_raw.txt'
DATA_CLEAN_PATH = 'data/pos_clean.txt'

# NEGATIVES
# DATA1_PATH = 'data/neg_raw.txt'
# DATA_CLEAN_PATH = 'data/neg_clean.txt'

WORD_THRES = 200

with open(DATA1_PATH, 'r') as file:
    data = file.readlines()

with open(DATA2_PATH, 'r') as file:
    data = data + file.readlines()


print("before:", len(data))

data = list(set(data)) # remove duplicates

data_clean = []
data_removed = []
for p in data:
    p = re.sub(r'[^\w .,;:!?"\'-]', ' ', p, flags=re.UNICODE) # only keep valid characters
    p = re.sub(r'\s+', ' ', p) # compress multiple spaces into one
    p = p.strip() # remove spaces in front and end
    if len(p.split()) <= WORD_THRES:
        data_removed.append(p)
        continue
    data_clean.append(p)

with open(DATA_CLEAN_PATH, 'w') as file:
    file.write('\n'.join(data_clean))

print("after:", len(data_clean))
print("removed due to length:", len(data) - len(data_clean))
print()

print("removed due to length descending:")
for p in sorted(data_removed, key=lambda x: len(x.split()), reverse=True):
    if p != "\n":
        print(p)

before: 2882


after: 751
removed due to length: 291

removed due to length descending:
Lotto, Keno, Pitkäveto ja muut pelit verkkokaupasta - VeikkausVeikkausSiirry sisältöönSaavutettavuusPelaamisen hallintaPelitLototVedotKasino ja PokeriNettiarvat ja eBingoRahansiirtoOmat pelitTuloksetPorukkapelitVeikkausTVPikalinkitSovellusX-lehtiPelaa maltillaAsiakaspalveluKirjaudutai rekisteröidyEtusivuOma pelaaminenPelaamisen hallintaEdut ja arvonnatOmat tiedotTuloksetPorukkapelitLisääEtusivuVikinglotossa jaossa 25 miljoonaa euroa keskiviikon arvonnasta yli neljän miljoonan euron voittoNäytä uutisetPelaaminen tunnistautuneenaTilaa Veikkaus-korttiVikinglotto25 000 000 JackpotEurojackpot67 000 000 Lotto7 000 000 KenoKirjaudu sisäänKäyttäjätunnusSalasanaKirjauduUnohtuiko käyttäjätunnus tai salasana?AjankohtaistaJoulun vetovoimaisin kalenteriKohteissa yli miljoona euroa bonusrahaaJoulukalenteriin KenoIsoja ja pieniä unelmia kolmesti päivässäLottopelitMörkökuoroSisäänkirjautumalla saat lisätietoaNettiarpaKurkot ja Ro