In [20]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

import time
import random

from bs4 import BeautifulSoup
import requests as req
from fake_useragent import UserAgent
import pandas as pd

import json
import w3lib.html

from lxml import html
from tqdm import tqdm

In [21]:
class Scraper():
    """Scrape articles from the parool.nl daily archive.

    The stages are meant to be run in order, each storing its result on the
    instance for the next one to read:

    1. ``index_archive``  -> ``self.archive_urls``   (one URL per day)
    2. ``index_urls``     -> ``self.article_urls``   (article links per day)
    3. ``crawl_articles`` -> ``self.articles``       (raw HTML documents)
    4. ``clean_articles`` -> ``self.cleaned_articles`` (extracted fields)
    5. ``save_json``      -> JSON file on disk
    """

    # Base URL of the per-day archive pages; the date is appended as YYYY/MM/DD.
    ARCHIVE_URL = "https://www.parool.nl/archief"
    # Seconds to wait for a response before a request is abandoned.
    REQUEST_TIMEOUT = 30
    # Inclusive upper bound, in whole seconds, of the random pause between requests.
    MAX_DELAY = 3

    def __init__(self):
        self.ua = UserAgent()
        self.headers = req.utils.default_headers()

    # ------------------------------------------------------------------ helpers

    def _pause(self):
        """Sleep a random whole number of seconds to simulate human usage."""
        time.sleep(random.randint(0, self.MAX_DELAY))

    def _fetch(self, url):
        """Download ``url`` with a freshly randomized User-Agent.

        Returns:
            dict: ``{'url', 'raw_html', 'in_cache_date'}`` on success, or an
            empty dict when the request fails (network error, timeout or a
            non-2xx status). Failures are reported via ``tqdm.write`` so the
            progress bar is not corrupted.
        """
        try:
            # Random User Agent
            self.headers.update({'User-Agent': self.ua.random})
            # The headers must be passed explicitly; updating self.headers
            # alone does not affect module-level req.get() calls.
            r = req.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            r.raise_for_status()
        except req.exceptions.RequestException as err:
            # RequestException covers HTTPError as well as connection
            # errors and timeouts.
            tqdm.write(f"Failed to fetch {url}: {err}")
            return {}

        return {
            'url': url,
            'raw_html': r.content,
            'in_cache_date': datetime.utcnow(),
        }

    def _parse_archive_page(self, url):
        """Return the list of article hrefs found on one archive page.

        An empty list is returned when the page could not be downloaded or
        parsed, so callers can always ``extend`` with the result.
        """
        doc = self._fetch(url)
        if not doc:
            return []

        try:
            tree = html.fromstring(doc['raw_html'])
        except Exception as err:  # lxml raises on empty/unparseable documents
            tqdm.write(f"Failed to parse {url}: {err}")
            return []

        return list(tree.xpath('/html/body/main/div[2]/*/a[@class="teaser__link"]/@href'))

    @staticmethod
    def _first(tree, xpath):
        """Return the stripped first match of ``xpath`` in ``tree``, or None if there is none."""
        matches = tree.xpath(xpath)
        return matches[0].strip() if matches else None

    def _parse_article(self, r):
        """Extract the article fields from one fetched document.

        Args:
            r (dict): a non-empty document as produced by ``_fetch``.

        Returns:
            dict: keys ``url``, ``timestamp``, ``title``, ``publisherID``,
            ``cleantext`` and ``category``. A field that cannot be located is
            ``None``; if the HTML cannot be parsed at all, every field except
            ``url`` is ``None``.
        """
        a = {
            'url': r['url'],
            'timestamp': None,
            'title': None,
            'publisherID': None,
            'cleantext': None,
            'category': None,
        }

        try:
            tree = html.fromstring(r['raw_html'])
        except Exception as err:  # lxml raises on empty/unparseable documents
            tqdm.write(f"Failed to parse {r['url']}: {err}")
            return a

        a['timestamp'] = self._first(
            tree,
            '/html/body/main/article/header/section/time[@class="artstyle__production__datetime"]/@datetime',
        )
        a['title'] = self._first(tree, '//h1/text()')
        a['publisherID'] = self._first(tree, '/html/body/main/article/header/section/span[1]/a/text()')
        # The first matched text node is dropped, as in the original extraction.
        a['cleantext'] = ''.join(tree.xpath('//html/body/main/article/section/section/*/text()')[1:])
        a['category'] = self._first(tree, '/html/body/main/section/div[2]/h2/a/span/text()')
        return a

    # ------------------------------------------------------------------- stages

    def index_archive(self, start='2023-08-01', end='2023-08-02'):
        """Build ``self.archive_urls``, one archive URL per day in ``[start, end]``.

        Args:
            start: first day (inclusive), anything ``pd.date_range`` accepts.
            end: last day (inclusive), anything ``pd.date_range`` accepts.
        """
        days = pd.date_range(start, end, freq='D')
        self.archive_urls = [
            f"{self.ARCHIVE_URL}/{day.strftime('%Y/%m/%d')}" for day in days
        ]

    def index_urls(self):
        """Visit every archive page and collect article links into ``self.article_urls``.

        Requires ``index_archive`` to have been called first. Archive pages
        that fail to download or parse contribute no URLs.
        """
        self.article_urls = []
        print('Indexing urls: ')
        for url in tqdm(self.archive_urls):
            # Simulate human usage
            self._pause()
            self.article_urls.extend(self._parse_archive_page(url))

    def crawl_articles(self):
        """Download every URL in ``self.article_urls`` into ``self.articles``.

        ``self.articles`` stays index-aligned with ``self.article_urls``: a
        failed download is stored as an empty dict. Successful documents get
        ``parsed = 0``.
        """
        self.articles = []
        print('Crawling articles: ')
        for url in tqdm(self.article_urls):
            # Simulate human usage
            self._pause()
            document = self._fetch(url)
            if document:
                document['parsed'] = 0
            self.articles.append(document)

    def clean_articles(self):
        """Parse every downloaded article into ``self.cleaned_articles``.

        Empty documents (failed downloads) are skipped so the output holds
        only records with a consistent set of keys.
        """
        print('Cleaning articles:')
        self.cleaned_articles = []
        for article in tqdm(self.articles):
            if not article:
                continue
            self.cleaned_articles.append(self._parse_article(article))

    def save_json(self, path='HetParool.json'):
        """Write ``self.cleaned_articles`` to ``path`` as indented JSON."""
        with open(path, 'w', encoding='utf-8') as file:
            json.dump(self.cleaned_articles, file, indent=4)
            
    

In [22]:
# Create the scraper; the constructor sets up a UserAgent source and default request headers.
scraper = Scraper()

In [23]:
# Build the list of daily archive page URLs (stored on scraper.archive_urls).
scraper.index_archive()

In [24]:
# Fetch each archive page and collect the article links into scraper.article_urls.
scraper.index_urls()

Indexing urls: 


100%|██████████| 2/2 [00:02<00:00,  1.44s/it]


In [25]:
# Inspect the collected article URLs (prints the whole list).
print(scraper.article_urls)

['https://www.parool.nl/wereld/aanklacht-tegen-donald-trump-wegens-poging-tot-belemmeren-verkiezingsuitslag-in-2020~b1217c6b/', 'https://www.parool.nl/nederland/aantal-vliegreizigers-schiphol-groeit-onstuimig-met-52-5-miljoen-tweede-luchthaven-van-eu~bfc809a8/', 'https://www.parool.nl/kunst-media/rechtszaak-tegen-lizzo-zangeres-dwong-haar-personeel-in-amsterdamse-stripclub-tot-seksuele-handelingen~bdbbecb3/', 'https://www.parool.nl/nederland/overval-op-winkel-gelderlandplein-twee-verdachten-aangehouden~be21624e/', 'https://www.parool.nl/nederland/omtzigt-stelt-besluit-over-politieke-toekomst-uit-eerst-op-vakantie~baba0477/', 'https://www.parool.nl/nederland/ihattaren-op-weg-naar-turkse-club-samsunspor~bab7836c/', 'https://www.parool.nl/ps/pap-huilde-omdat-hij-overspoeld-werd-door-een-intens-geluk-over-ons-gezin~bb388ec3/', 'https://www.parool.nl/ps/een-nieuwe-hobby-dankzij-storm-poly-van-omgewaaide-bomen-parkbankjes-maken~b91ed4fb/', 'https://www.parool.nl/ps/ooit-ontwierp-annet-weelin

In [26]:
# Number of article URLs collected from the indexed archive pages.
print(len(scraper.article_urls))

85


In [27]:
# Download the raw HTML of every indexed article; pauses 0-3 s before each request, so this is slow.
scraper.crawl_articles()

Crawling articles: 


100%|██████████| 85/85 [03:09<00:00,  2.22s/it]


In [28]:
# Extract timestamp, title, publisherID, cleantext and category from each downloaded article.
scraper.clean_articles()

Cleaning articles:


100%|██████████| 85/85 [00:00<00:00, 291.80it/s]


In [29]:
# Write the cleaned articles to HetParool.json in the working directory.
scraper.save_json()

In [52]:
# Reload the saved JSON as a DataFrame to check that the file round-trips.
df = pd.read_json('HetParool.json')

# Show the extracted body text of the first article as a sanity check.
display(df['cleantext'].iloc[0])

'\n   \n    \n      Trump is op vier punten, waaronder samenzwering om de Verenigde Staten te bedriegen, aangeklaagd voor zijn pogingen om de uitslag van de presidentsverkiezingen van 2020 om te keren. De aanklacht is de derde strafzaak die is aangespannen tegen de voormalige president, tevens huidige koploper in de race om de Republikeinse nominatie voor de presidentsverkiezingen van 2024.\n    \n      \n    \n      In de aanklacht van 45 pagina’s staat dat Trump na zijn verlies in 2020 ‘vastbesloten was om aan de macht te blijven’ en samenzweringen pleegde die gericht waren op een ‘fundamentele functie van de federale regering van de Verenigde Staten: het nationale proces van het verzamelen, tellen en certificeren van de resultaten van de presidentsverkiezingen.’\n    \n      In de aanklacht worden zes mede-samenzweerders genoemd. Hun namen worden niet genoemd. Ook is niet duidelijk of zij apart zullen worden aangeklaagd.\n    \n      \n    \n      Een woordvoerder van Trump vergelee