In [40]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta, date
from dateutil import parser
import time
import random
import pymongo
from dotenv import load_dotenv
import os
import scraper_faz
import numpy as np
import pandas as pd

In [41]:
# Pull settings from a local .env file into the process environment, then read
# the MongoDB connection string from it.
load_dotenv()
mongodb_uri = os.environ.get("MONGODB_URI")

# Handles for the "media-scraper" database and the two collections this
# notebook works with: scraped articles and per-day execution records.
mongoclient = pymongo.MongoClient(mongodb_uri)
mongodb = mongoclient.get_database("media-scraper")
mongo_articles = mongodb.get_collection("articles")
mongo_execs = mongodb.get_collection("executions")

In [42]:
# Size (in days) of the look-back window checked for missing scraper executions.
LOOKBACK_DAYS = 30

# Load every recorded scraper execution.
# NOTE(review): each document is assumed to carry a datetime under "from" that
# marks the scraped day -- confirm against scraper_faz.
execs = pd.DataFrame(list(mongo_execs.find()))

# An empty collection produces a DataFrame without a "from" column; treat that
# as "nothing scraped yet" instead of failing with a KeyError. Missing values
# are dropped because they cannot correspond to any calendar day.
if "from" in execs.columns:
    scraped = [stamp.date() for stamp in execs["from"].dropna()]
else:
    scraped = []
scraped_days = set(scraped)

# Yesterday back to LOOKBACK_DAYS days ago; today itself is not included.
today = date.today()
last_30 = [today - timedelta(days=offset) for offset in range(1, LOOKBACK_DAYS + 1)]

# Days in the window with no execution record, oldest first. The comprehension
# variable is intentionally not named `date`, which is the imported class.
missing_dates = sorted(day for day in last_30 if day not in scraped_days)
missing_dates

[datetime.date(2023, 11, 23),
 datetime.date(2023, 11, 24),
 datetime.date(2023, 11, 25),
 datetime.date(2023, 11, 26),
 datetime.date(2023, 11, 27),
 datetime.date(2023, 11, 28),
 datetime.date(2023, 11, 29),
 datetime.date(2023, 11, 30),
 datetime.date(2023, 12, 1),
 datetime.date(2023, 12, 2),
 datetime.date(2023, 12, 3),
 datetime.date(2023, 12, 4),
 datetime.date(2023, 12, 5),
 datetime.date(2023, 12, 6)]

In [43]:
# Backfill: scrape every day listed in `missing_dates` and store one execution
# record per day.
#
# The loop variables are deliberately not called `date`: rebinding that name at
# notebook level would shadow the imported `datetime.date` class, so re-running
# the cell that computes `missing_dates` would call `.today()` on a datetime
# instance and no day would match the already-scraped dates.
for missing_day in missing_dates:
    # Midnight at the start of the day, as a datetime.
    day_start = datetime.combine(missing_day, datetime.min.time())
    print("Time index:", day_start)
    # NOTE(review): the recorded output of this cell shows requests for the day
    # *before* the printed time index (e.g. index 2023-11-23 -> from=22.11.2023).
    # Verify how scraper_faz converts param_from/param_to (time zone handling?).
    # NOTE(review): `execution` is presumably filled in by request_articles
    # (e.g. with the requested range) -- confirm against scraper_faz.
    execution = {}
    articles = scraper_faz.request_articles(
        execution,
        param_from=day_start,
        param_to=day_start
    )

    articles_parsed = 0
    articles_inserted = 0
    for article_href in articles:
        try:
            # Random pause between requests to avoid hammering the site.
            time.sleep(random.uniform(1, 3))
            print("Scraping: ", article_href)
            article_data = scraper_faz.parse_article(article_href)
            print("Parsed: ", article_data)
            articles_parsed += 1
            try:
                mongo_articles.insert_one(article_data)
                print("Inserted instance into database.")
                articles_inserted += 1
            except Exception as ex:
                print("Unable to persist instance: ", ex)
        except Exception as ex:
            # Best effort per article: report the cause and carry on with the
            # next one. `Exception` (rather than a bare `except:`) lets
            # KeyboardInterrupt / SystemExit propagate so the cell can be stopped.
            print("unable to scrape the article:", repr(ex))

    # Persist the per-day counters together with whatever request_articles
    # stored in `execution`.
    execution["articles_parsed"] = articles_parsed
    execution["articles_inserted"] = articles_inserted
    mongo_execs.insert_one(execution)

Time index: 2023-11-23 00:00:00
Requesting:  https://www.faz.net/suche/?query=&ct=article&ct=blog&ct=storytelling&author=&from=22.11.2023&to=22.11.2023
Articles count:  20
Found a next page. Requesting:  https://www.faz.net/suche/s2.html?ct=article&ct=blog&ct=storytelling&&from=22.11.2023&to=22.11.2023#listPagination
Requesting:  https://www.faz.net/suche/s2.html?ct=article&ct=blog&ct=storytelling&&from=22.11.2023&to=22.11.2023
Articles count:  20
Found a next page. Requesting:  https://www.faz.net/suche/s3.html?ct=article&ct=blog&ct=storytelling&&from=22.11.2023&to=22.11.2023#listPagination
Requesting:  https://www.faz.net/suche/s3.html?ct=article&ct=blog&ct=storytelling&&from=22.11.2023&to=22.11.2023
Articles count:  20
Found a next page. Requesting:  https://www.faz.net/suche/s4.html?ct=article&ct=blog&ct=storytelling&&from=22.11.2023&to=22.11.2023#listPagination
Requesting:  https://www.faz.net/suche/s4.html?ct=article&ct=blog&ct=storytelling&&from=22.11.2023&to=22.11.2023
Articles