In [21]:
import ast
import locale
import os
import re
import unicodedata
from datetime import datetime

import pandas as pd
from docx import Document
from tqdm import tqdm
# Switch every locale category of the process to "nl_NL". The article dates are
# later parsed with datetime.strptime's %B directive, which matches month names
# in the active locale (presumably Dutch names from months.txt — TODO confirm).
locale.setlocale(locale.LC_ALL, "nl_NL")

'nl_NL'

In [22]:
def _read_literal(path):
    """Return the Python literal stored in the text file at ``path``."""
    with open(path, 'r') as handle:
        return ast.literal_eval(handle.read())


# `papers` and `months` are iterated by extract_article_info to build its
# header regex; `parties` is loaded here but not used in the visible cells.
papers = _read_literal('paper_selectors.txt')
parties = _read_literal('party_selectors.txt')
months = _read_literal('months.txt')

In [23]:
# Define a function to extract article information
def extract_article_info(article):
    """Parse one raw article into a flat metadata record.

    Args:
        article: Two-item sequence of strings. ``article[0]`` is the header
            text, searched for ``<title> <newspaper> <day> <month> 2023``.
            ``article[1]`` is the text searched for the ``Byline:``,
            ``Section:`` and ``Body`` fields.

    Returns:
        dict with the keys ``'Newspaper'``, ``'Title'``, ``'Date'``,
        ``'Author'``, ``'Section'`` and ``'Body'``. A field that cannot be
        found is ``None``. ``'Date'`` is a ``datetime`` or ``None`` when the
        date is missing or cannot be parsed. ``'Title'`` is ``None`` when the
        header carries the placeholder ``'No Headline In Original'``.

    Relies on the module-level ``papers`` and ``months`` iterables.
    """
    # Initialize variables to store article information
    newspaper = title = date = author = section = body = None

    # Every paper/month pair is tried and a later match overwrites an earlier
    # one, so the last matching combination wins.
    # NOTE(review): `paper` and `month` are interpolated unescaped, so they act
    # as regex fragments, and the year is hard-coded to 2023 — headers dated in
    # any other year will not match.
    for paper in papers:
        for month in months:
            metadata = re.search(rf'\s(.*?)\s({paper})\s(\d+\s{month}\s2023)', article[0])
            if metadata is not None:
                title, newspaper, date = metadata.groups()

    author_match = re.search(r'Byline:\s(.*?)\s(Body|Highlight:)', article[1])
    if author_match is not None:
        author = author_match.group(1)

    section_match = re.search(r'Section:\s(.*?);', article[1])
    if section_match:
        section = section_match.group(1)

    # Prefer a body that stops before one of the known trailer phrases; when
    # none is present, take everything after the "Body" marker.
    body_match = re.search(r'Body\s{3,}?(.*)\s+?(PDF-bestand|Bekijk\shier|Bekijk\sde\soorspronkelijke\spagina|Link\snaar\sPDF)', article[1])
    if not body_match:
        body_match = re.search(r'Body\s{3,}?(.*)', article[1])
    if body_match:
        body = body_match.group(1).strip()

    # %B matches the month name of the active locale. Only the expected
    # parse failure is caught, so interrupts and genuine bugs still surface.
    date_obj = None
    if date is not None:
        try:
            date_obj = datetime.strptime(date, "%d %B %Y")
        except ValueError:
            date_obj = None

    return {
        'Newspaper': newspaper,
        'Title': None if title == 'No Headline In Original' else title,
        'Date': date_obj,
        'Author': author,
        'Section': section,
        'Body': body
    }

In [24]:
# Parse every daily DOCX export into one flat list of article records.
articles = []

today = datetime.now().strftime('%Y-%m-%d')

# One 'YYYY-MM-DD' string per calendar day from the first export date to today.
dates = pd.date_range('2023-08-21', f'{today}', freq='D').strftime('%Y-%m-%d')

for date in tqdm(dates):
    articles_in_day = []
    # Exports are named "<date> <n>.DOCX"; indices 0-7 are probed for each day.
    for n in range(8):
        docx_path = f"../../data/raw_dataset_new/{date} {n}.DOCX"

        # Not every date/index combination exists; skip the gaps explicitly
        # instead of relying on a swallowed exception.
        if not os.path.isfile(docx_path):
            continue

        try:
            doc = Document(docx_path)
            # Every paragraph is followed by a single space, as before.
            docs = "".join(para.text + " " for para in doc.paragraphs)

            # Split the text into individual articles based on "End of Document" separator, remove last empty article
            articles_otd = [chunk.split("\n") for chunk in re.split(r'End of Document', docs)]
            articles_otd.pop(-1)
            # Keep line 1 (header) and line 3 (content) of each article; the
            # content is NFKD-normalized before the fields are extracted.
            articles_otd = [
                [lines[1], unicodedata.normalize('NFKD', lines[3])]
                for lines in articles_otd
            ]

            for article_otd in articles_otd:
                articles_in_day.append(extract_article_info(article_otd))

        except Exception as exc:
            # Stay best-effort: one unreadable or oddly structured file must
            # not abort the run, but it is reported rather than hidden.
            tqdm.write(f"Skipping {docx_path}: {exc!r}")

    articles.extend(articles_in_day)

100%|██████████| 109/109 [01:23<00:00,  1.30it/s]


In [25]:
# Tabulate the parsed article records and persist the table as JSON.
output_path = "../../data/01_cleaned_articles.json"
df = pd.DataFrame.from_dict(articles)
df.to_json(output_path)