In [1]:
import os
import re
import pandas as pd
from pathlib import Path

base_dir = Path(os.getcwd()).parent

def process_articles(file_path):

    all_articles = []

    with file_path.open ('r', encoding='utf-8', errors='ignore') as file:
        content = file.read()
        articles = content.split('##U') # articles are separated by ##U
        for article in articles[1:]:
            match = re.search(r'##A (\d+)>[\s\S]+?##M (\d+)>[\s\S]+?##D (\d+)>', article)
            paper_match = re.search(r'##B (\w+)>', article)
            if match and paper_match:
                year, month, day = match.groups()
                paper = paper_match.group(1)
                text_start = article.find('##D') + len('##D 00>') # skip the metadata
                text = article[text_start:].strip()
                text = re.sub(r'^Publisert:.*$', '', text, flags=re.MULTILINE).strip()
                all_articles.append({
                    'date' : f'{year}-{month}-{day}',
                    'paper': paper,
                    'text': text
                })
    return all_articles

all_articles = []

base_path = base_dir / 'data' / 'norsk_aviskorpus_v2' / '2'

for paper_folder in base_path.iterdir():
    for nested_folder in paper_folder.iterdir():
        for year_folder in nested_folder.iterdir():
            for day_file in year_folder.glob('*.html4'):
                all_articles.extend(process_articles(day_file))

df = pd.DataFrame(all_articles)

In [2]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text) 
    text = re.sub(r'\s([?.!"](?:\s|$))', r'\1', text)  
    return text.strip()

for column in df.columns:
    df[column] = df[column].apply(preprocess_text)

In [3]:
print(df.iloc[0]['text'])

3. plass til rally-petter petter solberg endte p 3. plass i rally new zealand. petter solberg humper avgrde sndag i rally new zealand. foto: ross land/ap nordmannen var sjansels siste dag og tapte ytterligere 10 sekunder til vinner sebastien loeb og toer marcus grnholm. likevel beholder solberg (26 poeng) ledelsen i vm sammenlagt med ett poeng foran loeb (25). franskmannen var i en klasse for seg og vant ogs rallyets siste fartsprve. til slutt skilte 49,8 sekunder til toer grnholm, som kjrte sterkt og banket petter solberg med 18,9 sekunder sammenlagt. firer ble belgieren franois duval, nesten minuttet bak solberg. neste vm-runde finner sted p sardinia i italia om tre uker. ( ntb)


In [4]:
df = df[df['text'].str.len() > 0] # remove empty articles
df = df[df['date'].str[:2] != '11'] # remove articles from 2011, because it is missing some months

In [5]:
df['id'] = range(len(df)) # add an id column

In [6]:
df.to_csv(base_dir / 'data' / 'all_articles_from_norsk_aviskorpus.csv', index=False)