# Extract and save as JSON

In [1]:
from bs4 import BeautifulSoup
from datetime import datetime
import glob
import re
import os
import json
import locale
from pathlib import Path
# Switch the time locale to Swiss German. The date parsing further down uses
# the locale-dependent '%B' (full month name) directive, so this is presumably
# needed to read German month names.
# NOTE(review): the locale name "de_ch" may be platform-specific — confirm it
# resolves on the machine running this notebook.
locale.setlocale(locale.LC_TIME, "de_ch")

'de_ch'

In [2]:
# Folder with the downloaded article HTML files. It is string-concatenated
# with a glob pattern below, so the trailing slash is required.
path_in = '../data/article/'
# Folder that receives one JSON file per article. It is string-concatenated
# with the file name below, so the trailing slash is required.
path_out = '../data_cleaned/article/'

In [15]:
def one_or_warn(selector):
    """Return the ``.string`` of the first element matching ``selector``.

    Reads the module-level ``soup`` (the currently parsed document) and ``f``
    (its file path, used only in the warning message).

    Returns an empty string when nothing matches. When several elements
    match, a warning is printed and the first match is still used.
    """
    matches = soup.select(selector)
    if not matches:
        return ""
    if len(matches) > 1:
        print("More than one %s at %s" % (selector, f))
    return matches[0].string

# Captures the content of the last parenthesised group in a text
# (e.g. "... (abc)" -> "abc"); used to guess an author abbreviation from the
# closing paragraph of an article. Written as a raw string because "\(" is an
# invalid escape sequence in a normal string literal.
reg_abbr = re.compile(r'\((([^)]*))\)[^(]*$', re.IGNORECASE)

records = []
filelist = glob.glob("%s*.html" % path_in)

# Convert every downloaded article page into one JSON record on disk.
# NOTE: `soup` and `f` must stay module-level names, one_or_warn() reads them.
for f in filelist:
    # Get ID: the file name without its ".html" extension
    id = os.path.basename(f).replace('.html', '')

    record = {'id': id}

    # Parse inside a context manager so the file handle is closed right away.
    with open(f, encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file, "html.parser")

    # Find tag
    record['tag'] = one_or_warn('.esf-img-tag span')

    # Tag 2: guest commentaries are categorised wrongly on the site, so an
    # article whose lead mentions a commentary is re-tagged as 'Kommentar'.
    record['tag2'] = record['tag']
    lead = soup.select_one('.text p strong')
    if lead:
        lead_text = lead.text.lower()
        if any(s.lower() in lead_text for s in ['Gastbeitrag', 'Gastkommentar', 'Kommentar']):
            record['tag2'] = 'Kommentar'

    # Find title
    record['title'] = one_or_warn('h1')

    # Text
    record['text'] = soup.select_one('.text').text

    # Date: located through the calendar icon next to it. The text has the
    # form "<day>. <month name> <year> | <HH:MM>".
    icon = soup.select_one('img[src="https://www.kath.ch/wp-content/themes/cathkathcatt/images/icon_date.png"]')
    if icon is None:
        # Warn and leave 'date' out of the record instead of crashing.
        print("No date found at %s" % f)
    else:
        date_s = icon.parent.parent.select_one('.col-9').string.replace('\n', '').strip()
        date_parts = date_s.split('|')
        date = date_parts[0].strip()
        time = date_parts[1].strip()

        # '%B' is locale-dependent; the month name must match the active locale.
        d = datetime.strptime("%s %s" % (date, time), '%d. %B %Y %H:%M')
        record['date'] = d.strftime("%Y-%m-%d %H:%M")

    # Get author
    paragraphs = soup.select('.text>p')
    if len(paragraphs) > 1 and record['tag'] != "Zitat":
        candidate = paragraphs[1].text
        if len(candidate) > 0 and len(candidate.split(' ')) <= 6:
            # A short second paragraph looks like an author line
            record['author'] = candidate.strip()
        else:
            # Try to guess the abbreviation from the last real paragraph.
            # Paragraphs that are empty or start with a tag (video embeds
            # etc.) are skipped; the first paragraph is never considered.
            last_paragraph = None
            for paragraph in reversed(paragraphs[1:]):
                contents = paragraph.contents
                starts_with_text = len(contents) > 0 and contents[0].name is None
                if starts_with_text and paragraph.text.strip() != "":
                    last_paragraph = paragraph
                    break

            if last_paragraph is None:
                print(f)
                print("No Last paragraph!")
            else:
                reg = reg_abbr.search(last_paragraph.text)
                if reg:
                    record['author'] = reg.group(1).strip()

    # Fallback, only when no author was found above: check whether
    # Raphael Rauch is named in the lead.
    if 'author' not in record:
        if lead and 'raphael rauch' in lead.text.lower():
            record['author'] = 'Raphael Rauch'

    # Save to file
    with open('%s%s.json' % (path_out, record['id']), 'w', encoding='UTF-8') as fp:
        json.dump(record, fp, ensure_ascii=False)


print("finito")

../data/article\alles-so-still-hier.html
No Last paragraph!
More than one h1 at ../data/article\anerkennung-meines-einsatzes-fuer-menschenrechte.html
../data/article\auch-in-der-kirche.html
No Last paragraph!
../data/article\auf-dem-jakobsweg-redet-man-selten-uebers-wetter.html
No Last paragraph!
../data/article\autoritaetsstrukturen-in-frauengemeinschaften.html
No Last paragraph!
../data/article\besucherzahlen-dank-des-graffiti-deutlich-gestiegen.html
No Last paragraph!
../data/article\corona-litanei.html
No Last paragraph!
More than one h1 at ../data/article\damit-es-weihnachten-werden-kann.html
../data/article\dann-also-los.html
No Last paragraph!
../data/article\das-geschehen-im-innern-gegen-aussen-transparent-machen.html
No Last paragraph!
../data/article\das-halte-ich-fuer-eine-selbstverleugnung-der-eigenen-kultur.html
No Last paragraph!
../data/article\das-ist-eine-kapitulation.html
No Last paragraph!
../data/article\das-kloster-engelberg-war-immer-auch-geschaeftstaetig.html
No 

## Re-download articles whose saved HTML contains the paywall notice ("kostenpflichtig")

In [None]:
from requests_html import HTMLSession

# AJAX endpoint of the kath.ch theme. It is not referenced by the code shown
# in this section — NOTE(review): confirm whether it is still needed.
url = 'https://www.kath.ch/wp-content/themes/cathkathcatt/ajax.php?esf_showTwocolumns'
# Folder holding the downloaded article HTML files; download_page() writes
# "<article_folder>/<name>.html" into it.
article_folder = '../data/article'

In [None]:
def download_page(name):
    """Download one article page from kath.ch and overwrite its local copy.

    Fetches ``https://www.kath.ch/<name>/`` and writes the response text to
    ``<article_folder>/<name>.html`` (UTF-8), replacing any existing file.

    Args:
        name: Article slug. It is used both in the URL and as the file name.
    """
    article_file_name = "%s/%s.html" % (article_folder, name)

    session_article = HTMLSession()
    try:
        r2 = session_article.get("https://www.kath.ch/%s/" % name)
        html = r2.text
    finally:
        # Always release the session's connections, even if the request fails.
        session_article.close()

    with open(article_file_name, 'w', encoding='utf-8') as article_file:
        article_file.write(html)

In [None]:
# Re-download every article whose saved HTML contains the paywall notice and
# drop its previously extracted JSON so that it is regenerated on the next
# extraction run.
filelist = glob.glob("%s*.html" % path_in)
for f in filelist:
    # Read and close the file before download_page() rewrites the same path.
    with open(f, encoding='utf-8') as content:
        is_paywalled = 'Der Zugriff auf diesen Beitrag ist kostenpflichtig' in content.read()
    if not is_paywalled:
        continue

    id = os.path.basename(f).replace('.html', '')
    print("Kostenpflichtig: %s" % id)

    # Download
    download_page(id)

    # Delete the stale JSON, if one exists
    cleaned = Path(path_out) / (id + '.json')
    if os.path.isfile(cleaned):
        os.remove(cleaned)
        