In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

# Scheme + host that every article and forum URL in this notebook is built on (no trailing slash).
root_url = "https://www.heise.de"

def parse_rating(rating_img):
    """Extract the numeric rating encoded in a rating image's ``src`` attribute.

    The rating is the single digit following ``wertung_`` in the image path.

    Parameters
    ----------
    rating_img : mapping-like or None
        Object exposing ``.get('src')`` (e.g. a BeautifulSoup ``<img>`` tag),
        or None when the posting has no rating image.

    Returns
    -------
    int or None
        The rating digit, or None when there is no image, no ``src``
        attribute, or the ``src`` does not contain ``wertung_<digit>``.
    """
    if rating_img is None:
        return None
    # `or ''` covers a missing src attribute so re.search always gets a string.
    match = re.search(r'wertung_(\d)', rating_img.get('src') or '')
    # re.search returns None on no match; calling .group() on it would raise.
    return int(match.group(1)) if match else None
    
def parse_response_count(_response_count):
    """Extract the first integer found in an element's text.

    Parameters
    ----------
    _response_count : object with a ``.text`` string attribute, or None
        Typically the BeautifulSoup element holding a posting's reply counter.

    Returns
    -------
    int or None
        The first run of digits in the stripped text as an int, or None when
        the element is None or its text contains no digits.
    """
    if _response_count is None:
        return None
    match = re.search(r'\d+', _response_count.text.strip())
    # re.search returns None on no match; calling .group() on it would raise.
    return int(match.group(0)) if match else None
    
    
# Download the front page. The timeout keeps a stalled connection from hanging
# the notebook, and raise_for_status() surfaces HTTP errors immediately instead
# of letting an error page be parsed as if it were the front page.
root_response = requests.get(root_url, timeout=30)
root_response.raise_for_status()
root_soup = BeautifulSoup(root_response.text, 'html.parser')

# Collect all elements carrying the CSS class 'a-article-teaser'.
articles = root_soup.find_all(class_='a-article-teaser')
print(len(articles), "articles found")

# TODO: loop through articles

# Until that loop exists only the first teaser is used. Keep the IndexError
# type but give it a message that explains what was missing.
if not articles:
    raise IndexError("no 'a-article-teaser' elements found on " + root_url)
article = articles[0]
article_id = article['data-cid']

In [None]:
import sys
# Print the interpreter version, platform and executable path of the running kernel.
print(sys.version, sys.platform, sys.executable)
# NOTE(review): tqdm is imported here but is not referenced in the cells visible in this notebook.
from tqdm import tqdm

def load_posting_content(_posting_url):
    """Download one forum posting page and return its body text.

    Parameters
    ----------
    _posting_url : str
        Absolute URL of the posting page.

    Returns
    -------
    str or None
        The whitespace-stripped text of the first element with CSS class
        ``bbcode_v1``, or None when the page contains no such element, so
        that one unreadable posting does not abort a whole scrape.
    """
    # Without a timeout, requests may wait indefinitely on a stalled connection.
    posting_soup = BeautifulSoup(requests.get(_posting_url, timeout=30).text, 'html.parser')
    body = posting_soup.find(class_='bbcode_v1')
    # find() returns None when nothing matches; guard before touching .text.
    if body is None:
        return None
    return body.text.strip()


def _fetch_soup(_url, _timeout=30):
    """Download ``_url`` and return its HTML parsed into a BeautifulSoup tree."""
    # Without a timeout, requests may wait indefinitely on a stalled connection.
    return BeautifulSoup(requests.get(_url, timeout=_timeout).text, 'html.parser')


def _rating_image(_posting):
    """Return the ``<img>`` inside a posting's rating cell, or None if the cell is absent."""
    rating_cell = _posting.find(class_='tree_thread_list--rating')
    return rating_cell.img if rating_cell is not None else None


def load_postings(_article_id):
    """Scrape all forum postings of one article into a DataFrame.

    The article page is fetched to read its metadata and the link to its
    comment forum. The forum is then walked page by page in chronological
    order until a page yields no ``posting_element`` entries, and finally the
    text of every posting is downloaded via ``load_posting_content``.

    Parameters
    ----------
    _article_id : int or str
        Article id; the article page is requested at ``root_url + '/-' + id``.

    Returns
    -------
    pandas.DataFrame
        One row per posting with the columns ``article_id``, ``article_url``,
        ``article_title``, ``article_keywords`` (list of str),
        ``article_comments_count``, ``posting_url``, ``title``, ``rating``,
        ``response_count``, ``posting_id`` and ``content``.
    """
    article_url = root_url + '/-' + str(_article_id)
    article_soup = _fetch_soup(article_url)
    # The last 8 characters of the comment-button href are dropped and replaced
    # by 'chronological/' (presumably a trailing path segment that selects a
    # different view -- TODO confirm against the live markup).
    comments_url = root_url + article_soup.find(class_='comment-button')['href'][:-8] + 'chronological/'

    # Walk the comment pages until one comes back without postings.
    posts = []
    _page = 1
    while True:
        print(f"fetching page {_page}...")
        _page_posts = _fetch_soup(comments_url + 'page-' + str(_page)).find_all(class_='posting_element')
        if not _page_posts:
            print("No posts found. Exiting loop...")
            break
        print(f"found {len(_page_posts)} posts")
        # extend() appends in place instead of copying the whole list per page.
        posts.extend(_page_posts)
        _page += 1

    # Look up each posting's subject link once; it supplies both URL and title.
    subjects = [p.find(class_='posting_subject') for p in posts]

    # Scalar values are broadcast by pandas to every posting row.
    df = pd.DataFrame({
        'article_id': _article_id,
        'article_url': article_url,
        'article_title': article_soup.find("meta", attrs={'name': 'title'})['content'].strip(),
        'article_keywords': article_soup.find("meta", attrs={'name': 'keywords'})['content'],
        'article_comments_count': int(re.search(r'\((\d+)\)', article_soup.find(class_='comment-button__text').text).group(1)),
        'posting_url': [s['href'] for s in subjects],
        'title': [s.text.strip() for s in subjects],
        # A posting without a rating cell yields None, which parse_rating accepts.
        'rating': [_rating_image(p) for p in posts],
        'response_count': [p.find(class_='posting_count') for p in posts]
    })
    df['article_keywords'] = df['article_keywords'].str.split(', ')
    # Replace the scraped link by a URL built from the posting id alone.
    df['posting_id'] = df.posting_url.str.extract(r'posting-(\d+)')
    df['posting_url'] = r'https://www.heise.de/forum/p-' + df.posting_id

    # Announce the slow step before it starts: one HTTP request per posting.
    print("downloading posting contents...")
    df['content'] = df.posting_url.map(load_posting_content)

    # Turn the raw BeautifulSoup elements into plain numbers.
    df['rating'] = df['rating'].apply(parse_rating)
    df['response_count'] = df['response_count'].apply(parse_response_count)

    return df

    
# Scrape all forum postings of article 4952130 into a DataFrame.
df = load_postings(4952130)

# Quick look at the result: dimensions, column dtypes, then the frame itself.
print(df.shape, df.dtypes, sep='\n')
df


In [None]:
# Export the scraped postings to an Excel workbook named after the article id.
df.to_excel(f"heise_postings_{4952130}.xlsx")