# Helpers

In [13]:
import urllib.parse

def get_subdomain(url):
    """Return the subdomain part of the host name in ``url``.

    Everything in front of the last two dot-separated labels of the host is
    treated as the subdomain: ``https://de.serlo.org/...`` gives ``"de"`` and
    ``https://serlo.org/`` gives ``""``.

    The host is read from ``urlparse(url).hostname`` instead of ``netloc`` so
    that credentials (``user:pw@``) and a port (``:8080``) can never end up
    in the result. Note that ``hostname`` is lower-cased by ``urllib``.

    Returns an empty string when ``url`` has no host (e.g. a bare path).
    """
    # ``hostname`` is None when the URL has no network location.
    hostname = urllib.parse.urlparse(url).hostname or ""

    return ".".join(hostname.split(".")[:-2])

def get_pathname(url):
    """Return the path component of ``url`` as split off by ``urlparse``."""
    parsed_url = urllib.parse.urlparse(url)
    return parsed_url.path

# Sanity checks for the two URL helpers, using the article URL that the rest
# of this notebook works with.
assert get_subdomain("https://de.serlo.org/mathe/1515/satz-des-pythagoras") == "de"
assert get_pathname("https://de.serlo.org/mathe/1515/satz-des-pythagoras") == "/mathe/1515/satz-des-pythagoras"

In [24]:
from IPython.display import display, HTML

def display_html(html):
    """Show the markup string ``html`` as rich HTML output of the current cell."""
    rendered = HTML(html)
    display(rendered)

In [20]:
import requests

def api_call(query, variables=None, timeout=30):
    """Send a GraphQL request to the Serlo API and return its ``data`` payload.

    Args:
        query: GraphQL query document as a string.
        variables: Optional dict of GraphQL variables. When omitted, an
            empty dict is sent.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.post``. Without a timeout ``requests`` would wait
            indefinitely on an unresponsive server.

    Returns:
        The value stored under ``"data"`` in the JSON response body.

    Raises:
        requests.HTTPError: For an HTTP error status other than 400, or when
            the response body contains a non-empty ``"errors"`` list (the
            message of the first error is used).
        requests.Timeout: When the server does not answer within ``timeout``.
    """
    # Build a fresh dict per call instead of sharing one mutable default
    # object between all calls.
    if variables is None:
        variables = {}

    response = requests.post(
        "https://api.serlo.org/graphql",
        headers={"Content-Type": "application/json"},
        json={"query": query, "variables": variables},
        timeout=timeout,
    )

    # A 400 is deliberately not raised here: its body is parsed below so that
    # the error message it carries (if any) can be reported instead.
    if response.status_code not in (200, 400):
        response.raise_for_status()

    result = response.json()

    if "errors" in result and result["errors"]:
        raise requests.HTTPError(result["errors"][0]["message"], response=response)

    return result["data"]

def get_content_of_article(article_path):
    """Fetch the content of an article's current revision from the Serlo API.

    ``article_path`` is the article's URL: its subdomain is sent as the alias
    ``instance`` and its path as the alias ``path``. The return value is
    whatever the API delivers under ``uuid.currentRevision.content``.

    Nothing checks that the alias actually resolved to an article, so a
    missing key or a null value in the response surfaces as a ``KeyError`` or
    ``TypeError`` from the lookups at the end.
    """
    instance = get_subdomain(article_path)
    path = get_pathname(article_path)
    alias = {"instance": instance, "path": path}

    query = """
        query($alias: AliasInput!) {
            uuid(alias: $alias) {
               ...on Article {
                   currentRevision {
                       content
                   }
               }
            }
        }
    """

    data = api_call(query, variables={"alias": alias})
    current_revision = data["uuid"]["currentRevision"]
    return current_revision["content"]

# Example call: as the last expression of the cell, its return value (the raw
# content string of the article) is shown as the cell output.
get_content_of_article("https://de.serlo.org/mathe/1515/satz-des-pythagoras")

'{"plugin":"article","state":{"introduction":{"plugin":"articleIntroduction","state":{"explanation":{"plugin":"text","state":[{"type":"p","children":[{"text":"Der Satz des Pythagoras stellt eine Beziehung zwischen den Seitenlängen eines "},{"type":"a","href":"/36070","children":[{"text":"rechtwinkligen Dreiecks"}]},{"text":" her:"}]},{"type":"p","children":[{"text":"Die Summe der quadrierten  "},{"type":"a","href":"/1543","children":[{"text":"Katheten"}]},{"text":" ("},{"text":"a","strong":true},{"text":" und "},{"text":"b","strong":true},{"text":") ist gleich dem Quadrat der "},{"type":"a","href":"/1541","children":[{"text":"Hypotenuse"}]},{"text":" ("},{"text":"c","strong":true},{"text":")."}]}]},"multimedia":{"plugin":"image","state":{"src":"https://assets.serlo.org/legacy/577f75f2ab03d_4b70ec4721a13bba428d3a8808ff255dab9a1375.png","alt":"Pythagoras - Dreieck u Formel","caption":{"plugin":"text","state":[{"type":"p","children":[{}]}]}}},"illustrating":true,"width":50}},"content":{"p

# Get fulltext from JSON content

In [25]:
import json

def get_fulltext(content):
    """Return all text found in a JSON-encoded content string.

    ``content`` is parsed with ``json.loads`` and every string collected by
    ``get_text_contents`` is joined with a single space.

    Returns:
        The joined text, or ``""`` when ``content`` is not valid JSON or is
        not a string at all (e.g. ``None``).
    """
    try:
        parsed_content = json.loads(content)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-string
        # input. All of our content is JSON, so anything else has no text.
        return ""

    return " ".join(get_text_contents(parsed_content))

def get_text_contents(content):
    """Yield every string stored under a ``"text"`` key anywhere in ``content``.

    ``content`` is a parsed JSON value. Dicts and lists are walked
    recursively in depth-first order; a dict's own ``"text"`` string is
    yielded before anything found in its values. Values of any other type
    produce nothing.
    """
    if isinstance(content, dict):
        own_text = content.get("text")
        if isinstance(own_text, str):
            yield own_text
        children = content.values()
    elif isinstance(content, list):
        children = content
    else:
        # Scalars (str, numbers, bool, None) carry no nested text.
        return

    for child in children:
        yield from get_text_contents(child)

# Escape the extracted text before embedding it in markup: characters such as
# "<" or "&" in the article would otherwise be interpreted as HTML.
from html import escape

article_content = get_content_of_article("https://de.serlo.org/mathe/1515/satz-des-pythagoras")
fulltext = get_fulltext(article_content)

display_html("<p><code>" + escape(fulltext) + "</code></p>")