In [9]:
import json
import requests

SEARCH_URL="https://repository.staging.openeduhub.net/edu-sharing/rest/search/v1/queries/-home-/mds_oeh/ngsearch/lrmi"

def pretty_print(data):
    """Print ``data`` as JSON indented by two spaces.

    Args:
        data: any value ``json.dumps`` can serialise.

    Returns:
        None. The rendered text is written to standard output.
    """
    rendered = json.dumps(data, indent=2)
    print(rendered)

def search_lrmi(skipCount=0, maxItems=1):
    """Search the repository for nodes whose replication source is ``serlo_spider``.

    Sends a POST request to ``SEARCH_URL`` and decodes every entry of the
    ``nodes`` list in the JSON response (each entry is itself a JSON string).

    Args:
        skipCount: value sent as the ``skipCount`` query parameter.
        maxItems: value sent as the ``maxItems`` query parameter.

    Returns:
        list: one decoded JSON value per entry in ``nodes``.
    """
    query_params = {
        "maxItems": maxItems,
        "skipCount": skipCount,
        "propertyFilter": '-all-',
    }
    request_headers = {
        "Content-Type": "application/json",
        "accept": "application/json",
    }
    # Single filter criterion: only nodes replicated from the Serlo spider.
    source_criterion = {
        "property": "ccm:replicationsource",
        "values": ["serlo_spider"],
    }

    reply = requests.post(
        SEARCH_URL,
        params=query_params,
        headers=request_headers,
        json={"criteria": [source_criterion]},
    )

    decoded_nodes = []
    for raw_node in reply.json()["nodes"]:
        decoded_nodes.append(json.loads(raw_node))
    return decoded_nodes

# Spot-check the endpoint: request one record (maxItems defaults to 1) with
# skipCount=1 and print the decoded result.
pretty_print(search_lrmi(skipCount=1))

[
  {
    "identifier": "a8f3b03d-ea3b-480c-823f-50bf1c16f870",
    "keywords": [
      "Community",
      "Sandkasten"
    ],
    "@type": [
      "CreativeWork",
      "MediaObject"
    ],
    "description": "erstens: schreibe etwaszweitens: mach ausversehen einen nicht gesch\u00fctzten zeilenumbruchzweitens: mach ausversehen einen nicht gesch\u00fctzten zeilenumbruchdrittens:l\u00f6sche den zeilenumbruchviertens:dr\u00fccke enter\ufeffergebnis:eine zeile wird kopiert",
    "dateModified": "2020-07-22T11:00:34Z",
    "@context": "http://schema.org/",
    "ContentSize": "401",
    "version": "1.0",
    "url": "https://repository.staging.openeduhub.net/edu-sharing/components/render/a8f3b03d-ea3b-480c-823f-50bf1c16f870",
    "sourceUrl": "https://de.serlo.org/community/sandkasten/zeilenglitch?contentOnly",
    "license": "https://creativecommons.org/licenses/by-sa/4.0/deed.en",
    "dateCreated": "2020-07-15T15:53:49Z",
    "name": "Zeilenglitch",
    "thumbnailUrl": "https://repository

In [12]:
import time

def load_serlo_lrmis():
    """Lazily yield every record returned by ``search_lrmi``, page by page.

    Pages of up to 300 items are requested; the offset advances by the number
    of records actually received. Iteration ends once a page comes back empty.

    Yields:
        Each record of each page, in the order ``search_lrmi`` returned them.
    """
    offset = 0

    while True:
        page = search_lrmi(skipCount=offset, maxItems=300)

        # An empty page means everything has been consumed.
        if len(page) < 1:
            return

        yield from page
        offset += len(page)

# Download the complete record set once and measure how long the crawl takes.
start_time = time.time()
lrmis = [*load_serlo_lrmis()]
end_time = time.time()

execution_time = end_time - start_time

print("Execution time:", execution_time, "seconds")
print(f"Number of fetched elements: {len(lrmis)}")

Execution time: 774.1822485923767 seconds
Number of fetched elements: 9781


In [22]:
def get_keywords(uuid):
    """Look up keywords for a Serlo entity by its id.

    Builds the URL ``https://serlo.org/<uuid>`` and delegates the lookup to
    ``get_keywords_by_url``.

    Args:
        uuid: identifier interpolated into the URL.

    Returns:
        Whatever ``get_keywords_by_url`` returns for that URL.
    """
    entity_url = f"https://serlo.org/{uuid}"
    return get_keywords_by_url(entity_url)

def get_keywords_by_url(url):
    """Return the keywords of the first repository node matching ``url``.

    Sends a POST request to ``SEARCH_URL`` filtering on the ``ccm:wwwurl``
    property and asks for at most one result.

    Args:
        url: value to match against ``ccm:wwwurl``.

    Returns:
        ``None`` when the search returns no nodes; otherwise the ``keywords``
        entry of the first node, or ``[]`` if that node has no such entry.
    """
    url_criterion = {
        "property": "ccm:wwwurl",
        "values": [url],
    }

    reply = requests.post(
        SEARCH_URL,
        params={
            "maxItems": 1,
            "skipCount": 0,
            "propertyFilter": '-all-',
        },
        headers={
            "Content-Type": "application/json",
            "accept": "application/json",
        },
        json={"criteria": [url_criterion]},
    )

    hits = reply.json()["nodes"]

    if len(hits) > 0:
        # Each node arrives as a JSON string and must be decoded first.
        first_hit = json.loads(hits[0])
        return first_hit.get("keywords", [])

    return None

# Try a few lookups by numeric id, then one by a full article URL.
for serlo_id in (1515, 1707, 120925, 102789):
    print(get_keywords(serlo_id))

print(get_keywords_by_url("https://de.serlo.org/nachhaltigkeit/102789/plastik"))

[]
[]
[]
None
['Plastik', 'Müll']


In [24]:
# Inspect a single downloaded record (index 42) in full.
pretty_print(lrmis[42])

{
  "identifier": "47a55896-817a-4383-b0d7-63e8d638ab04",
  "keywords": [
    "Verarbeitung",
    "Permakultur und Urbane G\u00e4rten",
    "Ernte und Verarbeitung",
    "Nachhaltigkeit"
  ],
  "@type": [
    "CreativeWork",
    "MediaObject"
  ],
  "description": "Kastanien Rosskastanien werden fein zerkleinert oder zerquetscht und in einem Stoffbeutel mit der W\u00e4sche gewaschen. Seifenkraut Seifenkraut findet sich an B\u00f6schungen von Fl\u00fcssen und Teichen. Seine Wurzeln enthalten Saponine, aus denen sich eine Waschlauge herstellen l\u00e4sst. Daf\u00fcr werden 15 ...",
  "dateModified": "2020-07-22T14:01:27Z",
  "@context": "http://schema.org/",
  "ContentSize": "701",
  "version": "1.0",
  "url": "https://repository.staging.openeduhub.net/edu-sharing/components/render/47a55896-817a-4383-b0d7-63e8d638ab04",
  "sourceUrl": "https://de.serlo.org/permakultur/verwertung-veredelung-ernteertraegen/verarbeitung/weitere-produkte/waschmittel?contentOnly",
  "license": "https://creati

In [33]:
import re

def get_uuid(url):
    """Fetch a page and return the ``id`` from its first JSON-LD script block.

    A trailing ``?contentOnly`` is removed from ``url`` before the request.
    The first ``<script type="application/ld+json">`` element of the response
    body is parsed as JSON and its ``id`` value is returned.

    Args:
        url: address of the page to fetch.

    Returns:
        The ``id`` string, or ``None`` when the request hits a read timeout,
        the response status is not OK, no JSON-LD block is present, the block
        is not valid JSON, or it carries no string ``id``.
    """
    # str.rstrip() treats its argument as a set of characters, so it would
    # also eat trailing letters of the path itself (".../waschmittel" ->
    # ".../waschmi"). Remove the exact suffix only.
    suffix = "?contentOnly"
    if url.endswith(suffix):
        url = url[:-len(suffix)]

    try:
        response = requests.get(url, timeout=60)
    except requests.exceptions.ReadTimeout:
        return None

    if not response.ok:
        return None

    # DOTALL lets the JSON payload span several lines; only the first block is used.
    pattern = r'<script type="application/ld\+json">(.*?)</script>'
    first_block = re.search(pattern, response.text, re.DOTALL)

    if first_block is None:
        return None

    try:
        data = json.loads(first_block.group(1).strip())
    except (TypeError, json.JSONDecodeError):
        return None

    # The payload may be any JSON value; only an object with a string "id" counts.
    if isinstance(data, dict) and isinstance(data.get("id"), str):
        return data["id"]

    return None
    
# Two manual checks. Only the last expression of a cell is displayed, so the
# result of the first call is not shown.
get_uuid("https://de.serlo.org/community/sandkasten/zeilenglitch?contentOnly")
get_uuid("https://de.serlo.org/1565")

'https://serlo.org/1565'

In [35]:
# Number of downloaded records that carry a "keywords" entry.
sum(1 for lrmi in lrmis if "keywords" in lrmi)

101