# Wiki - Markdown Parser

Parses the markdown files located in `assets/data/wiki/original` and outputs the parsed files to `assets/data/wiki/converted` as JSON files, so that they can be used in the frontend.


#### Just some printing utilities

In [14]:
def print_warn(obj):
    """Print ``obj`` in the warning color.

    The text is wrapped in the ANSI escape sequence ``\\033[93m`` (bright
    yellow) and followed by ``\\033[0m`` to reset the terminal color.
    """
    highlight = "\033[93m"
    reset = "\033[0m"
    print(f"{highlight}{obj}{reset}")

def print_success(obj):
    """Print ``obj`` in the success color.

    The text is wrapped in the ANSI escape sequence ``\\033[92m`` (bright
    green) and followed by ``\\033[0m`` to reset the terminal color.
    """
    highlight = "\033[92m"
    reset = "\033[0m"
    print(f"{highlight}{obj}{reset}")

## Load the markdown files

Also parses the title from the markdown file

This expects a markdown H1 title at the top of the file

```md
# Title
```

In [15]:
import json
import os

markdownFilesPath = "./original"

# Only files whose language folder (second slug component) is listed here are loaded.
validLanguages = ["en", "de"]

# One dict per loaded markdown file with the keys
# "slug", "content", "title" and "language".
markdownFiles = []


def markdown_slug(path, base_dir):
    """Return the slug for the markdown file at ``path``.

    The slug is the path relative to ``base_dir`` without the file extension,
    always joined with "/" regardless of the platform's path separator.
    """
    relative = os.path.splitext(os.path.relpath(path, base_dir))[0]
    return relative.replace(os.sep, "/")


def extract_title(markdown):
    """Return the text of the first markdown H1 line (``# Title``).

    Surrounding whitespace is stripped. Returns ``None`` when there is no
    line starting with "# " or when that line has no text.
    """
    for line in markdown.splitlines():
        if line.startswith("# "):
            return line[2:].strip() or None
    return None


# Walk the source folder recursively and load every ".md" file.
for root, _, files in os.walk(markdownFilesPath):
    for file in files:
        if not file.endswith(".md"):
            continue

        filePath = os.path.join(root, file)
        fileKey = markdown_slug(filePath, markdownFilesPath)
        print(f"Found markdown file: {fileKey}")

        # The language is the second slug component, e.g. "<collection>/<language>/<article>".
        slugParts = fileKey.split("/")
        if len(slugParts) < 2:
            print_warn(f"\t⚠️ Warning: No language folder in path. Skipping file {fileKey}")
            continue

        language = slugParts[1]

        if language not in validLanguages:
            print_warn(f"\t⚠️ Warning: Language '{language}' is not valid. Skipping file {fileKey}")
            continue

        print_success(f"\t✅ Language: {language}")

        with open(filePath, "r", encoding="utf-8") as f:
            markdownContent = f.read()

        title = extract_title(markdownContent)

        if title:
            print_success(f"\t✅ Title: {title}")
        else:
            print_warn(f"\t⚠️ Warning: Title is missing in the markdown file {fileKey}")

        markdownFiles.append({
            "slug": fileKey,
            "content": markdownContent,
            "title": title if title else "Untitled",
            "language": language,
        })
                


Found markdown file: App-Wiki-Articles/de/Fokus-halten
[92m	✅ Language: de[0m
[92m	✅ Title: Fokus halten [0m
Found markdown file: App-Wiki-Articles/de/Lernmethoden
[92m	✅ Language: de[0m
[92m	✅ Title: Lernmethoden[0m
Found markdown file: App-Wiki-Articles/de/Lernraum-und-Lernumgebung
[92m	✅ Language: de[0m
[92m	✅ Title: Lernraum und Lernumgebung [0m
Found markdown file: App-Wiki-Articles/de/Lerntyp-Test-Hinweise
[92m	✅ Language: de[0m
[92m	✅ Title: Hinweise zu deinem Ergebnis des  Lerntyp Tests[0m
Found markdown file: App-Wiki-Articles/de/Lerntypen
[92m	✅ Language: de[0m
[92m	✅ Title: Lerntypen[0m
Found markdown file: App-Wiki-Articles/de/Lernübersichten
[92m	✅ Language: de[0m
Found markdown file: App-Wiki-Articles/de/Prüfungsangst
[92m	✅ Language: de[0m
[92m	✅ Title: Wie geht man mit Prüfungsangst um?[0m
Found markdown file: App-Wiki-Articles/de/Lernmethoden/Aktives-Lesen
[92m	✅ Language: de[0m
[92m	✅ Title: Aktives Lesen[0m
Found markdown file: App-Wiki-

## Load the slug-to-UUID mapping

The mapping is stored in uuids.json and ensures that the UUIDs are consistent

Also, if an article has no associated UUID, it will be discarded. This way, no orphaned articles will be present in the final data.

In [16]:
# Placeholder until the slug -> UUID mapping has been read from uuids.json.
uuids = []

with open("uuids.json", encoding="utf-8") as uuidFile:
    uuids = json.load(uuidFile)

# Compare the number of distinct UUIDs with the number of entries:
# a mismatch means at least two slugs share the same UUID.
uniqueIds = set(uuids.values())
if len(uniqueIds) == len(uuids.values()):
    print_success("✅ UUIDs successfully loaded")
else:
    print_warn("⚠️ WARNING: UUIDS contain duplicates")

[92m✅ UUIDs successfully loaded[0m


## Map the UUIDs to the articles and replace Markdown Links with UUIDs


- Prints warnings for any links that are not found. 
- Also parses the keywords of each markdown file from an HTML comment at the top of the file

```md
<!-- ["keyword1", "keyword2"] -->
```
- Prints warnings if no keywords for an article are found

In [17]:
import re

def split_camel_case(text):
    """Insert a space at every lowercase-to-uppercase boundary of ``text``.

    Example: "KörperlicheEbene" -> "Körperliche Ebene".

    ``str.isupper`` / ``str.islower`` are used instead of an ASCII ``[A-Z]``
    pattern so that umlauts are treated as letters. Text without such a
    boundary (e.g. "Übersicht", "FAQ", "lernen") is returned unchanged.
    """
    pieces = []
    previous = ""
    for char in text:
        if char.isupper() and previous.islower():
            pieces.append(" ")
        pieces.append(char)
        previous = char
    return "".join(pieces)


# One dict per exported article (see the keys in the append call below).
articles = []

for markdownFile in markdownFiles:
    slug = markdownFile['slug']
    content = markdownFile['content']
    title = markdownFile['title']
    language = markdownFile['language']

    # ignore pages that don't have an associated uuid in uuids.json
    if slug not in uuids:
        print_warn(f"\n⚠️ WARNING: MISSING UUID FOR {title}\n{slug}")
        continue

    uuid = uuids[slug]
    keywords = []

    print_success(f"\n✅ Found Entry {title}")
    print(f"\t[{slug}]({uuid})")

    # look for <!-- ["keyword1","keyword2"] --> and parse the list
    keyword_comment_match = re.search(r'<!-- (\[.*?\]) -->', content)

    if keyword_comment_match:
        try:
            keywords = list(json.loads(keyword_comment_match.group(1)))
        except json.JSONDecodeError as error:
            # A malformed keyword comment should not abort the whole conversion;
            # warn and continue with an empty keyword list.
            print_warn(f"\t⚠️ WARNING: INVALID KEYWORDS FOR {title}\n\t\t{slug}\n\t\t{error}")
        else:
            print_success(f"\t✅ Found Keywords {keywords}")
    else:
        print_warn(f"\t⚠️ WARNING: MISSING KEYWORDS FOR {title}\n\t\t{slug}")

    # * replace links [Text](slug) with links [Text](uuid) to uuids
    for slug_, id_ in uuids.items():
        content = content.replace(f"({slug_})", f"({id_})")

    # * Find Markdown Links that have not yet been replaced with a Link to a UUID indicating an unresolved Link or typo in the path
    # Each match is a tuple (link text, unused lookahead group, link target).
    nonUUIDLink_matches = re.findall(r"\[(.*?)\]\((?!([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-4[0-9a-fA-F]{3}-[89aAbB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}))([^)]*)\)", content)

    for nonUUIDLink in nonUUIDLink_matches:
        print_warn(f"\t⚠️ WARNING: Found unresolved link {nonUUIDLink[0]} {nonUUIDLink[2]}")

    # add spaces between words "KörperlicheEbene" -> "Körperliche Ebene"
    if " " not in title:
        title = split_camel_case(title)

    # Not populated by this cell; exported as empty lists.
    related_wikis = []
    online_resources = []

    articles.append({
        "id": uuid,
        "title": title,
        "slug": slug,
        "keywords": keywords,
        "markdown_content": content,
        "related_wikis": related_wikis,
        "online_resources": online_resources,
        "language": language,
    })

# print the articles as json
#print(json.dumps(articles, indent=4, ensure_ascii=False))


print_success(f"Found {len(articles)} articles")

[92m
✅ Found Entry Fokus halten [0m
	[App-Wiki-Articles/de/Fokus-halten](cc779f27-e1d4-48ab-b75a-2b7daaa67e50)
[92m	✅ Found Keywords ['Fokus halten', 'Lernmethoden'][0m
[92m
✅ Found Entry Lernmethoden[0m
	[App-Wiki-Articles/de/Lernmethoden](3a7372e1-943b-4de3-a700-bcf37043fffa)
[92m	✅ Found Keywords ['Lernen', 'Lernmethoden'][0m
[92m
✅ Found Entry Lernraum und Lernumgebung [0m
	[App-Wiki-Articles/de/Lernraum-und-Lernumgebung](9f07c546-657d-4bad-bd9a-7e847c4496d9)
[92m	✅ Found Keywords ['Lernraum und Lernumgebung ', 'Lerntypen'][0m
[92m
✅ Found Entry Hinweise zu deinem Ergebnis des  Lerntyp Tests[0m
	[App-Wiki-Articles/de/Lerntyp-Test-Hinweise](599b335c-bdbd-4af9-91f4-a37d5fe7dcfe)
[92m	✅ Found Keywords ['Lerntypen'][0m
[92m
✅ Found Entry Lerntypen[0m
	[App-Wiki-Articles/de/Lerntypen](662d5bbf-48b1-4e55-92b7-fbb98f26d609)
[92m	✅ Found Keywords ['Lernen', 'Lerntypen'][0m
[93m
App-Wiki-Articles/de/Lernübersichten[0m
[92m
✅ Found Entry Wie geht man mit Prüfungsangst 

## Export the data to JSON

In [18]:
export_folder_path = "./converted/"

# create the export directory if it doesn't exist
if not os.path.exists(export_folder_path):
    os.makedirs(export_folder_path)
    print_success(f"Created folder: {export_folder_path}")

# Write every article to its own JSON file inside the export folder.
for article in articles:
    slug = article['slug']
    print(f"Exporting {slug}")

    # Build the file name from the slug only, so that the export folder path
    # itself is never altered: spaces -> "_", "/" -> ".", lowercase, and
    # umlauts transliterated.
    basename = slug.replace(' ', '_').replace('/', '.').lower()
    basename = basename.replace("ä", "ae").replace("ü", "ue").replace("ö", "oe")
    filename = os.path.join(export_folder_path, f"{basename}.json")

    # override or create output file
    with open(filename, "w", encoding="utf-8") as file:
        file.write(json.dumps(article))

    print_success(f"\t✅ Created or changed file {file.name}\n")

Exporting App-Wiki-Articles/de/Fokus-halten
[92m	✅ Created or changed file ./converted/app-wiki-articles.de.fokus-halten.json
[0m
Exporting App-Wiki-Articles/de/Lernmethoden
[92m	✅ Created or changed file ./converted/app-wiki-articles.de.lernmethoden.json
[0m
Exporting App-Wiki-Articles/de/Lernraum-und-Lernumgebung
[92m	✅ Created or changed file ./converted/app-wiki-articles.de.lernraum-und-lernumgebung.json
[0m
Exporting App-Wiki-Articles/de/Lerntyp-Test-Hinweise
[92m	✅ Created or changed file ./converted/app-wiki-articles.de.lerntyp-test-hinweise.json
[0m
Exporting App-Wiki-Articles/de/Lerntypen
[92m	✅ Created or changed file ./converted/app-wiki-articles.de.lerntypen.json
[0m
Exporting App-Wiki-Articles/de/Prüfungsangst
[92m	✅ Created or changed file ./converted/app-wiki-articles.de.pruefungsangst.json
[0m
Exporting App-Wiki-Articles/de/Lernmethoden/Aktives-Lesen
[92m	✅ Created or changed file ./converted/app-wiki-articles.de.lernmethoden.aktives-lesen.json
[0m
Exporti