In [None]:
import glob
import logging
import os
import subprocess

import pypandoc

In [None]:
# File extensions used to locate markdown sources and to name converted outputs.
MD_EXTENSION = ".md"
HTML_EXTENSION = ".html"
PDF_EXTENSION = ".pdf"

# Content roots, given relative to the notebook's working directory.
PROJECTS_DIR = "../projects/"
ARTICLES_DIR = "../articles/"

# Directory holding the Lua filter scripts. The historical default is kept, but
# it can be overridden per machine through the LUA_FILTERS_DIR environment
# variable so the notebook does not depend on one user's home directory.
# os.path.join(..., "") guarantees a trailing separator, which the string
# concatenations below (and in the conversion code) rely on.
LUA_FILTERS_DIR = os.path.join(
    os.environ.get("LUA_FILTERS_DIR") or "/Users/rishi/lua-filters/", ""
)
SPELLCHECK_FILTER = LUA_FILTERS_DIR + "spellcheck/spellcheck.lua"
WORDCOUNT_FILTER = LUA_FILTERS_DIR + "wordcount/wordcount.lua"

# Suppress pandoc logs
# logging.getLogger('pypandoc').addHandler(logging.NullHandler())

In [None]:
def spellcheck(md_dir=ARTICLES_DIR):
    """
    Prints spelling errors in all *.md files in given directory

    Every markdown file found recursively under ``md_dir`` is run through
    ``pandoc --lua-filter SPELLCHECK_FILTER`` and the captured output lines are
    printed. Files whose name starts with ``README`` are skipped.

    :param md_dir: directory to search
    """
    # NOTE: the previous pattern "[!README]*" is a glob character class, so it
    # skipped every file whose name begins with R, E, A, D or M. Match all
    # markdown files and filter README files by name instead.
    pattern = os.path.join(md_dir, "**", "*" + MD_EXTENSION)
    for file in glob.iglob(pattern, recursive=True):
        if os.path.basename(file).startswith("README"):
            continue
        print(f"Spellchecking {file}...")
        # Pass an argument list (no shell) so paths containing spaces or shell
        # metacharacters reach pandoc intact. stderr is merged into stdout so
        # diagnostics are printed together with the filter output.
        result = subprocess.run(
            ["pandoc", "--lua-filter", SPELLCHECK_FILTER, file],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,  # decode output as text
            check=False,
        )
        spelling_errors = result.stdout.splitlines()
        print(spelling_errors)
        print("\n")

In [None]:
def wordcount(md_dir=ARTICLES_DIR):
    """
    Prints word count for all *.md files in given directory

    Every markdown file found recursively under ``md_dir`` is run through
    ``pandoc --lua-filter WORDCOUNT_FILTER`` and the captured output lines are
    printed. Files whose name starts with ``README`` are skipped.

    :param md_dir: directory to search
    """
    # NOTE: the previous pattern "[!README]*" is a glob character class, so it
    # skipped every file whose name begins with R, E, A, D or M. Match all
    # markdown files and filter README files by name instead.
    pattern = os.path.join(md_dir, "**", "*" + MD_EXTENSION)
    for file in glob.iglob(pattern, recursive=True):
        if os.path.basename(file).startswith("README"):
            continue
        print(f"Word count for {file}...")
        # Pass an argument list (no shell) so paths containing spaces or shell
        # metacharacters reach pandoc intact. stderr is merged into stdout so
        # diagnostics are printed together with the filter output.
        result = subprocess.run(
            ["pandoc", "--lua-filter", WORDCOUNT_FILTER, file],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,  # decode output as text
            check=False,
        )
        wc = result.stdout.splitlines()
        print(wc)
        print("\n")

In [None]:
def convert_md_to_html(
    md_dir=ARTICLES_DIR,
    add_toc=False,
    sidenote_filter=True,
    mermaid_filter=True,
    plot_filter=True,
    crossreference_filter=False,
):
    """
    Converts markdown files to html in specified directory

    Every ``*.md`` file found recursively under ``md_dir`` (except files whose
    name starts with ``README``) is converted with pypandoc. The html file is
    written next to its source, with the same base name.

    :param md_dir: the directory with *.md files
    :param add_toc: whether to add a table of contents
    :param sidenote_filter: use pandoc-sidenote to convert footnotes to sidenotes
    :param mermaid_filter: use mermaid-filter to convert mermaid.js markdown to images
    :param plot_filter: use pandoc-plot to convert plot code to images. only tested with matplotlib. script fails for plotly_python
    :param crossreference_filter: use pandoc-crossref to allow referencing sections of the article elsewhere in the article
    :return: None
    """
    # NOTE: the previous pattern "[!README]*" is a glob character class, so it
    # skipped every file whose name begins with R, E, A, D or M. Match all
    # markdown files and filter README files by name instead.
    # The matches are collected up front because the loop below changes the
    # working directory while the (possibly relative) pattern is being walked.
    md_files = [
        path
        for path in glob.iglob(
            os.path.join(md_dir, "**", "*" + MD_EXTENSION), recursive=True
        )
        if not os.path.basename(path).startswith("README")
    ]

    pandoc_filters = []

    # Converts footnotes to sidenotes
    if sidenote_filter:
        pandoc_filters.append("pandoc-sidenote")
    if mermaid_filter:
        pandoc_filters.append("mermaid-filter")
    # Usage: https://laurentrdc.github.io/pandoc-plot/MANUAL.html
    if plot_filter:
        pandoc_filters.append("pandoc-plot")

    # Usage: https://lierdakil.github.io/pandoc-crossref/
    # consider using https://github.com/tomduck/pandoc-xnos instead of pandoc-crossref
    if crossreference_filter:
        pandoc_filters.append("pandoc-crossref")

    # wordcount and spellcheck are done outside this function
    # consider adding https://github.com/pandoc/lua-filters/tree/master/diagram-generator
    lua_filters = [
        "--lua-filter",
        LUA_FILTERS_DIR + "include-code-files/include-code-files.lua",
        "--lua-filter",
        LUA_FILTERS_DIR + "include-files/include-files.lua",
    ]

    # See https://pandoc.org/MANUAL.html#options
    pandoc_args = [
        "--katex",
        "--section-divs",
        "--css=../../src/styles/tufte.css",
        "--css=../../src/styles/pandoc.css",
        "--css=../../src/styles/pandoc-solarized.css",
        "--css=../../src/styles/tufte-extra.css",
        "--template=../../src/templates/tufte.html5",
        "--extract-media=media",
    ]
    if add_toc:
        # extend, not append: each option must be its own element of the
        # argument list, otherwise a nested list is handed to pandoc.
        pandoc_args.extend(["--toc", "--variable", "toc-title:Table of Contents"])

    pandoc_args += lua_filters

    # See: https://pandoc.org/MANUAL.html#extensions
    pandoc_format = (
        "markdown"
        + "+smart"
        + "+pandoc_title_block"
        + "+fenced_divs"
        + "+line_blocks"
        + "+fenced_code_blocks"
        + "+backtick_code_blocks"
        + "+fenced_code_attributes"
        + "+inline_code_attributes"
        + "+link_attributes"
        + "+startnum"
        + "+fancy_lists"
        + "+task_lists"
        + "+definition_lists"
        + "+example_lists"
        + "+table_captions"
        + "+simple_tables"
        + "+multiline_tables"
        + "+grid_tables"
        + "+pipe_tables"
        + "+emoji"
        + "+intraword_underscores"
        + "+strikeout"
        + "+superscript"
        + "+subscript"
        + "+tex_math_dollars"
        + "+implicit_figures"
        + "+footnotes"
        + "+inline_notes"
    )

    cwd = os.getcwd()
    for md_file in md_files:
        md_name = os.path.basename(md_file)
        html_file = md_name[: -len(MD_EXTENSION)] + HTML_EXTENSION

        try:
            # Convert from inside the file's own directory; presumably this is
            # what makes the relative template path and the "media" output
            # directory resolve per article. `or "."` covers a file with no
            # directory component, where dirname() returns "".
            os.chdir(os.path.dirname(md_file) or ".")

            pypandoc.convert_file(
                "./" + md_name,
                extra_args=pandoc_args,
                format=pandoc_format,
                filters=pandoc_filters,
                to="html5+smart",
                outputfile=html_file,
            )

            # remove useless mermaid-filter.err files
            if os.path.exists("mermaid-filter.err"):
                os.remove("mermaid-filter.err")
        finally:
            # Always return to the original directory, even when a conversion
            # fails, so later files and later cells are not run from the wrong
            # location.
            os.chdir(cwd)

In [None]:
# Build the html pages: articles first, then projects.
for content_dir in (ARTICLES_DIR, PROJECTS_DIR):
    convert_md_to_html(md_dir=content_dir)

In [None]:
# Spellcheck the markdown sources: articles first, then projects.
for content_dir in (ARTICLES_DIR, PROJECTS_DIR):
    spellcheck(content_dir)

In [None]:
# Report word counts for the markdown sources: articles first, then projects.
for content_dir in (ARTICLES_DIR, PROJECTS_DIR):
    wordcount(content_dir)

In [None]:
def convert_md_to_pdf(md_dir=ARTICLES_DIR):
    """
    Converts md files to PDF in a given directory.

    Every ``*.md`` file found recursively below ``md_dir`` is handed to
    pypandoc (xelatex engine, table of contents enabled). The PDF is written
    next to its source, with the markdown extension replaced by ``.pdf``.

    TODO: Does not handle emojis
    TODO: Does not handle pandoc-plot images

    :param md_dir: the directory where the markdown files reside
    :return: None
    """
    # Markdown reader extensions, see https://pandoc.org/MANUAL.html#extensions
    reader_extensions = (
        "smart",
        "pandoc_title_block",
        "fenced_divs",
        "line_blocks",
        "fenced_code_blocks",
        "backtick_code_blocks",
        "fenced_code_attributes",
        "inline_code_attributes",
        "link_attributes",
        "startnum",
        "fancy_lists",
        "task_lists",
        "definition_lists",
        "example_lists",
        "table_captions",
        "simple_tables",
        "multiline_tables",
        "grid_tables",
        "pipe_tables",
        "emoji",
        "intraword_underscores",
        "strikeout",
        "superscript",
        "subscript",
        "tex_math_dollars",
        "implicit_figures",
        "footnotes",
        "inline_notes",
    )
    # Yields "markdown+smart+pandoc_title_block+..." in the order listed above.
    reader_format = "+".join(("markdown",) + reader_extensions)

    # Command line options forwarded to pandoc unchanged.
    # A main font override was tried and left disabled:
    #   "--variable", "mainfont='DejaVu Sans'"
    writer_options = [
        "--katex",
        "--section-divs",
        "--extract-media=media-pdf",
        "--pdf-engine=xelatex",
        "--variable",
        "lang=en",
        "--toc",
        "--variable",
        "toc-title:Table of Contents",
    ]

    search_pattern = md_dir + "/**/*" + MD_EXTENSION
    for source_path in glob.iglob(search_pattern, recursive=True):
        # Swap the markdown extension for the pdf one, keeping the directory.
        stem, _ = os.path.splitext(source_path)
        pypandoc.convert_file(
            source_path,
            to="latex+smart",
            format=reader_format,
            extra_args=writer_options,
            outputfile=stem + PDF_EXTENSION,
        )

In [None]:
# Render PDFs for the two selected sub-directories only.
for pdf_source_dir in (ARTICLES_DIR + "test/", PROJECTS_DIR + "citibike/"):
    convert_md_to_pdf(pdf_source_dir)