In [None]:
import glob
import logging
import os
import subprocess

import pypandoc

In [None]:
# File extensions used to derive output file names from input file names.
MD_EXTENSION = ".md"
HTML_EXTENSION = ".html"
PDF_EXTENSION = ".pdf"

# Root that is searched recursively for markdown files. It is used as a plain
# string prefix of a glob pattern, so it must keep its trailing slash.
GLOB_MD_DIR = "../"

# Directory holding the pandoc lua filters. The LUA_FILTERS_DIR environment
# variable overrides the default so the notebook is not tied to one machine.
# Joining with "" guarantees a trailing separator for the concatenations below.
LUA_FILTERS_DIR = os.path.join(
    os.environ.get("LUA_FILTERS_DIR", "/Users/rishi/lua-filters/"), ""
)
SPELLCHECK_FILTER = LUA_FILTERS_DIR + "spellcheck/spellcheck.lua"
WORDCOUNT_FILTER = LUA_FILTERS_DIR + "wordcount/wordcount.lua"

# Suppress pandoc logs
# logging.getLogger('pypandoc').addHandler(logging.NullHandler())

In [None]:
def convert_md_to_html(
    md_dir=GLOB_MD_DIR,
    add_toc=False,
    sidenote_filter=True,
    mermaid_filter=False,
    spellcheck_filter=False,
):
    """
    Converts markdown files to html in specified directory

    Every ``*.md`` file found recursively under ``md_dir`` (``README.md`` files
    are skipped) is passed to pandoc and written as an ``.html`` file with the
    same path and base name as its source.

    :param md_dir: the directory with *.md files; it is used as a string prefix
        of the glob pattern, so it should end with a path separator
    :param add_toc: whether to add a table of contents
    :param sidenote_filter: whether to run the ``pandoc-sidenote`` filter
    :param mermaid_filter: whether to run ``mermaid-filter``
    :param spellcheck_filter: whether to also pass the spellcheck lua filter
        (``SPELLCHECK_FILTER``) to pandoc
    :return: None
    """
    # "[!README]*" is a negated character class (it skips any name starting
    # with R, E, A, D or M), so match every markdown file and drop README by name.
    readme_name = "README" + MD_EXTENSION
    md_files = (
        path
        for path in glob.iglob(md_dir + "**/*" + MD_EXTENSION, recursive=True)
        if os.path.basename(path) != readme_name
    )

    pandoc_filters = []
    if sidenote_filter:
        pandoc_filters.append("pandoc-sidenote")
    if mermaid_filter:
        pandoc_filters.append("mermaid-filter")

    # NOTE(review): these lua filters look like reporting tools; confirm they do
    # not stop pandoc before the HTML output is written.
    lua_filters = [WORDCOUNT_FILTER]
    if spellcheck_filter:
        lua_filters.append(SPELLCHECK_FILTER)

    # See https://pandoc.org/MANUAL.html#options
    pandoc_args = [
        "--katex",
        "--section-divs",
        "--css=../../src/styles/tufte.css",
        "--css=../../src/styles/pandoc.css",
        "--css=../../src/styles/pandoc-solarized.css",
        "--css=../../src/styles/tufte-extra.css",
        "--template=templates/tufte.html5",
        "--extract-media=media",
    ]
    pandoc_args.extend("--lua-filter=" + lua_filter for lua_filter in lua_filters)
    if add_toc:
        # extend, not append: the options must stay a flat list of strings.
        pandoc_args.extend(["--toc", "--variable", "toc-title:Table of Contents"])

    # See: https://pandoc.org/MANUAL.html#extensions
    pandoc_format = "+".join(
        [
            "markdown",
            "smart",
            "pandoc_title_block",
            "fenced_divs",
            "line_blocks",
            "fenced_code_blocks",
            "backtick_code_blocks",
            "fenced_code_attributes",
            "inline_code_attributes",
            "link_attributes",
            "startnum",
            "fancy_lists",
            "task_lists",
            "definition_lists",
            "example_lists",
            "table_captions",
            "simple_tables",
            "multiline_tables",
            "grid_tables",
            "pipe_tables",
            "emoji",
            "intraword_underscores",
            "strikeout",
            "superscript",
            "subscript",
            "tex_math_dollars",
            "implicit_figures",
            "footnotes",
            "inline_notes",
        ]
    )

    for md_file in md_files:
        # Swap the .md suffix for .html so the output sits beside its source.
        html_file = md_file[: -len(MD_EXTENSION)] + HTML_EXTENSION
        pypandoc.convert_file(
            md_file,
            extra_args=pandoc_args,
            format=pandoc_format,
            filters=pandoc_filters,
            to="html5+smart",
            outputfile=html_file,
        )

In [None]:
def spellcheck(md_dir=GLOB_MD_DIR):
    """
    Spell checks all *.md files in given directory

    Runs pandoc with the spellcheck lua filter on every ``*.md`` file found
    recursively under ``md_dir`` (``README.md`` files are skipped) and prints
    whatever pandoc writes to stdout/stderr.

    :param md_dir: directory to search; used as a string prefix of the glob
        pattern, so it should end with a path separator
    :raises FileNotFoundError: if the ``pandoc`` executable cannot be found
    """
    # "[!README]*" is a negated character class (it skips any name starting
    # with R, E, A, D or M), so match every markdown file and drop README by name.
    readme_name = "README" + MD_EXTENSION
    for file in glob.iglob(md_dir + "**/*" + MD_EXTENSION, recursive=True):
        if os.path.basename(file) == readme_name:
            continue
        print(f"Spellchecking {file}...")
        # An argument list (no shell) keeps paths containing spaces intact.
        # The exit status is deliberately not checked; any error text is shown
        # because stderr is merged into the captured output.
        result = subprocess.run(
            ["pandoc", "--lua-filter", SPELLCHECK_FILTER, file],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            encoding="utf-8",
            errors="replace",
        )
        print(result.stdout, end="")
        print("\n")

In [None]:
def wordcount(md_dir=GLOB_MD_DIR):
    """
    Prints word count for all *.md files in given directory

    Runs pandoc with the wordcount lua filter on every ``*.md`` file found
    recursively under ``md_dir`` (``README.md`` files are skipped) and prints
    whatever pandoc writes to stdout/stderr.

    :param md_dir: directory to search; used as a string prefix of the glob
        pattern, so it should end with a path separator
    :raises FileNotFoundError: if the ``pandoc`` executable cannot be found
    """
    # "[!README]*" is a negated character class (it skips any name starting
    # with R, E, A, D or M), so match every markdown file and drop README by name.
    readme_name = "README" + MD_EXTENSION
    for file in glob.iglob(md_dir + "**/*" + MD_EXTENSION, recursive=True):
        if os.path.basename(file) == readme_name:
            continue
        print(f"Word count for {file}...")
        # An argument list (no shell) keeps paths containing spaces intact.
        # The exit status is deliberately not checked; any error text is shown
        # because stderr is merged into the captured output.
        result = subprocess.run(
            ["pandoc", "--lua-filter", WORDCOUNT_FILTER, file],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            encoding="utf-8",
            errors="replace",
        )
        print(result.stdout, end="")
        print("\n")

In [None]:
# Render only the citi-bike article directory to HTML.
convert_md_to_html(md_dir="../articles/citi-bike/")

In [None]:
# Spell check every markdown file under the default search root.
spellcheck(md_dir=GLOB_MD_DIR)

In [None]:
# Print word counts for every markdown file under the default search root.
wordcount(md_dir=GLOB_MD_DIR)

In [None]:
!pandoc --lua-filter /Users/rishi/lua-filters/wordcount/wordcount.lua /Users/rishi/blogs/articles/citi-bike/citibike-trips.md

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError, so this cell
# always fails when executed. Presumably that is a deliberate stop so "Run All"
# does not continue into the cells below -- confirm before removing.
break

In [None]:
# Lazily yield every markdown file under GLOB_MD_DIR except README.md.
# "[!README]*" is a negated character class (it skips any name starting with
# R, E, A, D or M), so README is filtered by name instead.
md_files = (
    path
    for path in glob.iglob(GLOB_MD_DIR + "**/*" + MD_EXTENSION, recursive=True)
    if os.path.basename(path) != "README" + MD_EXTENSION
)

In [None]:
# Debug aid (disabled): list the markdown files that would be converted.
# Enabling it consumes the md_files iterator, so md_files must be re-created
# before the conversion loop runs.
if False:
    for md_path in md_files:
        print(md_path)

In [None]:
# Pandoc filters applied by the conversion loop below.
# "mermaid-filter" is available as well but currently left out.
pandoc_filters = ["pandoc-sidenote"]

# Command-line options, see https://pandoc.org/MANUAL.html#options
# Unused alternatives: "--css=../../src/styles/pandoc.css" and
# "--template=templates/pandoc-template.html".
pandoc_args = [
    "--katex",
    "--section-divs",
    "--toc",
    "--css=../../src/styles/tufte.css",
    "--css=../../src/styles/pandoc2.css",
    "--css=../../src/styles/pandoc-solarized.css",
    "--css=../../src/styles/tufte-extra.css",
    "--template=templates/tufte.html5",
    "--extract-media=media",
    "--variable",
    "toc-title:Table of Contents",
]

# Input format: markdown plus the extensions listed here,
# see https://pandoc.org/MANUAL.html#extensions
pandoc_format = "+".join(
    [
        "markdown",
        "smart",
        "pandoc_title_block",
        "fenced_divs",
        "line_blocks",
        "fenced_code_blocks",
        "backtick_code_blocks",
        "fenced_code_attributes",
        "inline_code_attributes",
        "link_attributes",
        "startnum",
        "fancy_lists",
        "task_lists",
        "definition_lists",
        "example_lists",
        "table_captions",
        "simple_tables",
        "multiline_tables",
        "grid_tables",
        "pipe_tables",
        "emoji",
        "intraword_underscores",
        "strikeout",
        "superscript",
        "subscript",
        "tex_math_dollars",
        "implicit_figures",
        "footnotes",
        "inline_notes",
    ]
)

In [None]:
# Convert each markdown file to an .html file with the same path and base name,
# using the notebook-level pandoc_args, pandoc_format and pandoc_filters.
for md_file in md_files:
    # Strip the ".md" suffix, then add ".html".
    html_file = md_file[: -len(MD_EXTENSION)] + HTML_EXTENSION
    pypandoc.convert_file(
        md_file,
        to="html5+smart",
        format=pandoc_format,
        filters=pandoc_filters,
        extra_args=pandoc_args,
        outputfile=html_file,
    )

In [None]:
# delete generated html files. for debugging
# "[!pandoc-template]*" is a negated character class (and "c-t" inside it is a
# range), so it skipped most files; the template is now excluded by name.
if False:
    for html_file in glob.iglob("../**/*" + HTML_EXTENSION, recursive=True):
        if os.path.basename(html_file) == "pandoc-template" + HTML_EXTENSION:
            continue
        os.remove(html_file)

In [None]:
# Lazily collect every .html file under the parent directory (recursively).
html_files = glob.iglob(f"../**/*{HTML_EXTENSION}", recursive=True)

In [None]:
# Debug aid (disabled): list the html files that would be converted to PDF.
# Enabling it consumes the html_files iterator, so html_files must be
# re-created before the PDF loop runs.
if False:
    for html_path in html_files:
        print(html_path)

In [None]:
# Convert each collected .html file to a .pdf with the same path and base name,
# going through LaTeX with the xelatex engine.
for html_file in html_files:
    # Strip the ".html" suffix, then add ".pdf".
    pdf_file = html_file[: -len(HTML_EXTENSION)] + PDF_EXTENSION
    pypandoc.convert_file(
        html_file,
        to="latex",
        outputfile=pdf_file,
        extra_args=[
            "--pdf-engine=xelatex",
            "--variable",
            "lang=en",
            "--variable",
            "toc-title:Table of Contents",
        ],
    )