# Purpose

The purpose of this application is to publish YAML-formatted documents in the output formats listed below.

Output formats:

* Pandoc
* HTML
* <s>PDF</s>
* CommonMark

# Discussion


References

* [PyYAML](https://pyyaml.org/)
* [pandoc](https://pandoc.org/)
  * [TeX Live](https://www.tug.org/texlive/) which is needed to [create a PDF](https://pandoc.org/MANUAL.html#creating-a-pdf)

# Install

Anything that needs to be installed to use this should be listed here.

``` shell
#PyYAML
sudo -H pip3 install pyyaml

#Pandoc
sudo apt install pandoc
sudo apt install texlive-latex-recommended
```

# Application

## Configuration

In [128]:
#Base directories; every other path is derived from these two.
srcDir = "./src"
dstDir = "./docs"

#Per-format output settings, keyed by format name.  The names are based on the
#[Pandoc Supported Formats](https://github.com/tajmone/markdown-guide/tree/master/pandoc#pandoc-supported-formats)
#This is a dictionary of dictionaries.  To access data: `outputConfig['yaml']['ext']`
#The `dstPath` entry likely could go away as it was based upon a different idea
#for how the files should be stored; every format currently shares `dstDir`.
outputConfig = {
    formatName: {'ext': extension, 'dstPath': dstDir}
    for formatName, extension in (
        ('yaml', '.yaml'),
        ('markdown', '.pandoc'),     #Pandoc MarkDown variant
        ('markdown_strict', '.md'),  #CommonMark spec
        ('html5', '.html'),
        ('pdf', '.pdf'),
    )
}
#outputConfig

## function definitions

### find()

In [129]:
#find files that match the pattern

#these functions came from:
#https://stackoverflow.com/questions/1724693/find-a-file-in-python

"""
#this will find the first match
import os
def find(name, path):
    for root, dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)

#this will find all matches
import os
def find_all(name, path):
    result = []
    for root, dirs, files in os.walk(path):
        if name in files:
            result.append(os.path.join(root, name))
    return result
"""

#this will match a pattern with wildcards
"""
import os, fnmatch
def find(pattern, path):
    result = []
    for root, dirs, files in os.walk(path):
        for name in files:
            print (name)
            if fnmatch.fnmatch(name, pattern): #this supports wildcards
                result.append(os.path.join(root, name))
    return result
"""
#this just does a substring match
"""
import os
def find(pattern, path):
    result = []
    for root, dirs, files in os.walk(path):
        #print (root, dirs, files, '\n')
        for file in files:
            name = os.path.join(root, file)
            if pattern in name: #this is exact substring match
                #print (name)
                result.append(name)
    return result
"""
#this gives full regex
import os, re
def find(pattern, path):
    """Return every file below `path` whose full path matches `pattern`.

    pattern : regular expression; it is applied with `re.search` to the
              joined directory + file name, so it may match anywhere in
              the path unless it is anchored
    path    : directory that is walked recursively

    Returns a list of the matching paths, in `os.walk` order.
    """
    candidates = (
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(path)
        for entry in entries
    )
    return [candidate for candidate in candidates if re.search(pattern, candidate)]

#find(r'\.txt$', '/path/to/dir')  #the pattern is a regular expression, not a shell glob

### findDir()

In [130]:
import os
def findDir(pattern, path):
    """Return every directory at or below `path` whose path contains `pattern`.

    pattern : plain text; a directory matches when this is a substring of
              its path (no wildcards, no regular expressions)
    path    : directory that is walked recursively

    Returns a list of the matching directory paths, in `os.walk` order.
    """
    return [folder for folder, _subdirs, _files in os.walk(path) if pattern in folder]

### file operations

#### mkdir_p() and safe_open_w()

In [131]:
#the following is from:
#https://stackoverflow.com/questions/23793987/write-file-to-a-directory-that-doesnt-exist

import os, os.path
import errno

# Taken from https://stackoverflow.com/a/600612/119527
def mkdir_p(path):
    """Create directory `path` together with any missing parents.

    Does nothing when the directory already exists.  Raises `OSError` when
    `path` exists but is not a directory, or when it cannot be created.
    """
    #`exist_ok=True` replaces the hand-written errno.EEXIST check
    os.makedirs(path, exist_ok=True)

def safe_open_w(path):
    ''' Open "path" for writing, creating any parent directories as needed.

    Returns the open file object; the caller is responsible for closing it
    (preferably by using it in a `with` statement).
    '''
    parent = os.path.dirname(path)
    #a bare file name has no directory part; os.makedirs('') would raise,
    #so only create the parent when there is one
    if parent:
        mkdir_p(parent)
    return open(path, 'w')

#with safe_open_w('/Users/bill/output/output-text.txt') as f:
#    f.write(...)

### yaml2python()

In [132]:
#this will convert a yaml file to a python data structure.

import yaml

def yaml2python (file):
    """Read the YAML document stored at `file` and return it as python data.

    file : path of the YAML file to read

    Returns whatever `yaml.load` builds from the file contents.
    Raises `OSError` when the file cannot be opened or read; errors raised
    by the YAML parser are passed through to the caller.
    """
    #NOTE(review): FullLoader can construct more than plain data types.  If the
    #source files can ever come from an untrusted party, switch to
    #yaml.safe_load() instead.
    with open (file, 'r') as f: #`with` closes the file even when parsing fails
        return yaml.load(f.read(), Loader=yaml.FullLoader)

### is_valid_uuid()

In [133]:
#From:  https://stackoverflow.com/questions/19989481/how-to-determine-if-a-string-is-a-valid-v4-uuid

from uuid import UUID

def is_valid_uuid(uuid_to_test, version=4):
    """
    Check if uuid_to_test is a valid UUID.

    The value must be the canonical text form that `str(UUID(...))` produces
    (lower case, hyphenated); anything that `UUID` would have to normalise,
    including version bits that do not match `version`, is reported as invalid.

    Parameters
    ----------
    uuid_to_test : str
        Any other type (including `None`) is reported as invalid.
    version : {1, 2, 3, 4}

    Returns
    -------
    `True` if uuid_to_test is a valid UUID, otherwise `False`.

    Examples
    --------
    >>> is_valid_uuid('c9bf9e57-1685-4c89-bafb-ff5af830be8a')
    True
    >>> is_valid_uuid('c9bf9e58')
    False
    >>> is_valid_uuid(None)
    False
    """
    try:
        uuid_obj = UUID(uuid_to_test, version=version)
    except (ValueError, AttributeError, TypeError):
        #ValueError: malformed text or illegal version number
        #AttributeError / TypeError: the value is not a string at all
        return False

    #UUID() silently rewrites the version/variant bits, so only accept the
    #input when it survives the round trip unchanged
    return str(uuid_obj) == uuid_to_test

### generateDocumentBody()

In [134]:
#################################################
#look for MD header levels and increase them by the indicated amount
import re
def increaseHeaderLevel (mdText, addlLevels):
    """Push every Markdown `#` header in `mdText` down by `addlLevels` levels.

    mdText     : the Markdown text to adjust
    addlLevels : how many `#`s to add to each header; zero or a negative
                 number leaves the text untouched

    Returns the adjusted text.  For example, with `addlLevels` = 1 the line
    `# Title` becomes `## Title` and `## Sub` becomes `### Sub`.

    NOTE: every line that starts with `#` is treated as a header, including
    such lines inside fenced code blocks.
    """
    if addlLevels > 0:
        #the pattern consumes the first `#` of the line, so the replacement has
        #to put it back in addition to the extra levels
        mdText = re.sub (r'^#', '#' * (addlLevels + 1), mdText, flags=re.MULTILINE)
    return (mdText)

#################################################
#walk and print the tree within documentBody
# results    : the data to be returned
# srcData    : the source data
# docID      : the UUID of the document
# depth      : how many header levels to add
import re
def generateDocumentBody (srcData, docID, depth):
    """Walk the `body` structure of a document and return it as Markdown text.

    srcData : the source data; a list of elements, a dict, or plain text
    docID   : the UUID of the document; used to locate the document's
              directory below `srcDir`
    depth   : how many header levels to add to included Markdown files;
              increased by one for every nested list/dict

    List elements are handled by type:
      * list / dict   -> processed recursively, one level deeper
      * `*.md` name   -> the file is looked up in the document's directory
                         and its contents are included
      * `*.yaml` name -> recognised but not processed yet
      * UUIDv4        -> recognised but not processed yet
      * anything else -> assumed to be literal Markdown text

    A dict is recognised but not processed yet.
    """
    results = ""

    if isinstance(srcData, list):
        print ('List detected')

        #directories whose path contains the document ID; the first one is
        #used as the root when searching for included files
        dirList = findDir(docID, srcDir)
        #print (dirList)

        for element in srcData:
            if isinstance(element, (list, dict)):
                print ('Structure detected')
                #walk the next level down and increase the header depth
                results += generateDocumentBody (element, docID, depth+1)

            #the dot is escaped so that only a real `.md` extension matches
            elif re.search(r'\.md$', element, flags=re.IGNORECASE):
                print ('markdown: ' + element)

                #search for the file in this doc's dir tree.  `find()` uses the
                #name as a regular expression, which is why entries such as
                #`./Procedure.md` match; a document without its own directory
                #has nothing to search
                fnameList = find (element, dirList[0]) if dirList else []

                if len(fnameList) == 0:
                    print ('no files found')
                    continue #skip to the next file

                print (fnameList)
                try:
                    with open (fnameList[0], 'r') as f: #closed even if the read fails
                        #read in the MD file and push its `#` headers down by `depth` levels
                        results += increaseHeaderLevel (f.read(), depth)
                except OSError:
                    print ("Could not open/read file:", fnameList[0])
                    #sys.exit()

            elif re.search(r'\.yaml$', element, flags=re.IGNORECASE):
                print ('YAML: ' + element)
                #process the file
            elif is_valid_uuid(element):
                print ('UUIDv4: ' + element)
                #go find the file and then process the file
            else: #No idea what this is. Assuming its MD text
                print ('unknown: ' + element)
                results += element #assuming that if you are manually entering MD then you can control the formatting too
            results += '\n\n'

    elif isinstance(srcData, dict):
        print ('Dict detected')

    else:
        #print ('markdown:  no structure')
        results += srcData

    results += '\n\n'
    return (results)

### generatePandocContents()

In [135]:
#the pandoc file is the 'universal' format that all others will be generated from
#`data` = the full python data structure object

def _pandocTitleBlock (data):
    """Return the pandoc_title_block: one `%` line each for title, author and date."""
    lines = []
    for key in ("title", "author"):
        value = data.get(key)
        if value is None:
            value = "" #the block is positional, so a missing field still needs its own `%` line
        elif isinstance(value, (list, tuple)):
            value = "; ".join(str(item) for item in value) #pandoc separates authors with `;`
        lines.append(('% ' + str(value)).rstrip())

    #the document date is the most recent revision date.  This is a text
    #comparison, which orders ISO formatted dates (YYYY-MM-DD) correctly
    latest = max((str(element['date']) for element in data['revision']), default="")
    lines.append(('% ' + latest).rstrip())

    return '\n'.join(lines) + '\n'

def _yamlMetadataBlock (data):
    """Return the yaml_metadata_block holding the title/author/abstract/lang fields that are set."""
    block = "---\n" #this is required to start the yaml metadata block
    for key in ("title", "author", "abstract", "lang"):
        if data.get(key) is not None:
            #dumping a one-entry mapping gives `"key": "value"\n` for text and
            #a properly laid out block for lists
            block += yaml.dump({key: data[key]}, default_style='"')
    # need to deal with "keywords" specifically
    block += "...\n" #this is required to end the yaml metadata block
    return block

def _revisionTable (revisions):
    """Return the revision history as an HTML table, because CommonMark has no tables."""
    table = "<style>\n"
    table += "table { border-collapse: collapse; width: 100%; }\n"
    table += "td, th { border: 1px solid #dddddd; text-align: left; padding: 8px; }\n"
    table += "</style>\n"
    table += "\n"

    #Table layout:  "date", "name", "reason"
    table += "<table>\n"
    table += "<tr>\n"
    table += "<th>Date</th>\n"
    table += "<th>Name</th>\n"
    table += "<th>Reason</th>\n"
    table += "</tr>\n"
    for element in revisions: #one row per revision entry
        table += "<tr>\n"
        table += "<td>" + str(element['date']) + "</td>\n"
        table += "<td>" + str(element['name']) + "</td>\n"
        table += "<td>" + str(element['reason']) + "</td>\n"
        table += "</tr>\n"
    table += "</table>\n"
    return table

def generatePandocContents (data):
    """Build the pandoc flavoured Markdown text for one document.

    The pandoc file is the 'universal' format that all others are generated from.

    data : the full python data structure of the document.  `body`, `id` and
           `revision` (a list of entries with `date`, `name` and `reason`)
           are required; `title`, `author`, `abstract` and `lang` are optional

    Returns the document as one string made of, in order:
      1. the title block      (pandoc extension: pandoc_title_block)
      2. the metadata block   (pandoc extension: yaml_metadata_block)
      3. the document body
      4. a horizontal rule and the revision history table

    See Pandoc's Markdown [Metadata blocks](https://pandoc.org/MANUAL.html#metadata-blocks)
    """
    document = _pandocTitleBlock(data)

    document += "\n" #pandoc expects a blank line before the yaml metadata block
    document += _yamlMetadataBlock(data)
    document += "\n"

    #the body is appended; assigning here would throw away both metadata blocks
    document += generateDocumentBody (data['body'], data['id'], 0) #work through the possible formats and correctly print
    document += "\n"

    document += "---\n\n"
    document += "# Revision history\n\n"
    document += _revisionTable(data['revision'])

    #print (document)
    return (document)

### convert2format()

In [136]:
#convert the pandoc formatted text into the specified format

import subprocess

def convert2format(srcFile, dstFile, outputFormat):
    """Convert the pandoc formatted `srcFile` into `outputFormat`, written to `dstFile`.

    srcFile      : path of the pandoc Markdown file to convert
    dstFile      : path of the file to create
    outputFormat : one of the format names used as keys in `outputConfig`

    `html5` and `markdown_strict` are converted by running `pandoc`.
    `pdf`, `markdown` and `yaml` are quietly ignored; any other format name
    prints an error.  Nothing is returned.  A failing pandoc run is reported
    on stdout but does not stop the program.
    """
    if outputFormat in ('html5', 'markdown_strict'):
        completed = subprocess.run(['pandoc', srcFile, '-s', '--html-q-tags', '-f', 'markdown+yaml_metadata_block+pandoc_title_block', '-t', outputFormat, '-o', dstFile])
        if completed.returncode != 0: #do not let a failed conversion go unnoticed
            print ('Error:  pandoc exited with code ' + str(completed.returncode) + ' while converting ' + srcFile + ' to ' + outputFormat)
    elif outputFormat == 'pdf':
        pass #the pdf conversion is generating font errors
        #print ('Error:  PDF format is currently broken')
        #subprocess.run(['pandoc', srcFile, '-s', '--html-q-tags', '-t', outputFormat, '-o', dstFile])
    elif outputFormat not in ('markdown', 'yaml'): #error on all but these which we will quietly ignore
        print ('Error:  ' + outputFormat + ' is not supported')

## main program

### find the files that need to be processed

In [137]:
#look for all the yaml files

#srcFiles = find('*' + outputConfig['yaml']['ext'], srcDir)
#`find()` treats its first argument as a regular expression, so the extension
#is escaped and anchored to the end of the path.  A bare '.yaml' would also
#match paths such as 'notes-yaml.txt' or 'doc.yaml.bak'.
srcFiles = find(re.escape(outputConfig['yaml']['ext']) + '$', srcDir)
#srcFiles

### process each file

For simplicity, this is using the premise that each YAML document is compiled to an output dir based upon its ID.  The reason for this is to avoid orphan files from accumulating.  At some later point, the process should look for only what changed and adjust the specific destination dir accordingly.

This is the document summary data structure:

``` python
summaryDataDemo = {
    'type': { #`type` field: key index
        '1928374': { #`id` field: key index
            'title': 'text',
            'shortDescription': 'text',
            'url': {
                'html5': 'http://',
                'pdf': 'http://',
                'yaml': 'http://',
                'markdown': 'http://',
                'markdown_strict': 'http://'
            },
            'path': {
                'yaml': './dir/file.yaml'
            },
        },
        '9898472': { #`id` field: key index
            'title': 'text',
            'shortDescription': 'text',
            'url': {
                'html5': 'http://',
                'pdf': 'http://',
                'yaml': 'http://',
                'markdown': 'http://',
                'markdown_strict': 'http://'
            },
            'path': {
                'yaml': './dir/file.yaml'
            },
        }
    }
}
summaryDataDemo['type']['1928374']['url']['html5']

import yaml
print (yaml.dump(summaryDataDemo))
```

In [138]:
#TODO: delete the output dir tree before regenerating (not implemented yet)

summaryData = {} #start out with a blank dataset

#generate files: one output dir per YAML document, named after the document `id`
for file in srcFiles:
    if '.ipynb_checkpoints' in file: #Jupyter leaves these all over the place
        #print ('Skipping: ' + file)
        continue #goto the next file

    print (file)

    #read the file and convert it to a python data structure
    content = yaml2python (file)
    #print (content)

    if not isinstance(content, dict): #an empty file, or one that is not a mapping, has no `id` to work with
        print ('Skipping (not a YAML mapping): ' + file)
        continue

    #################################################

    #one dir structure per file
    outputDir = outputConfig['markdown']['dstPath'] + "/" + content['id']
    pandocFile = outputDir + "/" + content['id'] + outputConfig['markdown']['ext']

    pandocContents = generatePandocContents (content)
    #print (output)

    #save the document to file; the path is created if needed and the file
    #is closed even when the write fails
    with safe_open_w(pandocFile) as f:
        f.write(pandocContents)

    #do the format conversions
    for outputFormat in outputConfig:

        #construct the full destination file/path
        outputFile = outputDir + "/" + content['id'] + outputConfig[outputFormat]['ext']
        #print (outputFile)

        convert2format(pandocFile, outputFile, outputFormat)

    #################################################
    #save data that will be used for the summary.  Values that are already
    #present are kept, so the first file seen for a given type/id wins

    entry = summaryData.setdefault(content['type'], {}).setdefault(content['id'], {})

    if entry.get('title') is None:
        entry['title'] = content['title']

    if entry.get('abstract') is None:
        #the abstract is optional; the summary writers skip it when it is None
        entry['abstract'] = content.get('abstract')

    if entry.get('url') is None:
        entry['url'] = {}

    for outputFormat in outputConfig:
        if outputFormat == 'yaml':
            continue #skip to the next element in the loop
        #this is the relative url for the file
        url = '/Documentation/' + content['id'] + '/' + content['id'] + outputConfig[outputFormat]['ext']
        if entry['url'].get(outputFormat) is None:
            entry['url'][outputFormat] = url

    if entry.get('path') is None:
        entry['path'] = {}

    if entry['path'].get('yaml') is None:
        entry['path']['yaml'] = file
    
    
#summaryData

./src/32f2e7dc-0d00-4bd9-b39f-9d5b562f8415.yaml
./src/ad9c7149-fa91-4e62-a86b-fdc4387840d1.yaml
./src/1e7c647e-93d7-455d-b5a5-fd7205ca1b14.yaml
./src/cd498dae-9287-4432-91a3-97adf1ea4dd6.yaml
./src/c54c285f-eeb4-4a42-815f-9ea0656265e2.yaml
./src/ebd4a2a1-1c57-4dfb-b13b-c02355a40d74.yaml
./src/4480c89d-1da0-4f18-9f8b-f0238333e69a.yaml
./src/93a0be83-8e5a-4a46-b264-218646b412ce.yaml
./src/320af541-073e-4d76-ab49-d6db5991b48b/320af541-073e-4d76-ab49-d6db5991b48b.yaml
List detected
unknown: # placeholder text here
Structure detected
List detected
unknown: ## Mission
markdown: 320af541-073e-4d76-ab49-d6db5991b48b/ServiceCatalog.md
['./src/320af541-073e-4d76-ab49-d6db5991b48b/ServiceCatalog.md']
unknown: ## Responsibility and procedures
Structure detected
List detected
markdown: /OrgStructure.md
['./src/320af541-073e-4d76-ab49-d6db5991b48b/OrgStructure.md']
markdown: ./Procedure.md
['./src/320af541-073e-4d76-ab49-d6db5991b48b/Procedure.md']
markdown: /Recovery.md
['./src/320af541-073e-4d76-a

### generate summary file

In [139]:
#Build the document index, grouped by doc type.
#One bullet per document:  title (html link) (Pandoc) (MarkDown) abstract
indexParts = [
    '---\n',
    '"title": "Document index"\n',
    '"lang": "en"\n',
    '...\n\n',
]

for docType, docsOfType in summaryData.items(): #type
    indexParts.append(docType + '\n\n')

    for entry in docsOfType.values(): #id
        indexParts.append('* [' + entry['title'] + '](' + entry['url']['html5'] + ') ')
        indexParts.append('[(Pandoc)](' + entry['url']['markdown'] + ') ')
        indexParts.append('[(MarkDown)](' + entry['url']['markdown_strict'] + ') ')

        if entry['abstract'] is not None:
            indexParts.append(entry['abstract'])

        indexParts.append('\n')

    indexParts.append('\n')

document = ''.join(indexParts)
#print (document)

#save the file; the path is created if needed
mdFile = dstDir + "/index.pandoc"
with safe_open_w(mdFile) as f:
    f.write(document)

#convert to html
convert2format(mdFile, dstDir + "/index.html", 'html5')

### Generate list of source files

In [140]:
#Build the list of source files, grouped by doc type.
#One bullet per document:  title (link to the YAML source) abstract
readmeParts = []

for docType, docsOfType in summaryData.items(): #type
    readmeParts.append(docType + '\n\n')

    for entry in docsOfType.values(): #id
        readmeParts.append('* [' + entry['title'] + '](' + entry['path']['yaml'] + ') ')

        if entry['abstract'] is not None:
            readmeParts.append(entry['abstract'])

        readmeParts.append('\n')

    readmeParts.append('\n')

document = ''.join(readmeParts)
#print (document)

#save the file; the path is created if needed
mdFile = "./README.md"
with safe_open_w(mdFile) as f:
    f.write(document)