In [22]:

# document id used in TRC
DOC_ID = 'ezXh6dvgLV1nzevBm9aHDA'

import os 
# Flat list of page records; extract_document_info() appends to it.
flattened_document_meta = []
def extract_document_info(document_toc_array:list[dict], sec_prefix=None):
    """Flatten a nested table-of-contents tree into ``flattened_document_meta``.

    Each TOC node is appended to the module-level ``flattened_document_meta``
    list *before* its descendants (pre-order), so the index computed for a
    node is really the position at which that node is stored.

    Args:
        document_toc_array: sibling TOC nodes. Each node is a dict that must
            provide ``'title'``, ``'prettyUrl'`` and ``'tocId'``; ``'children'``
            (a list of nodes of the same shape) is optional.
        sec_prefix: section path of the parent node, as a list of 0-based
            sibling positions. ``None`` (the default) means top level.

    Returns:
        The indices in ``flattened_document_meta`` of the nodes in
        ``document_toc_array``, in the same order.

    Raises:
        KeyError: if a node lacks ``'title'``, ``'prettyUrl'`` or ``'tocId'``.
    """
    if sec_prefix is None:
        sec_prefix = []
    document_idx_array = []

    for sub_idx, document_meta in enumerate(document_toc_array):
        section = sec_prefix + [sub_idx]
        document_idx = len(flattened_document_meta)
        page_entry = {
            'title': document_meta['title'],
            'section': section,
            'filename' : os.path.basename(document_meta['prettyUrl']),
            'url': f'https://trc-techresource.mastercard.com/r/{DOC_ID}/{document_meta["tocId"]}',
            'children': [],
        }
        # Reserve this node's slot first: recursing while the dict literal is
        # still being built would append all descendants ahead of the parent
        # and make document_idx point at the wrong record.
        flattened_document_meta.append(page_entry)
        page_entry['children'] = extract_document_info(document_meta.get('children') or [], section)
        document_idx_array.append(document_idx)
    return document_idx_array

import json
# Read the nested table-of-contents tree. The encoding is given explicitly so
# the result does not depend on the platform's default text encoding.
with open('DMAS_TOC_MAP.json', encoding='utf-8') as f:
    toc_meta = json.load(f)

# Rebind to a fresh list so re-running this cell does not append every page a
# second time; extract_document_info() fills it as a side effect.
flattened_document_meta = []
_ = extract_document_info(toc_meta)

In [23]:
# Page URL -> position of that page in flattened_document_meta.
url2page_map = {page_meta['url']:idx for idx, page_meta in enumerate(flattened_document_meta)}

# Image URL -> position of that image in img_map.
url2img_map = {}
import json 
# Explicit encoding keeps the read independent of the platform default.
with open('DMAS_IMG_MAP.json', encoding='utf-8') as f:
    img_map = json.load(f)

for idx, img_info in enumerate(img_map):
    id_ = img_info['id']
    # Two URL forms are built per image id (API content URL and viewer URL);
    # both are registered against the same image index.
    src_url = f'https://trc-techresource.mastercard.com/api/khub/maps/{DOC_ID}/resources/{id_}/content'
    view_url = f'https://trc-techresource.mastercard.com/viewer/attachment/{DOC_ID}/{id_}'
    url2img_map[src_url] = idx
    url2img_map[view_url] = idx

import re
def get_urls(chunk_content:str, img_lookup=None, page_lookup=None):
    """Replace known image and page URLs in ``chunk_content`` with placeholders.

    Every ``https://`` URL (up to the next ``)`` or whitespace) is looked up:
    a hit in the image map becomes ``$image_<idx>$``, a hit in the page map
    becomes ``$page_<idx>$``, and anything else is left unchanged. The image
    map is consulted first.

    Args:
        chunk_content: text to rewrite.
        img_lookup: mapping of image URL -> image index. Defaults to the
            module-level ``url2img_map``.
        page_lookup: mapping of page URL -> page index. Defaults to the
            module-level ``url2page_map``.

    Returns:
        ``chunk_content`` with the recognised URLs replaced.
    """
    if img_lookup is None:
        img_lookup = url2img_map
    if page_lookup is None:
        page_lookup = url2page_map

    def _placeholder(match):
        # Decide the replacement for one matched URL.
        url = match.group(0)
        if url in img_lookup:
            return f'$image_{img_lookup[url]}$'
        if url in page_lookup:
            return f'$page_{page_lookup[url]}$'
        return url

    # A single substitution pass rewrites each URL exactly where it occurs, so
    # a known URL that is a prefix of a longer URL cannot corrupt the longer one.
    return re.sub(r'https://[^)\s]*', _placeholder, chunk_content)

In [24]:
# convert each HTML page to Markdown
from markdownify import markdownify
from tqdm import tqdm
# Set to True to rewrite .md files that already exist on disk.
OVERRIDE = False
# Make sure the output directory exists before the first write.
os.makedirs('md_content', exist_ok=True)
for page_meta in tqdm(flattened_document_meta, ncols=100):
    with open('html_content/' + page_meta['filename'], mode='r', encoding='utf-8', errors='surrogatepass') as f:
        file_content = f.read()
    # The page title becomes the top-level heading of the Markdown file.
    markdown_content = f'# {page_meta["title"]}\n'
    markdown_content += markdownify(file_content, heading_style='ATX')
    # clean urls: drop the query string. The quantifier has to sit inside the
    # capture group; with it outside, the pattern only matched URLs of the
    # form "https://x?..." and left every real query string in place.
    markdown_content = re.sub(r'(https://[^)\s?]+)\?[^)\s]*', r'\1', markdown_content)
    # replace urls
    markdown_content = get_urls(markdown_content)
    # count tokens (whitespace-separated words)
    page_meta['tokens'] = len(markdown_content.split())
    page_meta['non_header_lines'] = len([line for line in markdown_content.splitlines() if not line.startswith('#')])
    page_meta['md'] = page_meta['filename']+'.md'
    # dump; an existing file is kept unless OVERRIDE is set
    outfile_path = 'md_content/' + page_meta['filename']+'.md'
    if (not os.path.exists(outfile_path)) or OVERRIDE:
        with open(outfile_path, mode='w', encoding='utf-8', errors='surrogatepass') as md_f:
            md_f.write(markdown_content)

100%|██████████████████████████████████████████████████████████| 1981/1981 [00:15<00:00, 127.02it/s]


In [25]:
# Fields exported for each page, in the order they appear in every record.
select_keys = [
    'section',
    'title',
    'md',
    'tokens',
    'non_header_lines',
    'children',
]

# Write one trimmed-down record per page, keeping only the selected fields.
with open('DMAS_FLAT_PAGE_META.json', 'w') as f:
    exported_pages = [
        dict((key, page_meta[key]) for key in select_keys)
        for page_meta in flattened_document_meta
    ]
    json.dump(exported_pages, f, indent=4)