### Constants

In [1]:
# Content directory: one HTML file per page
# content\{idx}_{filename}.html
# NOTE(review): the chunking cell opens content\{filename} without an {idx}_ prefix — confirm the layout

# Images directory
# images\{idx}_{filename}.jpg

# Metadata: table-of-contents tree of the document
# DMAS_TOC_MAP.json

# Plan: iterate recursively through the TOC map and,
# for each item, record its title, HTML filename, URL and the indices of its child pages.

# Each page record should connect a document to other documents (later, also to its images)
# dict['filename', 'url', 'children': [page_nums], 'images': []]

# Document id used in TRC; it is interpolated into every page and image URL built below
DOC_ID = 'ezXh6dvgLV1nzevBm9aHDA'

### Building Page Metadata (without the images part)

In [4]:
import os

flattened_document_meta = []


def extract_document_info(document_toc_array: list[dict], flattened: list = None, doc_id: str = None):
    """Flatten a nested TOC tree into a list of page records, parents before children.

    Each TOC entry becomes one dict with the keys ``page_idx``, ``title``,
    ``filename``, ``url`` and ``children`` (the ``page_idx`` values of its direct
    children). A record's ``page_idx`` always equals its position in the output list.

    Args:
        document_toc_array: TOC entries; each needs ``title``, ``prettyUrl`` and
            ``tocId``, and may carry a nested ``children`` list.
        flattened: list the records are appended to. Defaults to the module-level
            ``flattened_document_meta``.
        doc_id: document id used to build page URLs. Defaults to the module-level
            ``DOC_ID``.

    Returns:
        The ``page_idx`` values of the entries in ``document_toc_array`` (not of
        their descendants), in input order.
    """
    # Resolve defaults at call time so that re-binding the module-level names
    # (the loading cell resets flattened_document_meta) is picked up.
    if flattened is None:
        flattened = flattened_document_meta
    if doc_id is None:
        doc_id = DOC_ID

    document_idx_array = []
    for document_meta in document_toc_array:
        document_idx = len(flattened)
        page_record = {
            'page_idx': document_idx,
            'title': document_meta['title'],
            'filename': os.path.basename(document_meta['prettyUrl']),
            'url': f'https://trc-techresource.mastercard.com/r/{doc_id}/{document_meta["tocId"]}',
            'children': [],
        }
        # Append the parent BEFORE recursing. Recursing first would append the
        # descendants ahead of it, so the parent would no longer sit at position
        # `document_idx` and its first child would receive the same page_idx.
        flattened.append(page_record)
        page_record['children'] = extract_document_info(
            document_meta.get('children') or [], flattened, doc_id
        )
        document_idx_array.append(document_idx)
    return document_idx_array

In [5]:
import json

# Read the table-of-contents tree of the document.
with open('DMAS_TOC_MAP.json') as toc_file:
    toc_meta = json.loads(toc_file.read())

# Reset the accumulator so that re-running this cell does not append the pages twice,
# then flatten the tree; the returned top-level indices are not needed here.
flattened_document_meta = []
_ = extract_document_info(toc_meta)

In [6]:
# Display the flattened page records built by extract_document_info.
flattened_document_meta

[{'page_idx': 0,
  'title': 'Mastercard Network Processing Dual Message Authorization System Guide',
  'filename': 'fql1729791739642.html',
  'url': 'https://trc-techresource.mastercard.com/r/ezXh6dvgLV1nzevBm9aHDA/3zTtbwzpjdpdCQGa_JAHNg',
  'children': []},
 {'page_idx': 1,
  'title': 'How to use this guide',
  'filename': 'kwn1724863424441.html',
  'url': 'https://trc-techresource.mastercard.com/r/ezXh6dvgLV1nzevBm9aHDA/V~ItyuIWcR_n93UdP2RTjQ',
  'children': []},
 {'page_idx': 2,
  'title': 'Introduction to Mastercard switches for processing',
  'filename': 'nze1724869144978.html',
  'url': 'https://trc-techresource.mastercard.com/r/ezXh6dvgLV1nzevBm9aHDA/JFsYQABhOjdzkRTN12wEww',
  'children': []},
 {'page_idx': 3,
  'title': 'Introduction to Mastercard Dual Message Authorization System processing',
  'filename': 'iki1724869178217.html',
  'url': 'https://trc-techresource.mastercard.com/r/ezXh6dvgLV1nzevBm9aHDA/YJX2Je9ED2GFq2YOqlC5xg',
  'children': []},
 {'page_idx': 4,
  'title': '

### Hyperlinks
#### Images

In [7]:
# Lookup from image hyperlink to the image's position in DMAS_IMG_MAP.json.
# In the chunk text the hyperlink is replaced by a short placeholder;
# the image itself is meant to be attached to chunks at query time.

url2img_map = {}
import json

with open('DMAS_IMG_MAP.json') as img_map_file:
    img_map = json.load(img_map_file)

for idx, img_info in enumerate(img_map):
    resource_id = img_info['id']
    # The same image is registered under two URL forms (content API and attachment viewer);
    # both map to the same index.
    src_url = f'https://trc-techresource.mastercard.com/api/khub/maps/{DOC_ID}/resources/{resource_id}/content'
    view_url = f'https://trc-techresource.mastercard.com/viewer/attachment/{DOC_ID}/{resource_id}'
    url2img_map[src_url] = idx
    url2img_map[view_url] = idx

#### Text

In [8]:
# Lookup from page hyperlink to page id; the id is the record's position
# in flattened_document_meta.
page_urls = dict(
    (page_record['url'], position)
    for position, page_record in enumerate(flattened_document_meta)
)

In [9]:
import re


def get_urls(chunk_content: str, image_url_map: dict = None, page_url_map: dict = None):
    """Replace known image/page hyperlinks in a chunk with index placeholders.

    Every ``https://`` URL (up to the next ``)`` or whitespace) is looked up as a
    whole. A known image URL becomes ``$image_<idx>$`` and a known page URL becomes
    ``$page_<idx>$``; unknown URLs are left untouched.

    Args:
        chunk_content: markdown text of one chunk.
        image_url_map: URL -> image index. Defaults to the module-level ``url2img_map``.
        page_url_map: URL -> page index. Defaults to the module-level ``page_urls``.

    Returns:
        ``(rewritten_text, page_links, images_idx)`` where the two lists hold the
        distinct page / image indices referenced by the chunk, sorted ascending.
    """
    # Resolve defaults at call time so the current module-level maps are used.
    if image_url_map is None:
        image_url_map = url2img_map
    if page_url_map is None:
        page_url_map = page_urls

    images_idx = set()
    page_links = set()

    def _placeholder(match):
        url = match.group(0)
        token = None
        if url in image_url_map:
            img_idx = image_url_map[url]
            images_idx.add(img_idx)
            # Use the image's own index here, not the accumulating set.
            token = f'$image_{img_idx}$'
        if url in page_url_map:
            page_idx = page_url_map[url]
            page_links.add(page_idx)
            if token is None:
                token = f'$page_{page_idx}$'
        return url if token is None else token

    # Substituting per whole match (rather than str.replace on the full text)
    # keeps a longer URL that merely starts with a known URL intact.
    chunk_content = re.sub(r'https://[^)\s]*', _placeholder, chunk_content)
    return chunk_content, sorted(page_links), sorted(images_idx)

### Chunking

In [74]:
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
CHUNK_SIZE = 4000
# The splitter's chunk_overlap parameter is an int; 0.2*CHUNK_SIZE alone would be the float 800.0.
CHUNK_OVERLAP = int(0.2 * CHUNK_SIZE)


def split_page_by_size(page_text: str):
    """Split a markdown page into size-bounded chunks, repeating its leading headings.

    A page of at most ``CHUNK_SIZE`` characters is returned as a single chunk.
    Longer pages are split with ``RecursiveCharacterTextSplitter``. Every chunk
    after the first is prefixed with the page's leading heading lines, so such a
    chunk can be longer than what the splitter produced.

    Args:
        page_text: markdown text of one page.

    Returns:
        List of chunk strings in document order.
    """
    if len(page_text) <= CHUNK_SIZE:
        return [page_text]

    # Collect the uninterrupted run of heading lines at the very top of the page;
    # the first line that does not start with '#' ends the run.
    headings = []
    for line in page_text.splitlines():
        if not line.startswith('#'):
            break
        headings.append(line)

    recursive_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )

    final_chunks = []
    for position, sub in enumerate(recursive_splitter.split_text(page_text)):
        # The first chunk already begins with the headings; only later chunks need them.
        if position > 0:
            sub = '\n'.join(headings + [sub])
        final_chunks.append(sub)
    return final_chunks

In [75]:
# chunking 
# split by headings if possible else do overlap chunking 
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
CHUNK_SIZE = 4000
# The splitter's chunk_overlap parameter is an int; 0.2*CHUNK_SIZE alone would be the float 800.0.
CHUNK_OVERLAP = int(0.2 * CHUNK_SIZE)


def split_page_by_headers(page_text: str):
    """Split a markdown page on '#' / '##' headings, then by size where needed.

    The page is first split with ``MarkdownHeaderTextSplitter`` (headings are kept
    in the text). A section longer than ``CHUNK_SIZE`` characters is split again
    with ``RecursiveCharacterTextSplitter``. Every piece after the first is then
    prefixed with the section's heading lines rebuilt from the splitter metadata.

    Args:
        page_text: markdown text of one page.

    Returns:
        List of chunk strings in document order.
    """
    # Only level-1 and level-2 headings start a new section.
    headers_to_split_on = [
        ("#", "#"),
        ("##", "##"),
    ]
    md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
    recursive_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )

    final_chunks = []
    for chunk in md_splitter.split_text(page_text):
        if len(chunk.page_content) <= CHUNK_SIZE:
            final_chunks.append(chunk.page_content)
            continue

        # Rebuild the heading lines ("# ..." / "## ...") this section sits under.
        heading_lines = []
        for marker in ('#', '##'):
            heading_text = chunk.metadata.get(marker)
            if heading_text:
                heading_lines.append(f'{marker} {heading_text}')
        title_str = '\n'.join(heading_lines)

        for position, sub in enumerate(recursive_splitter.split_text(chunk.page_content)):
            # The first piece still contains its own headings (strip_headers=False).
            if position > 0 and title_str.strip():
                sub = '\n'.join([title_str, sub])
            final_chunks.append(sub)

    return final_chunks

In [None]:
from markdownify import markdownify
import glob
import re
import os

from tqdm import tqdm

# Start from a clean output directory so chunk files from a previous run cannot linger.
os.makedirs('chunks', exist_ok=True)
for stale_chunk in glob.glob(os.path.join('chunks', '*.md')):
    os.remove(stale_chunk)

for page_meta in tqdm(flattened_document_meta, ncols=100):
    page_path = os.path.join('content', page_meta['filename'])
    with open(page_path, mode='r', encoding='utf-8', errors='surrogatepass') as f:
        file_content = f.read()

    markdown_content = markdownify(file_content, heading_style='ATX')
    # Remove query params from URLs so they can be matched against the known page/image URLs.
    # The whole URL up to the first '?' is captured; the '?...' tail is dropped.
    markdown_content = re.sub(r'(https://[^)\s?]+)\?[^)\s]+', r'\1', markdown_content)

    # Skip pages that contain nothing but heading lines.
    if all(line.startswith('#') for line in markdown_content.splitlines()):
        continue

    prefix = f"# {page_meta['title']}"
    page_chunks = []
    for chunk_idx, chunk_content in enumerate(split_page_by_size(markdown_content)):
        # Skip chunks that contain nothing but heading lines.
        if all(line.startswith('#') for line in chunk_content.splitlines()):
            continue

        # Replace known hyperlinks with placeholders and collect what the chunk links to.
        chunk_content, page_links, images_idx = get_urls(chunk_content)

        # Add the page title to each chunk.
        chunk_content = prefix + '\n' + chunk_content
        chunk_filename = f"page_{page_meta['page_idx']}_chunk_{chunk_idx}.md"
        chunk_path = os.path.join('chunks', chunk_filename)
        with open(chunk_path, mode='w', encoding='utf-8', errors='surrogatepass') as cf:
            cf.write(chunk_content)

        page_chunks.append(
            {
                'chunk_data_filename': chunk_filename,
                'page_links': page_links,
                'images_idx': images_idx
            }
        )

    # TODO: page_links point at pages, not chunks; how to go from page links to chunks?
    page_meta['chunks'] = page_chunks

100%|███████████████████████████████████████████████████████████| 1981/1981 [01:44<00:00, 18.97it/s]


In [80]:
# Persist the page records (including the 'chunks' entries added by the chunking loop)
# as pretty-printed JSON.
with open('DMAS_CHUNK_META.json', 'w') as chunk_meta_file:
    chunk_meta_file.write(json.dumps(flattened_document_meta, indent=4))

In [42]:
import pandas as pd


def measure(file):
    """Load a whitespace-separated "<size> <file>" listing into a DataFrame.

    Each line of ``file`` is stripped and split on whitespace. The first field is
    converted to ``int`` and the two columns are named ``size`` and ``file``.

    Args:
        file: path of the text listing to read.

    Returns:
        DataFrame with an integer ``size`` column and a ``file`` column.
    """
    with open(file) as listing:
        rows = [raw_line.strip().split() for raw_line in listing]

    table = pd.DataFrame(rows)
    table[0] = table.iloc[:, 0].astype(int)
    table.columns = ['size', 'file']
    return table

In [43]:
# Summary statistics of the 'size' column for the listing in lens.txt.
# NOTE(review): presumably sizes of the chunk files from an earlier chunking run — confirm how lens.txt was produced.
measure('lens.txt').describe()

Unnamed: 0,size
count,3389.0
mean,147.429035
std,178.426239
min,5.0
25%,31.0
50%,84.0
75%,176.0
max,1207.0


In [51]:
# Summary statistics of the 'size' column for the listing in lens_new.txt.
# NOTE(review): presumably sizes of the chunk files after re-chunking — confirm how lens_new.txt was produced.
measure('lens_new.txt').describe()

Unnamed: 0,size
count,3014.0
mean,165.6858
std,184.232041
min,5.0
25%,49.0
50%,97.0
75%,195.0
max,1234.0
