In [1]:
import os, re, json, unicodedata
from collections import defaultdict, OrderedDict
import pandas as pd

In [3]:
# Pipeline configuration. These constants are used as default argument
# values by the functions defined below and by the final driver cell.
INPUT_FOLDER = "files"  # root directory walked recursively by build_chunks
MAPPING_PATH = "src/role_mapping.json"   # JSON file read by load_role_map (role -> keyword list)
OUT_CHUNKS = "chunks.json"  # output file: list of chunk records produced by build_chunks
OUT_ROLE_MAP = "role_to_chunks.json"  # output file: role -> list of chunk ids
CHUNK_SIZE = 512  # maximum number of whitespace-separated tokens per chunk
CHUNK_OVERLAP = 0 # number of tokens shared between consecutive chunks

In [5]:
def read_file_raw(path):
    """Return the entire contents of ``path`` as text.

    The file is decoded as UTF-8. Because it is opened with
    ``errors="replace"``, byte sequences that are not valid UTF-8 are
    substituted with the Unicode replacement character instead of raising.
    """
    with open(path, mode="r", errors="replace", encoding="utf-8") as source:
        contents = source.read()
    return contents

def read_csv(path):
    """Flatten a CSV file into pipe-delimited text, one line per row.

    The first output line is the header (column names joined with " | "),
    followed by one line per data row with its cells joined the same way.

    Every cell is read as a string. ``keep_default_na=False`` stops pandas
    from treating literal cell text such as "NA", "null" or "None" as
    missing data; without it those cells would be blanked out by the
    ``fillna("")`` below and their text lost. Genuinely empty cells still
    come out as empty strings.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    str
        Header line and row lines joined with newlines.
    """
    # fillna("") is kept as a guard for any remaining missing values.
    df = pd.read_csv(path, dtype=str, keep_default_na=False).fillna("")
    lines = [" | ".join(map(str, df.columns))]
    # itertuples yields plain tuples and avoids building a Series per row
    # the way iterrows does.
    lines.extend(
        " | ".join(map(str, row))
        for row in df.itertuples(index=False, name=None)
    )
    return "\n".join(lines)

In [6]:
# Matches a markdown ATX heading on a single line: group 1 is the run of
# 1-6 '#' characters (its length is the heading level), group 2 is the
# heading text.
#   * The separator is limited to spaces/tabs ([ \t]*) so a bare "##" line
#     cannot reach across the newline and take the next line as its text.
#   * (?!#) rejects runs of seven or more '#', which are not headings.
#   * \S requires the heading text to start with a visible character.
# A missing space after the hashes ("#Title") is still accepted.
HEADING_RE = re.compile(r'^(#{1,6})(?!#)[ \t]*(\S.*)$', flags=re.MULTILINE)

In [7]:
def extract_title_and_sections(md_text, filename):
    """Split markdown text into a document title and heading-delimited sections.

    Parameters
    ----------
    md_text : str
        Raw markdown text; CRLF and lone CR line endings are normalised to LF.
    filename : str
        File name (or path) of the document. Its base name without extension
        is used as the title when the text has no usable level-1 heading.

    Returns
    -------
    tuple[str, list[dict]]
        ``(title, sections)`` where each section is
        ``{"heading": str | None, "content": str}``.

        * If no heading is found, a single section with ``heading=None``
          holds the whole (stripped) text.
        * Otherwise there is one section per heading match, in document
          order, whose content runs up to the next heading (or end of text).
        * Non-empty text that appears before the first heading is kept as a
          leading section with ``heading=None`` so it is not lost.
    """
    text = md_text.replace("\r\n", "\n").replace("\r", "\n")
    fallback_title = os.path.splitext(os.path.basename(filename))[0]

    matches = list(HEADING_RE.finditer(text))
    if not matches:
        return fallback_title, [{"heading": None, "content": text.strip()}]

    # Title = text of the first level-1 heading; fall back to the file name
    # when there is none or its text is empty after stripping.
    first_h1_text = next(
        (m.group(2).strip() for m in matches if len(m.group(1)) == 1),
        None,
    )
    title = first_h1_text or fallback_title

    sections = []

    # Preserve any introduction that precedes the first heading.
    preamble = text[:matches[0].start()].strip()
    if preamble:
        sections.append({"heading": None, "content": preamble})

    for i, m in enumerate(matches):
        # A section body spans from the end of its heading line to the start
        # of the next heading, or to the end of the document for the last one.
        body_end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        sections.append({
            "heading": m.group(2).strip(),
            "content": text[m.end():body_end].strip(),
        })
    return title, sections

In [8]:
def clean_text(text):
    """Normalise text for chunking.

    Steps, in order:

    1. Unicode NFKC normalisation.
    2. Control characters (everything below 0x20 except tab, LF and CR,
       plus DEL) are replaced with a space.
    3. CRLF / CR line endings become LF.
    4. Runs of spaces and tabs collapse to a single space.
    5. Each line is stripped of leading/trailing whitespace.
    6. Three or more consecutive newlines collapse to one blank line.
    7. The whole result is stripped.

    Lines are stripped *before* blank lines are collapsed, so lines that
    contain only whitespace count as blank and cannot leave behind runs of
    three or more newlines.

    Parameters
    ----------
    text : str or None
        Input text; ``None`` is treated as empty.

    Returns
    -------
    str
        The cleaned text (``""`` for ``None``).
    """
    if text is None:
        return ""
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # '+' (not '{2,}') so a lone tab is normalised the same way as a run.
    text = re.sub(r'[ \t]+', ' ', text)
    text = "\n".join(line.strip() for line in text.splitlines())
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()

In [10]:
def tokenize(text):
    """Split ``text`` into whitespace-delimited tokens.

    Falsy input (``None`` or an empty string) yields an empty list.
    """
    return text.split() if text else []

def detokenize(tokens):
    """Join ``tokens`` back into one string, separated by single spaces."""
    separator = " "
    return separator.join(tokens)

In [11]:
def chunk_tokens(tokens, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split a token list into consecutive windows of at most ``chunk_size``.

    Parameters
    ----------
    tokens : list
        Tokens to split. An empty list yields an empty result.
    chunk_size : int
        Maximum number of tokens per chunk; must be positive.
    overlap : int
        Number of tokens each chunk shares with the previous one. Negative
        values are treated as 0 so that no tokens are skipped between
        chunks. Values greater than or equal to ``chunk_size`` cannot
        advance the window, so chunking falls back to no overlap.

    Returns
    -------
    list[list]
        The chunks, in order. Iteration stops as soon as a window reaches
        the end of ``tokens``, so no trailing chunk is emitted that would
        only repeat the tail of the previous one.

    Raises
    ------
    ValueError
        If ``tokens`` is non-empty and ``chunk_size`` is not positive.
    """
    if not tokens:
        return []
    if chunk_size <= 0:
        # A zero step would make range() fail with an obscure message and a
        # negative one would silently produce no chunks at all.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    overlap = max(overlap, 0)
    step = chunk_size - overlap
    if step <= 0:
        step = chunk_size

    total = len(tokens)
    chunks = []
    for start in range(0, total, step):
        end = start + chunk_size
        chunks.append(tokens[start:end])
        if end >= total:
            break
    return chunks

In [12]:
def load_role_map(path=MAPPING_PATH):
    """Load the role -> keywords mapping from a JSON file.

    The JSON object is read with its key order preserved. For every role,
    the string entries of its value are lower-cased; non-string entries
    are dropped.

    Returns an ``OrderedDict`` mapping each role to its list of lower-cased
    keywords.
    """
    with open(path, "r", encoding="utf-8") as handle:
        raw_mapping = json.load(handle, object_pairs_hook=OrderedDict)
    return OrderedDict(
        (role, [kw.lower() for kw in keywords if isinstance(kw, str)])
        for role, keywords in raw_mapping.items()
    )

In [13]:
def roles_for_folder(folder_name, role_map):
    """Return the roles associated with a folder name.

    A role is selected when one of its keywords equals the lower-cased
    folder name (``None`` is treated as an empty name). The "C-Level" role
    is always included when it is a key of ``role_map``. The result keeps
    ``role_map`` order, has no duplicates, and lists "C-Level" last unless
    it already matched by keyword.
    """
    target = (folder_name or "").lower()
    matched = [
        role
        for role, keywords in role_map.items()
        if any(keyword == target for keyword in keywords)
    ]
    if "C-Level" in role_map:
        matched.append("C-Level")
    # dict.fromkeys keeps the first occurrence of each role, in order.
    return list(dict.fromkeys(matched))

In [15]:
def build_chunks(input_folder=INPUT_FOLDER, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Walk ``input_folder`` and turn every supported file into text chunks.

    Supported extensions are ``.md`` / ``.markdown`` (split into sections by
    heading), ``.csv`` (flattened to pipe-delimited text) and ``.txt``; any
    other file is skipped. A file that fails to load is reported with a
    warning and skipped. Each section is prefixed with the document title and
    its heading, cleaned, tokenised on whitespace and cut into chunks.

    Parameters
    ----------
    input_folder : str
        Root directory to walk recursively.
    chunk_size : int
        Maximum number of tokens per chunk.
    overlap : int
        Number of tokens shared between consecutive chunks.

    Returns
    -------
    tuple[list[dict], dict[str, list[str]]]
        ``(all_chunks, role_to_chunkids)``. Each chunk is
        ``{"id", "text", "meta"}``; the second value maps each role to the
        ids of the chunks that role may access.

    Notes
    -----
    Directories and files are visited in sorted order so that ``doc_id`` and
    therefore the chunk ids are reproducible across runs and machines.
    """
    role_map = load_role_map()
    all_chunks = []
    role_to_chunkids = defaultdict(list)

    doc_id = 0
    for root, dirs, files in os.walk(input_folder):
        # os.walk lists sub-directories in arbitrary order; sorting the list
        # in place fixes the traversal order (top-down walk honours it).
        dirs.sort()
        for fname in sorted(files):
            path = os.path.join(root, fname)
            ext = os.path.splitext(fname)[1].lower()
            try:
                if ext in [".md", ".markdown"]:
                    raw = read_file_raw(path)
                    title, sections = extract_title_and_sections(raw, fname)
                elif ext == ".csv":
                    raw = read_csv(path)
                    title = os.path.splitext(fname)[0]
                    sections = [{"heading": None, "content": raw}]
                elif ext in [".txt"]:
                    raw = read_file_raw(path)
                    title = os.path.splitext(fname)[0]
                    sections = [{"heading": None, "content": raw}]
                else:
                    # Unsupported file type: skipped without consuming a doc_id.
                    continue
            except Exception as e:
                print(f"[warning] failed to read {path}: {e}")
                continue

            # The department is the name of the directory that directly
            # contains the file.
            # NOTE(review): for nested sub-folders this is the innermost
            # directory name only - confirm that matches the folder layout.
            department = os.path.basename(root).lower()
            accessible_roles = roles_for_folder(department, role_map)

            for s_idx, sec in enumerate(sections):
                sec_heading = sec.get("heading")
                sec_text = sec.get("content", "")
                # Title and heading are prepended so every section's text
                # carries its document context.
                combined_text = " ".join([title or "", sec_heading or "", sec_text or ""]).strip()
                cleaned = clean_text(combined_text)
                tokens = tokenize(cleaned)
                token_chunks = chunk_tokens(tokens, chunk_size=chunk_size, overlap=overlap)

                for c_idx, tk in enumerate(token_chunks):
                    chunk_id = f"doc{doc_id}_sec{s_idx}_chunk{c_idx}"
                    chunk_text = detokenize(tk)
                    meta = {
                        "chunk_id": chunk_id,
                        "source": path,
                        "filename": fname,
                        "doc_id": doc_id,
                        "section_index": s_idx,
                        "section_heading": sec_heading,
                        "title": title,
                        "department": department,
                        # Copy so chunks do not share one mutable list.
                        "roles": list(accessible_roles),
                        "token_count": len(tk)
                    }
                    all_chunks.append({"id": chunk_id, "text": chunk_text, "meta": meta})
                    for r in accessible_roles:
                        role_to_chunkids[r].append(chunk_id)

            doc_id += 1

    return all_chunks, dict(role_to_chunkids)

In [16]:
if __name__ == "__main__":
    # Build all chunks, then persist the chunk list and the role -> chunk-id
    # index as pretty-printed UTF-8 JSON (non-ASCII characters kept as-is).
    chunks, role_map_out = build_chunks()
    for out_path, payload in ((OUT_CHUNKS, chunks), (OUT_ROLE_MAP, role_map_out)):
        with open(out_path, "w", encoding="utf-8") as sink:
            json.dump(payload, sink, ensure_ascii=False, indent=2)