In [1]:
import re
import unicodedata

def normalize_slug(s: str, repl="-"):
    """Turn free text into a lowercase slug made of ``[a-z0-9]`` and ``repl``.

    Steps: NFKC-normalize and lowercase the text, replace every run of
    characters outside ``[a-z0-9]`` with ``repl``, collapse repeated
    separators, and trim separators from both ends.

    Args:
        s: Text to convert. ``None`` is accepted and yields ``""``.
        repl: Separator inserted between alphanumeric runs. It is always
            treated as literal text, so regex metacharacters (".", "+",
            "\\", ...) and multi-character separators are safe. An empty
            string simply removes the non-alphanumeric characters.

    Returns:
        The slug, or ``"na"`` when nothing alphanumeric remains, or ``""``
        when ``s`` is ``None``.
    """
    if s is None:
        return ""
    s = unicodedata.normalize("NFKC", s).lower()

    # A callable replacement keeps `repl` literal: a plain string would be
    # interpreted as a replacement template (backslash escapes, group refs).
    def _literal_sep(_match):
        return repl

    # Replace every run of non-alphanumeric characters with one separator.
    s = re.sub(r"[^a-z0-9]+", _literal_sep, s)

    if repl:
        # Escape the separator before embedding it in a pattern; unescaped,
        # "." would match any character and "+" would not even compile.
        sep = re.escape(repl)
        # Collapse repeated separators (grouped so multi-character separators
        # are repeated as a unit).
        s = re.sub(rf"(?:{sep}){{2,}}", _literal_sep, s)
        # Trim whole leading/trailing separators.
        s = re.sub(rf"^(?:{sep})+|(?:{sep})+$", "", s)
    return s or "na"


In [3]:
from transformers import AutoTokenizer

# Tokenizer for the "sentence-transformers/all-MiniLM-L6-v2" checkpoint. Only
# the tokenizer is created here (no model is loaded in this cell). `tok` is
# the module-level object that chunk_by_tokens() uses to encode and decode text.
tok = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def chunk_by_tokens(text: str, max_tokens=256, overlap=32):
    """Yield overlapping token windows of ``text`` as decoded strings.

    The text is tokenized once with the module-level ``tok`` (without special
    tokens) and then walked in windows of at most ``max_tokens`` tokens, each
    window starting ``overlap`` tokens before the previous one ended.

    Args:
        text: Text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared between consecutive chunks; must
            satisfy ``0 <= overlap < max_tokens`` so every window advances.

    Yields:
        ``(start, end, chunk_text)`` where ``start``/``end`` are token offsets
        into the encoded text (``end`` exclusive) and ``chunk_text`` is the
        decoded window. Empty text yields nothing.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range. Because
            this is a generator, the error surfaces on the first iteration.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        # overlap >= max_tokens would never advance `start` (infinite loop);
        # a negative overlap would silently skip tokens between chunks.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, "
            f"got overlap={overlap}, max_tokens={max_tokens}"
        )

    # NOTE: encoding a long text can log a "sequence length is longer than the
    # specified maximum" warning; the ids are only sliced and decoded here.
    ids = tok.encode(text, add_special_tokens=False)
    n = len(ids)
    start = 0
    while start < n:
        end = min(start + max_tokens, n)
        # NOTE(review): the decoded text may not be byte-identical to the
        # original substring, depending on the tokenizer — confirm if exact
        # character offsets are ever needed.
        chunk_text = tok.decode(ids[start:end], skip_special_tokens=True)
        yield start, end, chunk_text
        if end == n:
            break
        # Step back by `overlap`; validation above guarantees forward progress.
        start = end - overlap

In [14]:
def pdf_to_chunks_tokenwise(pdf_path, meta, out_path, max_tokens=256, overlap=32):
    """Split a PDF into per-page token chunks and write them as JSON Lines.

    Each page's text is NFKC-normalized, whitespace-collapsed and passed to
    ``chunk_by_tokens``. Every chunk becomes one JSON object (one per line)
    carrying the chunk text plus document metadata.

    Args:
        pdf_path: Path to the PDF file.
        meta: Document metadata. ``"country"``, ``"visa_type"``, ``"year"``
            and ``"doc_slug"`` are required (they form the document id).
            ``"source"``, ``"url"``, ``"version"`` and the effective date
            (``"effectivedate"`` or ``"effective_date"``) are optional.
            Side effect: ``meta["docsha256"]`` is set to the file's SHA-256.
        out_path: Destination ``.jsonl`` file; parent directories are created.
        max_tokens: Maximum tokens per chunk, forwarded to ``chunk_by_tokens``.
        overlap: Token overlap between chunks, forwarded to ``chunk_by_tokens``.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        KeyError: If a required ``meta`` key is missing.
    """
    import hashlib
    import json
    import time
    from pathlib import Path

    import fitz

    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # SHA-256 of the file, read in 1 MiB blocks.
    h = hashlib.sha256()
    with open(pdf_path, "rb") as f:
        for b in iter(lambda: f.read(1 << 20), b""):
            h.update(b)
    docsha = h.hexdigest()
    # Kept for backward compatibility: the caller's dict receives the hash.
    meta["docsha256"] = docsha

    # Everything below is identical for all chunks, so compute it once.
    country_slug = normalize_slug(meta["country"])
    visa_slug = normalize_slug(meta["visa_type"])
    year_slug = normalize_slug(str(meta["year"]))
    doc_slug = normalize_slug(meta["doc_slug"])
    docid = f"{country_slug}-{visa_slug}-{year_slug}-{doc_slug}"
    # One timestamp per run so all chunks of a document agree.
    retrieved_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    # Accept both spellings of the effective-date key; the output key is
    # "effectivedate" either way.
    effective_date = meta.get("effectivedate", meta.get("effective_date"))

    all_chunks = []
    seq = 1  # running chunk number across the whole document, not per page

    doc = fitz.open(str(pdf_path))
    try:
        for pagenum in range(len(doc)):
            page = doc[pagenum]
            text = page.get_text("text")
            # Normalize whitespace/control chars to spaces first
            text = unicodedata.normalize("NFKC", text)
            text = re.sub(r"\s+", " ", text).strip()

            for tstart, tend, ctext in chunk_by_tokens(text, max_tokens, overlap):
                chunkid = f"{docid}-Pg{pagenum+1}-seq{seq:03d}"
                chunkmeta = {
                    "chunkid": chunkid,
                    "docid": docid,
                    "source": meta.get("source"),
                    "url": meta.get("url"),
                    "country": meta.get("country"),
                    # Same key that builds the document id above.
                    "visa_type": meta.get("visa_type"),
                    "effectivedate": effective_date,
                    "version": meta.get("version"),
                    "docsha256": docsha,
                    "retrievedat": retrieved_at,
                    "page": pagenum + 1,
                    "pages": pagenum + 1,
                    "sectiontitle": None,
                    "language": "en",
                    "token_start": int(tstart),
                    "token_end": int(tend),
                    "text": ctext,
                }
                all_chunks.append(chunkmeta)
                seq += 1
    finally:
        # Release the PDF handle even if extraction or chunking fails.
        doc.close()

    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        for ch in all_chunks:
            f.write(json.dumps(ch, ensure_ascii=False) + "\n")
    print(f"Saved {len(all_chunks)} chunks to {out_path}")

In [15]:
# Example run: chunk one PDF into a JSONL file with pdf_to_chunks_tokenwise().
if __name__ == "__main__":
    # NOTE(review): absolute, machine-specific Windows paths — this cell only
    # runs where these exact locations exist.
    pdf_file = r"E:\Info_Srping\swiftvisa\data\raw\UK_Student_Visa.pdf"
    output_file = r"E:\Info_Srping\swiftvisa\data\processed\UK_StudentVisa_chunks_new.jsonl"

    # Document-level metadata passed to pdf_to_chunks_tokenwise().
    # NOTE(review): "doc_slug" says 2024 while "year" says 2025 — confirm
    # which one is intended.
    meta_info = {
        "country": "UK",
        "visa_type": "Student and Child Student",
        "year": "2025",
        "doc_slug": "UK Student Visa Guide 2024",
        "source": "Student and Child Student",
        "url": "https://gov.uk/student-visa",
        "effective_date": "2025-07-16",
        "version": "11.0"
    }

    # Uses the function's default max_tokens/overlap values.
    pdf_to_chunks_tokenwise(pdf_file, meta_info, output_file)

Token indices sequence length is longer than the specified maximum sequence length for this model (2899 > 512). Running this sequence through the model will result in indexing errors


Saved 253 chunks to E:\Info_Srping\swiftvisa\data\processed\UK_StudentVisa_chunks_new.jsonl
