In [None]:
#ATTEMPT 1
import os
from pathlib import Path
from pypdf import PdfReader


# ----------- PDF to TEXT Extraction ----------- #
def extract_text(pdf_path, out_folder="extracted"):
    os.makedirs(out_folder, exist_ok=True)

    text = ""
    reader = PdfReader(pdf_path)

    for page in reader.pages:
        text += page.extract_text() + "\n\n"

    out_file = os.path.join(out_folder, Path(pdf_path).stem + ".txt")

    with open(out_file, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"[EXTRACTED] ‚Üí {out_file}")
    return out_file


if __name__ == "__main__":
    pdfs = [
        "C:\\Users\\kriti\\OneDrive\\Desktop\\Infosys\\code\\Dataset\\UK\\Graduate.pdf",
        "C:\\Users\\kriti\\OneDrive\\Desktop\\Infosys\\code\Dataset\\UK\\SkilledWorker.pdf",
        "C:\\Users\\kriti\\OneDrive\\Desktop\\Infosys\\code\Dataset\\UK\\HealthCare.pdf",
        "C:\\Users\\kriti\\OneDrive\\Desktop\\Infosys\\code\Dataset\\UK\\Student.pdf",
        "C:\\Users\\kriti\\OneDrive\\Desktop\\Infosys\\code\\Dataset\\UK\\Visitor.pdf"
    ]

    for pdf in pdfs:
        extract_text(pdf)


  "C:\\Users\\kriti\\OneDrive\\Desktop\\Infosys\\code\Dataset\\UK\\SkilledWorker.pdf",
  "C:\\Users\\kriti\\OneDrive\\Desktop\\Infosys\\code\Dataset\\UK\\HealthCare.pdf",
  "C:\\Users\\kriti\\OneDrive\\Desktop\\Infosys\\code\Dataset\\UK\\Student.pdf",


[EXTRACTED] ‚Üí extracted\Graduate.txt
[EXTRACTED] ‚Üí extracted\SkilledWorker.txt
[EXTRACTED] ‚Üí extracted\HealthCare.txt
[EXTRACTED] ‚Üí extracted\Student.txt
[EXTRACTED] ‚Üí extracted\Visitor.txt


In [None]:
#ATTEMPT 2
import os
import re
import json

# ================== CONFIG ==================

# Where you want the new chunks to be saved
OUTPUT_DIR = r"C:\Users\kriti\OneDrive\Desktop\Infosys\chunks"

# Chunk configuration (you can tune these)
MAX_WORDS = 250        # target max words per chunk
OVERLAP_WORDS = 60     # how many words overlap between chunks


# ================== UTILS ==================

def read_text(path: str) -> str:
    """Read text file with UTF-8."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def simple_sentence_split(text: str):
    """
    Rough sentence splitter based on punctuation.
    Not perfect, but good enough for semantic-ish chunking.
    """
    text = re.sub(r'\s+', ' ', text).strip()
    sentences = re.split(r'(?<=[\.!?])\s+', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences


def build_chunks_from_sentences(sentences, max_words=MAX_WORDS, overlap_words=OVERLAP_WORDS):
    """
    Core semantic+size+overlap chunker.
    We accumulate sentences until we reach max_words,
    then create a chunk and start next one with overlap.
    """
    chunks = []
    n = len(sentences)
    start = 0

    while start < n:
        word_count = 0
        end = start

        # Grow chunk until we hit max_words
        while end < n:
            sent_words = len(sentences[end].split())
            if word_count + sent_words > max_words:
                break
            word_count += sent_words
            end += 1

        if end == start:
            # Single very long sentence ‚Äì force include
            end = start + 1

        chunk_sentences = sentences[start:end]
        chunk_text = " ".join(chunk_sentences).strip()
        chunks.append(chunk_text)

        if end >= n:
            break

        # ---- Overlap calculation ----
        overlap_count = 0
        new_start = end

        for i in range(end - 1, start - 1, -1):
            overlap_count += len(sentences[i].split())
            if overlap_count >= overlap_words:
                new_start = i
                break

        start = new_start

    return chunks


def process_one_document(visa_type: str, full_path: str):
    """
    Read TXT directly from full_path, perform semantic+overlap chunking,
    and save as JSON with metadata.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    text_path = full_path  # we are using absolute path directly
    if not os.path.exists(text_path):
        print(f"‚ö† File not found: {text_path}")
        return

    print(f"\nProcessing {visa_type} ‚Üí {text_path}")

    raw_text = read_text(text_path)
    sentences = simple_sentence_split(raw_text)

    if not sentences:
        print(f"‚ö† No sentences found in {text_path}")
        return

    chunks_text = build_chunks_from_sentences(
        sentences,
        max_words=MAX_WORDS,
        overlap_words=OVERLAP_WORDS
    )

    # Attach metadata
    chunk_objs = []
    for idx, ch in enumerate(chunks_text, start=1):
        words_in_chunk = len(ch.split())
        chunk_objs.append({
            "id": f"{visa_type}_{idx}",
            "visa_type": visa_type,
            "chunk_index": idx,
            "source_file": text_path,
            "word_count": words_in_chunk,
            "max_words": MAX_WORDS,
            "overlap_words": OVERLAP_WORDS,
            "text": ch
        })

    out_path = os.path.join(OUTPUT_DIR, f"{visa_type}.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(chunk_objs, f, indent=2, ensure_ascii=False)

    print(f"‚úî {visa_type}: {len(chunk_objs)} chunks saved ‚Üí {out_path}")


# ================== MAIN ==================

if __name__ == "__main__":
    # Map visa_type to FULL TXT paths (not PDFs)
    docs = {
        "Graduate":      r"C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\Graduate.txt",
        "SkilledWorker": r"C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\SkilledWorker.txt",
        "HealthCare":    r"C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\HealthCare.txt",
        "Student":       r"C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\Student.txt",
        "Visitor":       r"C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\Visitor.txt"
    }

    for visa_type, path in docs.items():
        process_one_document(visa_type, path)

    print("\n‚úÖ Semantic + size + overlap chunking completed for all docs.")



Processing Graduate ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\Graduate.txt


KeyboardInterrupt: 

In [2]:
#ATTEMPT 3
import os
import re
import json

# ================== CONFIG ==================

# Folder where your .txt files are stored
EXTRACTED_DIR = r"C:\Users\kriti\OneDrive\Desktop\Infosys\extracted"

# Folder where you want chunks to be saved
OUTPUT_DIR = r"C:\Users\kriti\OneDrive\Desktop\Infosys\chunks"

# Chunk configuration (tune if needed)
MAX_WORDS = 250        # target max words per chunk
OVERLAP_WORDS = 80     # approx words to overlap between chunks


# ================== UTILS ==================

def read_text(path: str) -> str:
    """Read text file with UTF-8."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def split_into_sentences(text: str):
    """
    Robust-ish sentence splitter:
    - Normalize spaces
    - Split on . ? !
    - Further split very long segments by ; : and ,
    This makes long legal paragraphs more manageable.
    """
    # normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # rough sentence split on punctuation
    rough = re.split(r"(?<=[\.!?])\s+", text)

    sentences = []
    for seg in rough:
        seg = seg.strip()
        if not seg:
            continue

        # if segment is very long, break on ; or :
        if len(seg.split()) > 120:
            parts = re.split(r"(?<=[;:])\s+", seg)
        else:
            parts = [seg]

        for part in parts:
            part = part.strip()
            if not part:
                continue

            # if still huge, break again by comma
            if len(part.split()) > 120:
                subparts = re.split(r",\s+", part)
                for sp in subparts:
                    sp = sp.strip()
                    if sp:
                        sentences.append(sp)
            else:
                sentences.append(part)

    return sentences


def build_chunks_from_sentences(sentences, max_words=MAX_WORDS, overlap_words=OVERLAP_WORDS):
    """
    Build chunks from a list of sentences:
    - Each chunk up to max_words
    - Overlap approx overlap_words
    - GUARANTEED forward progress (no infinite loops)
    """
    chunks = []
    n = len(sentences)
    idx = 0
    chunk_num = 0

    while idx < n:
        start_idx = idx
        words = 0
        end_idx = idx

        # grow chunk until max_words reached
        while end_idx < n:
            sent_len = len(sentences[end_idx].split())
            if words + sent_len > max_words and words > 0:
                break
            words += sent_len
            end_idx += 1

        # safety: ensure at least one sentence is included
        if end_idx == start_idx:
            end_idx = min(start_idx + 1, n)
            words = len(sentences[start_idx].split())

        # build the chunk text
        chunk_text = " ".join(sentences[start_idx:end_idx]).strip()
        chunks.append(chunk_text)
        chunk_num += 1

        if chunk_num % 10 == 0:
            print(f"   ‚è≥ Chunks created so far: {chunk_num}")

        # if we've reached the end, stop
        if end_idx >= n:
            break

        # ----- compute new start with overlap -----
        overlap_count = 0
        j = end_idx - 1

        # walk backwards from end_idx-1 until we accumulate overlap_words
        while j > start_idx and overlap_count < overlap_words:
            overlap_count += len(sentences[j].split())
            j -= 1

        overlap_start_idx = j + 1  # first sentence to keep for overlap

        # GUARANTEE forward movement
        if overlap_start_idx <= start_idx:
            idx = end_idx   # no useful overlap, just move on
        else:
            idx = overlap_start_idx

    print(f"   ‚úÖ Total chunks built: {len(chunks)}")
    return chunks


def process_one_document(visa_type: str, filename: str):
    """
    Read TXT from EXTRACTED_DIR, perform semantic chunking with overlap,
    save JSON with metadata to OUTPUT_DIR.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    text_path = os.path.join(EXTRACTED_DIR, filename)
    if not os.path.exists(text_path):
        print(f"‚ö† File not found for {visa_type}: {text_path}")
        return

    print(f"\nProcessing {visa_type} ‚Üí {text_path}")

    raw_text = read_text(text_path)
    sentences = split_into_sentences(raw_text)

    if not sentences:
        print(f"‚ö† No sentences found for {visa_type} in {text_path}")
        return

    chunks_text = build_chunks_from_sentences(sentences)

    # Attach metadata
    chunk_objs = []
    for idx, ch in enumerate(chunks_text, start=1):
        words_in_chunk = len(ch.split())
        chunk_objs.append({
            "id": f"{visa_type}_{idx}",
            "visa_type": visa_type,
            "chunk_index": idx,
            "source_file": filename,
            "word_count": words_in_chunk,
            "max_words": MAX_WORDS,
            "overlap_words": OVERLAP_WORDS,
            "text": ch
        })

    out_path = os.path.join(OUTPUT_DIR, f"{visa_type}.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(chunk_objs, f, indent=2, ensure_ascii=False)

    print(f"‚úî {visa_type}: {len(chunk_objs)} chunks saved ‚Üí {out_path}")


# ================== MAIN ==================

if __name__ == "__main__":
    # visa_type ‚Üí corresponding TXT filename in EXTRACTED_DIR
    docs = {
        "Graduate":      "Graduate.txt",
        "SkilledWorker": "SkilledWorker.txt",
        "HealthCare":    "HealthCare.txt",
        "Student":       "Student.txt",
        "Visitor":       "Visitor.txt"
    }

    for visa_type, fname in docs.items():
        process_one_document(visa_type, fname)

    print("\n‚úÖ Semantic + size + overlap chunking completed for all docs.")



Processing Graduate ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\Graduate.txt
   ‚úÖ Total chunks built: 7
‚úî Graduate: 7 chunks saved ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\chunks\Graduate.json

Processing SkilledWorker ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\SkilledWorker.txt
   ‚úÖ Total chunks built: 7
‚úî SkilledWorker: 7 chunks saved ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\chunks\SkilledWorker.json

Processing HealthCare ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\HealthCare.txt
   ‚úÖ Total chunks built: 7
‚úî HealthCare: 7 chunks saved ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\chunks\HealthCare.json

Processing Student ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\Student.txt
   ‚úÖ Total chunks built: 6
‚úî Student: 6 chunks saved ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\chunks\Student.json

Processing Visitor ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\Visitor.txt
   ‚úÖ Total chunks built: 9
‚úî Visitor: 9 ch

In [1]:
#ATTEMPT 4 ‚Üí UNIFIED CHUNK JSON

import os
import re
import json

# ================== CONFIG ==================

EXTRACTED_DIR = r"C:\Users\kriti\OneDrive\Desktop\Infosys\extracted"
OUTPUT_DIR = r"C:\Users\kriti\OneDrive\Desktop\Infosys\chunks"

MAX_WORDS = 250
OVERLAP_WORDS = 80


# ================== UTILS ==================

def read_text(path: str) -> str:
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def split_into_sentences(text: str):
    text = re.sub(r"\s+", " ", text).strip()
    rough = re.split(r"(?<=[\.!?])\s+", text)

    sentences = []
    for seg in rough:
        seg = seg.strip()
        if not seg:
            continue

        if len(seg.split()) > 120:
            parts = re.split(r"(?<=[;:])\s+", seg)
        else:
            parts = [seg]

        for part in parts:
            part = part.strip()
            if not part:
                continue

            if len(part.split()) > 120:
                subparts = re.split(r",\s+", part)
                sentences.extend([sp.strip() for sp in subparts if sp.strip()])
            else:
                sentences.append(part)

    return sentences


def build_chunks_from_sentences(sentences, max_words=MAX_WORDS, overlap_words=OVERLAP_WORDS):
    chunks = []
    n = len(sentences)
    idx = 0
    chunk_num = 0

    while idx < n:
        start_idx = idx
        words = 0
        end_idx = idx

        while end_idx < n:
            sent_len = len(sentences[end_idx].split())
            if words + sent_len > max_words and words > 0:
                break
            words += sent_len
            end_idx += 1

        if end_idx == start_idx:
            end_idx = min(start_idx + 1, n)
            words = len(sentences[start_idx].split())

        chunk_text = " ".join(sentences[start_idx:end_idx]).strip()
        chunks.append(chunk_text)
        chunk_num += 1

        if end_idx >= n:
            break

        overlap_count = 0
        j = end_idx - 1

        while j > start_idx and overlap_count < overlap_words:
            overlap_count += len(sentences[j].split())
            j -= 1

        overlap_start_idx = j + 1

        if overlap_start_idx <= start_idx:
            idx = end_idx
        else:
            idx = overlap_start_idx

    return chunks


# ================== MAIN PROCESSING ==================

def process_all_documents():
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    docs = {
        "Graduate":      "Graduate.txt",
        "SkilledWorker": "SkilledWorker.txt",
        "HealthCare":    "HealthCare.txt",
        "Student":       "Student.txt",
        "Visitor":       "Visitor.txt"
    }

    all_chunks = []   # STORE EVERYTHING HERE

    for visa_type, fname in docs.items():
        text_path = os.path.join(EXTRACTED_DIR, fname)

        if not os.path.exists(text_path):
            print(f"‚ö† File not found: {text_path}")
            continue

        print(f"\nProcessing {visa_type} ‚Üí {text_path}")

        raw_text = read_text(text_path)
        sentences = split_into_sentences(raw_text)

        if not sentences:
            print(f"‚ö† No sentences found for {visa_type}")
            continue

        chunks_text = build_chunks_from_sentences(sentences)

        # Add metadata for each chunk
        for idx, ch in enumerate(chunks_text, start=1):
            all_chunks.append({
                "id": f"{visa_type}_{idx}",
                "visa_type": visa_type,
                "chunk_index": idx,
                "source_file": fname,
                "word_count": len(ch.split()),
                "max_words": MAX_WORDS,
                "overlap_words": OVERLAP_WORDS,
                "text": ch
            })

        print(f"‚úî {visa_type}: {len(chunks_text)} chunks created.")

    # SAVE ONE UNIFIED JSON
    out_path = os.path.join(OUTPUT_DIR, "all_visa_chunks.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, indent=2, ensure_ascii=False)

    print(f"\nüéâ ALL DONE! Unified chunk file saved ‚Üí {out_path}")
    print(f"Total chunks: {len(all_chunks)}")


if __name__ == "__main__":
    process_all_documents()



Processing Graduate ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\Graduate.txt
‚úî Graduate: 7 chunks created.

Processing SkilledWorker ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\SkilledWorker.txt
‚úî SkilledWorker: 7 chunks created.

Processing HealthCare ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\HealthCare.txt
‚úî HealthCare: 7 chunks created.

Processing Student ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\Student.txt
‚úî Student: 6 chunks created.

Processing Visitor ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\extracted\Visitor.txt
‚úî Visitor: 9 chunks created.

üéâ ALL DONE! Unified chunk file saved ‚Üí C:\Users\kriti\OneDrive\Desktop\Infosys\chunks\all_visa_chunks.json
Total chunks: 36


In [2]:
import fitz  # PyMuPDF
import json
import re
import difflib
from collections import defaultdict

# ===============================
# CONFIG PATHS
# ===============================

PDF_DIR = r"C:\Users\kriti\OneDrive\Desktop\Infosys\code\Dataset\UK"
CHUNK_JSON = r"C:\Users\kriti\OneDrive\Desktop\Infosys\chunks\all_visa_chunks.json"

PDF_FILES = {
    "Graduate":      "Graduate.pdf",
    "SkilledWorker": "SkilledWorker.pdf",
    "HealthCare":    "HealthCare.pdf",
    "Student":       "Student.pdf",
    "Visitor":       "Visitor.pdf",
}

# ===============================
# PDF TEXT EXTRACTOR
# ===============================

def extract_pdf_text(path):
    doc = fitz.open(path)
    text = ""

    for page in doc:
        text += page.get_text()

    # Normalize spacing
    text = re.sub(r"\s+", " ", text).strip()
    return text


# ===============================
# LOAD CHUNK JSON
# ===============================

with open(CHUNK_JSON, "r", encoding="utf-8") as f:
    chunks = json.load(f)

# Group chunks by visa_type
chunk_map = defaultdict(str)
for ch in chunks:
    chunk_map[ch["visa_type"]] += " " + ch["text"]

print("Loaded unified chunk JSON.")
print(f"Visa types found: {list(chunk_map.keys())}\n")


# ===============================
# COVERAGE COMPARISON FUNCTION
# ===============================

def text_similarity(a, b):
    """Return similarity ratio between two long strings."""
    return difflib.SequenceMatcher(None, a, b).ratio()


def compare_pdf_and_chunks(visa_type, pdf_text, chunk_text):
    print(f"\n===============================")
    print(f"Comparing {visa_type} PDF vs Chunks")
    print(f"===============================\n")

    # Normalize both
    pdf_clean = re.sub(r"\s+", " ", pdf_text).strip().lower()
    chunk_clean = re.sub(r"\s+", " ", chunk_text).strip().lower()

    # Compute similarity
    sim = text_similarity(pdf_clean, chunk_clean)
    print(f"Similarity Score: {sim*100:.2f}%")

    # Coverage: how much PDF text appears in chunks
    pdf_words = pdf_clean.split()
    missing_words = [w for w in pdf_words if w not in chunk_clean]

    miss_ratio = len(missing_words) / len(pdf_words)
    coverage = 1 - miss_ratio

    print(f"Coverage Score: {coverage*100:.2f}%")

    # Show example missing words
    if len(missing_words) > 0:
        print("\nExample missing words (sample of 20):")
        print(missing_words[:20])
    else:
        print("\nPerfect coverage! No words missing.")

    return sim, coverage


# ===============================
# RUN COMPARISON FOR ALL PDFS
# ===============================

results = {}

for visa_type, pdf_name in PDF_FILES.items():
    print(f"\nExtracting PDF text for: {visa_type}")

    pdf_path = f"{PDF_DIR}\\{pdf_name}"
    pdf_text = extract_pdf_text(pdf_path)

    chunk_text = chunk_map[visa_type]

    sim, coverage = compare_pdf_and_chunks(visa_type, pdf_text, chunk_text)

    results[visa_type] = {
        "similarity": sim,
        "coverage": coverage
    }

# ===============================
# FINAL REPORT
# ===============================

print("\n\n===============================")
print("FINAL PDF vs CHUNK COMPARISON REPORT")
print("===============================\n")

for visa, vals in results.items():
    print(f"{visa}:")
    print(f"   Similarity: {vals['similarity']*100:.2f}%")
    print(f"   Coverage:   {vals['coverage']*100:.2f}%\n")

print("üéâ Comparison completed successfully!")


Loaded unified chunk JSON.
Visa types found: ['Graduate', 'SkilledWorker', 'HealthCare', 'Student', 'Visitor']


Extracting PDF text for: Graduate

Comparing Graduate PDF vs Chunks

Similarity Score: 72.35%
Coverage Score: 100.00%

Perfect coverage! No words missing.

Extracting PDF text for: SkilledWorker

Comparing SkilledWorker PDF vs Chunks

Similarity Score: 47.54%
Coverage Score: 100.00%

Perfect coverage! No words missing.

Extracting PDF text for: HealthCare

Comparing HealthCare PDF vs Chunks

Similarity Score: 44.49%
Coverage Score: 100.00%

Perfect coverage! No words missing.

Extracting PDF text for: Student

Comparing Student PDF vs Chunks

Similarity Score: 64.57%
Coverage Score: 100.00%

Perfect coverage! No words missing.

Extracting PDF text for: Visitor

Comparing Visitor PDF vs Chunks

Similarity Score: 32.50%
Coverage Score: 100.00%

Perfect coverage! No words missing.


FINAL PDF vs CHUNK COMPARISON REPORT

Graduate:
   Similarity: 72.35%
   Coverage:   100.00%

Sk

In [3]:
import json
import re
import numpy as np
from collections import defaultdict, Counter

# =======================
# LOAD YOUR UNIFIED JSON
# =======================
PATH = r"C:\Users\kriti\OneDrive\Desktop\Infosys\chunks\all_visa_chunks.json"

with open(PATH, "r", encoding="utf-8") as f:
    chunks = json.load(f)

print(f"Loaded {len(chunks)} chunks.\n")


# =======================
# 1. BASIC CHUNK STATS
# =======================
word_counts = [c["word_count"] for c in chunks]

print("üìå Basic Chunk Statistics")
print("---------------------------")
print(f"Total Chunks: {len(chunks)}")
print(f"Average Chunk Size: {np.mean(word_counts):.2f} words")
print(f"Minimum Chunk Size: {np.min(word_counts)} words")
print(f"Maximum Chunk Size: {np.max(word_counts)} words")
print(f"Std Deviation: {np.std(word_counts):.2f} words\n")


# =======================
# 2. CHUNK SIZE CONSISTENCY SCORE
# =======================
def chunk_balance_score(wc):
    ideal_min, ideal_max = 200, 300
    ok = sum(1 for x in wc if ideal_min <= x <= ideal_max)
    return ok / len(wc)

balance_score = chunk_balance_score(word_counts)

print("üìå Chunk Size Consistency")
print("---------------------------")
print(f"Balance Score (200‚Äì300 words): {balance_score*100:.2f}% chunks well sized\n")


# =======================
# 3. SEMANTIC INTEGRITY SCORE
# =======================
def starts_well(text):
    # good if starts with capital letter, not mid-sentence
    return bool(re.match(r"^[A-Z0-9]", text.strip()))

def ends_well(text):
    return text.strip().endswith(('.', '?', '!'))

start_ok = sum(starts_well(c["text"]) for c in chunks)
end_ok = sum(ends_well(c["text"]) for c in chunks)

semantic_integrity = ((start_ok + end_ok) / (2 * len(chunks)))

print("üìå Semantic Integrity")
print("---------------------------")
print(f"Starts at a natural boundary: {start_ok}/{len(chunks)}")
print(f"Ends at a natural sentence:   {end_ok}/{len(chunks)}")
print(f"Semantic Integrity Score:     {semantic_integrity*100:.2f}%\n")


# =======================
# 4. OVERLAP CONTINUITY SCORE
# =======================
def overlap_ratio(prev, curr):
    prev_words = prev.split()[-40:]   # last 40 words of previous
    curr_words = curr.split()[:40]    # first 40 words of current
    intersect = len(set(prev_words) & set(curr_words))
    return intersect / 40

visa_groups = defaultdict(list)
for c in chunks:
    visa_groups[c["visa_type"]].append(c)

overlap_scores = []
for visa_type, group in visa_groups.items():
    group = sorted(group, key=lambda x: x["chunk_index"])
    for i in range(1, len(group)):
        score = overlap_ratio(group[i-1]["text"], group[i]["text"])
        overlap_scores.append(score)

print("üìå Overlap Continuity")
print("---------------------------")
if overlap_scores:
    print(f"Average Overlap Ratio: {np.mean(overlap_scores)*100:.2f}%")
else:
    print("Not enough chunks to compute overlap.")
print()


# =======================
# 5. VISA-WISE CHUNK DISTRIBUTION
# =======================
visa_count = Counter([c["visa_type"] for c in chunks])

print("üìå Visa-wise Chunk Count")
print("---------------------------")
for visa, count in visa_count.items():
    print(f"{visa}: {count} chunks")
print()


# =======================
# 6. COVERAGE SCORE (APPROX)
# =======================
# Coverage is approximated as: no chunk extremely small + no missing sections
coverage_good = sum(1 for w in word_counts if w > 100)
coverage_score = coverage_good / len(word_counts)

print("üìå Approximate Coverage Score")
print("---------------------------")
print(f"Coverage (chunks > 100 words): {coverage_score*100:.2f}%\n")


# =======================
# 7. ANOMALY DETECTION
# =======================
print("üìå Anomaly Check")
print("---------------------------")
for visa, count in visa_count.items():
    if count < 3:
        print(f"‚ö† Warning: {visa} has very few chunks ‚Üí extraction may be incomplete.")
if np.min(word_counts) < 80:
    print("‚ö† Warning: Some chunks have unusually low word count (<80).")
if balance_score < 0.70:
    print("‚ö† Warning: Many chunks fall outside ideal size range.")
else:
    print("No major anomalies detected üëç")

print("\nüéâ Chunk Evaluation Completed Successfully!")


Loaded 36 chunks.

üìå Basic Chunk Statistics
---------------------------
Total Chunks: 36
Average Chunk Size: 223.42 words
Minimum Chunk Size: 148 words
Maximum Chunk Size: 250 words
Std Deviation: 27.31 words

üìå Chunk Size Consistency
---------------------------
Balance Score (200‚Äì300 words): 86.11% chunks well sized

üìå Semantic Integrity
---------------------------
Starts at a natural boundary: 31/36
Ends at a natural sentence:   25/36
Semantic Integrity Score:     77.78%

üìå Overlap Continuity
---------------------------
Average Overlap Ratio: 20.00%

üìå Visa-wise Chunk Count
---------------------------
Graduate: 7 chunks
SkilledWorker: 7 chunks
HealthCare: 7 chunks
Student: 6 chunks
Visitor: 9 chunks

üìå Approximate Coverage Score
---------------------------
Coverage (chunks > 100 words): 100.00%

üìå Anomaly Check
---------------------------
No major anomalies detected üëç

üéâ Chunk Evaluation Completed Successfully!


In [1]:
import fitz  # PyMuPDF
import json
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# =========================
# CONFIG
# =========================

PDF_DIR = r"C:\Users\kriti\OneDrive\Desktop\Infosys\code\Dataset\UK"
CHUNK_JSON = r"C:\Users\kriti\OneDrive\Desktop\Infosys\chunks\all_visa_chunks.json"

PDF_FILES = {
    "Graduate": "Graduate.pdf",
    "SkilledWorker": "SkilledWorker.pdf",
    "HealthCare": "HealthCare.pdf",
    "Student": "Student.pdf",
    "Visitor": "Visitor.pdf"
}

MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
SIM_THRESHOLD = 0.70   # semantic match threshold

# =========================
# LOAD EMBEDDING MODEL
# =========================

print("Loading embedding model...")
model = SentenceTransformer(MODEL_NAME)
print("Model loaded.\n")

# =========================
# PDF TEXT EXTRACTION
# =========================

def extract_pdf_text(path):
    doc = fitz.open(path)
    text = ""
    for page in doc:
        text += page.get_text()
    text = re.sub(r"\s+", " ", text).strip()
    return text

def split_pdf_semantically(text, max_words=120):
    words = text.split()
    segments = []
    start = 0
    while start < len(words):
        segments.append(" ".join(words[start:start+max_words]))
        start += max_words
    return segments

# =========================
# LOAD CHUNKS
# =========================

with open(CHUNK_JSON, "r", encoding="utf-8") as f:
    chunks = json.load(f)

chunk_map = defaultdict(list)
for ch in chunks:
    chunk_map[ch["visa_type"]].append(ch["text"])

print("Loaded unified chunk JSON.\n")

# =========================
# SEMANTIC COMPARISON
# =========================

def semantic_compare(pdf_segments, chunk_texts):
    pdf_emb = model.encode(pdf_segments, normalize_embeddings=True)
    chunk_emb = model.encode(chunk_texts, normalize_embeddings=True)

    sim_matrix = cosine_similarity(pdf_emb, chunk_emb)

    max_sims = sim_matrix.max(axis=1)

    avg_similarity = np.mean(max_sims)
    coverage = np.mean(max_sims >= SIM_THRESHOLD)

    return avg_similarity, coverage

# =========================
# RUN EVALUATION
# =========================

results = {}

for visa, pdf_name in PDF_FILES.items():
    print(f"Evaluating {visa}...")

    pdf_path = f"{PDF_DIR}\\{pdf_name}"
    pdf_text = extract_pdf_text(pdf_path)

    pdf_segments = split_pdf_semantically(pdf_text)
    chunk_texts = chunk_map[visa]

    avg_sim, coverage = semantic_compare(pdf_segments, chunk_texts)

    results[visa] = {
        "avg_similarity": avg_sim,
        "semantic_coverage": coverage
    }

# =========================
# FINAL REPORT
# =========================

print("\n==============================")
print("SEMANTIC PDF vs CHUNK REPORT")
print("==============================\n")

for visa, vals in results.items():
    print(f"{visa}:")
    print(f"  Semantic Similarity: {vals['avg_similarity']*100:.2f}%")
    print(f"  Semantic Coverage:   {vals['semantic_coverage']*100:.2f}%\n")

print("‚úÖ Semantic evaluation completed successfully.")


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Loading embedding model...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model loaded.

Loaded unified chunk JSON.

Evaluating Graduate...
Evaluating SkilledWorker...
Evaluating HealthCare...
Evaluating Student...
Evaluating Visitor...

SEMANTIC PDF vs CHUNK REPORT

Graduate:
  Semantic Similarity: 88.09%
  Semantic Coverage:   100.00%

SkilledWorker:
  Semantic Similarity: 87.85%
  Semantic Coverage:   100.00%

HealthCare:
  Semantic Similarity: 87.87%
  Semantic Coverage:   100.00%

Student:
  Semantic Similarity: 85.09%
  Semantic Coverage:   100.00%

Visitor:
  Semantic Similarity: 85.10%
  Semantic Coverage:   100.00%

‚úÖ Semantic evaluation completed successfully.
