In [1]:
import pdfplumber
import uuid

def parse_pdf(file_path):
    """Split a PDF into paragraph-level chunks.

    Every page is read with ``page.extract_text()``; the text is split on
    blank lines (``'\\n\\n'``) and each non-empty, stripped piece becomes one
    chunk.

    Args:
        file_path: Location of the PDF. Passed to ``pdfplumber.open`` and
            stored unchanged in each chunk's ``source_doc``.

    Returns:
        list[dict]: One dict per paragraph with the keys ``chunk_id`` (random
        UUID4 string), ``chunk_text``, ``source_doc``, ``page_no`` (1-based)
        and ``clause_id`` (``"<page_no>-<paragraph position on the page>"``).
    """
    chunks = []
    with pdfplumber.open(file_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                # No extractable text on this page.
                continue
            # Number paragraphs by position, not by value lookup, so that a
            # paragraph repeated on the same page still gets its own
            # clause_id (and the loop stays linear).
            for para_index, para in enumerate(text.split('\n\n')):
                clean_para = para.strip()
                if not clean_para:
                    continue
                chunks.append({
                    "chunk_id": str(uuid.uuid4()),
                    "chunk_text": clean_para,
                    "source_doc": file_path,
                    "page_no": page_number,
                    "clause_id": f"{page_number}-{para_index}"
                })
    return chunks


In [2]:
from docx import Document

def parse_docx(file_path):
    """Turn each non-blank paragraph of a Word document into a chunk dict.

    Args:
        file_path: Location of the ``.docx`` file. Passed to ``Document`` and
            stored unchanged in each chunk's ``source_doc``.

    Returns:
        list[dict]: One dict per non-empty paragraph with the keys
        ``chunk_id`` (random UUID4 string), ``chunk_text`` (stripped text),
        ``source_doc``, ``page_no`` (always ``None``) and ``clause_id``
        (``"para-<index>"``, where the index counts blank paragraphs too).
    """
    document = Document(file_path)
    stripped_texts = (paragraph.text.strip() for paragraph in document.paragraphs)
    return [
        {
            "chunk_id": str(uuid.uuid4()),
            "chunk_text": stripped,
            "source_doc": file_path,
            # No page number is read for Word paragraphs.
            "page_no": None,
            "clause_id": f"para-{position}",
        }
        for position, stripped in enumerate(stripped_texts)
        if stripped
    ]


ModuleNotFoundError: No module named 'docx'

In [3]:
!pip install python-docx

Collecting python-docx
  Obtaining dependency information for python-docx from https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl.metadata
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting lxml>=3.1.0 (from python-docx)
  Obtaining dependency information for lxml>=3.1.0 from https://files.pythonhosted.org/packages/0b/1e/cc32034b40ad6af80b6fd9b66301fc0f180f300002e5c3eb5a6110a93317/lxml-6.0.0-cp312-cp312-win_amd64.whl.metadata
  Downloading lxml-6.0.0-cp312-cp312-win_amd64.whl.metadata (6.8 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
   ---------------------------------------- 0.0/253.0 kB ? eta -:--:--
   ---------------------------------------- 0.0/253.0 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/253.0 kB 660.6 kB/s eta 0:00:01
   ---- ---------------------------------- 30.7/253.0 kB 660.6 kB/s eta 0:00:01
   ---- -----------------


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from docx import Document

In [5]:
from docx import Document

def parse_docx(file_path):
    """Build one chunk dict for every non-empty paragraph of a ``.docx`` file.

    Args:
        file_path: Location of the Word document. Handed to ``Document`` and
            copied as-is into each chunk's ``source_doc``.

    Returns:
        list[dict]: Chunks in document order. Each has ``chunk_id`` (random
        UUID4 string), ``chunk_text`` (paragraph text with surrounding
        whitespace removed), ``source_doc``, ``page_no`` (always ``None``)
        and ``clause_id`` (``"para-<index>"``; the index is the paragraph's
        position among all paragraphs, blank ones included).
    """
    results = []
    all_paragraphs = Document(file_path).paragraphs

    for index, paragraph in enumerate(all_paragraphs):
        body = paragraph.text.strip()
        if not body:
            # Whitespace-only paragraphs produce no chunk.
            continue
        record = {
            "chunk_id": str(uuid.uuid4()),
            "chunk_text": body,
            "source_doc": file_path,
            "page_no": None,  # no page number is read for Word paragraphs
            "clause_id": f"para-{index}",
        }
        results.append(record)

    return results


In [6]:
import email

def _decode_text_part(part):
    """Return the decoded text of one message part, using its declared charset."""
    payload = part.get_payload(decode=True)
    if payload is None:
        # get_payload(decode=True) yields None for container parts.
        return ""
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="ignore")
    except LookupError:
        # Unknown or misspelled charset label: fall back to UTF-8.
        return payload.decode("utf-8", errors="ignore")


def parse_email(file_path):
    """Split the plain-text body of an ``.eml`` file into paragraph chunks.

    For multipart messages the ``text/plain`` parts are concatenated in walk
    order; for single-part messages the payload is used whatever its content
    type. The body is then split on blank lines.

    Args:
        file_path: Location of the message file. Stored unchanged in each
            chunk's ``source_doc``.

    Returns:
        list[dict]: One dict per non-empty paragraph with the keys
        ``chunk_id`` (random UUID4 string), ``chunk_text``, ``source_doc``,
        ``page_no`` (always ``None``) and ``clause_id``
        (``"email-<index>"``, where the index counts blank paragraphs too).
    """
    chunks = []
    # Read bytes, not text: a text-mode open would decode the whole file with
    # the platform default encoding before the message's own charset is known.
    with open(file_path, 'rb') as f:
        msg = email.message_from_binary_file(f)

    if msg.is_multipart():
        body = "".join(
            _decode_text_part(part)
            for part in msg.walk()
            if part.get_content_type() == 'text/plain'
        )
    else:
        body = _decode_text_part(msg)

    # Mail bodies commonly use CRLF; normalise so '\n\n' finds every blank line.
    body = body.replace('\r\n', '\n').replace('\r', '\n')

    for i, para in enumerate(body.split('\n\n')):
        clean_text = para.strip()
        if clean_text:
            chunks.append({
                "chunk_id": str(uuid.uuid4()),
                "chunk_text": clean_text,
                "source_doc": file_path,
                "page_no": None,
                "clause_id": f"email-{i}"
            })

    return chunks


In [7]:
def parse_document(file_path):
    """Dispatch a file to the parser that matches its extension.

    The extension check is case-insensitive, so ``REPORT.PDF`` is handled the
    same way as ``report.pdf``.

    Args:
        file_path: Path string ending in ``.pdf``, ``.docx`` or ``.eml``. It
            is passed to the chosen parser unchanged.

    Returns:
        Whatever the selected parser (``parse_pdf``, ``parse_docx`` or
        ``parse_email``) returns.

    Raises:
        ValueError: If the extension is none of the supported ones.
    """
    # Lower-case only for matching; the parser receives the original path.
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        return parse_pdf(file_path)
    if lowered.endswith('.docx'):
        return parse_docx(file_path)
    if lowered.endswith('.eml'):
        return parse_email(file_path)
    raise ValueError("Unsupported file type")


In [8]:
import os
import json

def save_chunks_to_json(chunks, doc_name, out_dir="../chunks"):
    """Serialise ``chunks`` to ``<out_dir>/<doc_name>_chunks.json``.

    The output directory is created when missing. The file is written as
    UTF-8 with a two-space indent and non-ASCII characters kept verbatim,
    then a one-line confirmation is printed.

    Args:
        chunks: JSON-serialisable collection to write; its length is reported
            in the confirmation message.
        doc_name: Base name used to build the output file name.
        out_dir: Directory that receives the file.

    Returns:
        None.
    """
    # exist_ok makes repeated calls with the same directory harmless.
    os.makedirs(out_dir, exist_ok=True)

    target_name = f"{doc_name}_chunks.json"
    out_path = os.path.join(out_dir, target_name)

    with open(out_path, mode='w', encoding='utf-8') as handle:
        json.dump(chunks, handle, ensure_ascii=False, indent=2)

    chunk_count = len(chunks)
    print(f"✅ Saved {chunk_count} chunks to {out_path}")


In [9]:
import os
from pathlib import Path

# Adjust path to your data folder relative to the notebook
data_folder = Path("../data")  # or "./data" if you're running notebook from project root
# glob("*") also yields sub-directories and returns entries in an OS-dependent
# order, so keep regular files only and sort for reproducible runs.
files = sorted(entry for entry in data_folder.glob("*") if entry.is_file())

print(f"📄 Found {len(files)} files in {data_folder}")

for file_path in files:
    print(f"\n🔍 Parsing: {file_path.name}")

    try:
        chunks = parse_document(str(file_path))  # parse based on file type
        # NOTE(review): two inputs with the same stem (e.g. a.pdf and a.docx)
        # write to the same <stem>_chunks.json, so the later one overwrites.
        doc_name = file_path.stem  # get filename without extension
        save_chunks_to_json(chunks, doc_name=doc_name)
    except Exception as e:
        # Best-effort batch: report the failure and continue with the next file.
        print(f"❌ Failed to parse {file_path.name}: {e}")


📄 Found 5 files in ..\data

🔍 Parsing: dataset1.pdf
✅ Saved 49 chunks to ../chunks\dataset1_chunks.json

🔍 Parsing: dataset2.pdf
✅ Saved 101 chunks to ../chunks\dataset2_chunks.json

🔍 Parsing: dataset3.pdf
✅ Saved 2 chunks to ../chunks\dataset3_chunks.json

🔍 Parsing: dataset4.pdf
✅ Saved 39 chunks to ../chunks\dataset4_chunks.json

🔍 Parsing: dataset5.pdf
✅ Saved 31 chunks to ../chunks\dataset5_chunks.json
