In [2]:
import os
from pathlib import Path
import sys
import fitz  # PyMuPDF
import pandas as pd

print("Python:", sys.version)
print("CWD (current working directory):", os.getcwd())


Python: 3.12.4 (tags/v3.12.4:8e8a4ba, Jun  6 2024, 19:30:16) [MSC v.1940 64 bit (AMD64)]
CWD (current working directory): c:\Users\SEC\OneDrive\Desktop\docinsight\notebooks


In [3]:
from pathlib import Path

cwd = Path().resolve()
print("Resolved cwd:", cwd)

# First assume cwd is the project root (DOCINSIGHT)
pdf_path = cwd / "data" / "raw" / "tables-charts.pdf"

# If that doesn't exist, assume cwd is notebooks/ and go one level up
if not pdf_path.exists():
    pdf_path = cwd.parent / "data" / "raw" / "tables-charts.pdf"

print("PDF path:", pdf_path)
print("Exists:", pdf_path.exists())


Resolved cwd: C:\Users\SEC\OneDrive\Desktop\docinsight\notebooks
PDF path: C:\Users\SEC\OneDrive\Desktop\docinsight\data\raw\tables-charts.pdf
Exists: True


In [4]:
doc = fitz.open(pdf_path)
print("Number of pages:", len(doc))

blocks_data = []

for page_index in range(len(doc)):
    page = doc[page_index]
    
    # get_text("blocks") → list of (x0, y0, x1, y1, text, block_no, block_type, ...)
    raw_blocks = page.get_text("blocks")
    
    for b_idx, block in enumerate(raw_blocks):
        x0, y0, x1, y1, text, *rest = block
        
        if text is None:
            continue
        
        cleaned = text.strip()
        if not cleaned:
            continue
        
        blocks_data.append({
            "page_number": page_index,
            "block_index": b_idx,
            "text": cleaned,
            "x0": x0,
            "y0": y0,
            "x1": x1,
            "y1": y1,
        })

doc.close()

len(blocks_data)


Number of pages: 14


498

In [5]:
df_blocks = pd.DataFrame(blocks_data)
df_blocks.head(10)


Unnamed: 0,page_number,block_index,text,x0,y0,x1,y1
0,0,0,75,542.809998,779.980042,552.77002,791.320007
1,0,1,"Tables, Charts and Graphs\nUNIT 5\nTABLES, CHA...",42.529999,51.722008,552.762024,73.583885
2,0,2,Structure,42.529999,89.59803,88.175613,103.206032
3,0,3,5.0\nObjectives,42.529999,112.780014,114.294998,124.120018
4,0,4,5.1\nIntroduction,42.529999,130.420029,121.851982,141.760025
5,0,5,"5.2\nThe Function of Tables, Charts and Graphs",42.529999,148.060043,244.85202,159.40004
6,0,6,5.3\nTables,42.529999,165.700058,97.014008,177.040054
7,0,7,5.4\nCharts and Graphs,42.529999,183.340073,145.844009,194.680069
8,0,8,5.4.1\nLine Graphs,70.849998,198.290985,147.768524,208.496994
9,0,9,5.4.2\nBar Charts,70.849991,211.970978,141.772705,222.176987


In [6]:
# How many pages and blocks do we have?
print("Total blocks:", len(df_blocks))
print("Pages:", df_blocks['page_number'].unique())


Total blocks: 498
Pages: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13]


In [7]:
# Look at the blocks of a specific page (e.g., page 0)
page_to_view = 0
df_blocks[df_blocks["page_number"] == page_to_view].sort_values(["y0", "x0"]).head(20)


Unnamed: 0,page_number,block_index,text,x0,y0,x1,y1
1,0,1,"Tables, Charts and Graphs\nUNIT 5\nTABLES, CHA...",42.529999,51.722008,552.762024,73.583885
2,0,2,Structure,42.529999,89.59803,88.175613,103.206032
3,0,3,5.0\nObjectives,42.529999,112.780014,114.294998,124.120018
4,0,4,5.1\nIntroduction,42.529999,130.420029,121.851982,141.760025
5,0,5,"5.2\nThe Function of Tables, Charts and Graphs",42.529999,148.060043,244.85202,159.40004
6,0,6,5.3\nTables,42.529999,165.700058,97.014008,177.040054
7,0,7,5.4\nCharts and Graphs,42.529999,183.340073,145.844009,194.680069
8,0,8,5.4.1\nLine Graphs,70.849998,198.290985,147.768524,208.496994
9,0,9,5.4.2\nBar Charts,70.849991,211.970978,141.772705,222.176987
10,0,10,5.4.3\nFlow-charts,70.849991,225.65097,146.330292,235.856979


In [8]:
from typing import List, Dict

def create_text_chunks(
    df_blocks: pd.DataFrame,
    max_chars: int = 600,
    join_with: str = "\n"
) -> pd.DataFrame:
    """
    Merge nearby text blocks on each page into larger chunks of up to max_chars.
    Returns a DataFrame with one row per chunk.
    """
    chunks: List[Dict] = []
    chunk_id = 0

    # Make sure blocks are sorted in reading order (top to bottom, left to right)
    df_sorted = df_blocks.sort_values(["page_number", "y0", "x0"]).reset_index(drop=True)

    for page_number, df_page in df_sorted.groupby("page_number"):
        current_text = ""
        start_block_idx = None
        last_block_idx = None

        for _, row in df_page.iterrows():
            block_text = row["text"]
            b_idx = row["block_index"]

            # If starting a new chunk
            if start_block_idx is None:
                start_block_idx = b_idx
                last_block_idx = b_idx
                current_text = block_text
                continue

            # If adding this block exceeds max_chars, save current chunk and start new
            if len(current_text) + 1 + len(block_text) > max_chars:
                chunks.append({
                    "chunk_id": chunk_id,
                    "page_number": page_number,
                    "start_block_index": start_block_idx,
                    "end_block_index": last_block_idx,
                    "text": current_text.strip(),
                    "char_len": len(current_text.strip()),
                })
                chunk_id += 1

                # start a new chunk with this block
                start_block_idx = b_idx
                last_block_idx = b_idx
                current_text = block_text
            else:
                # continue the same chunk
                current_text = current_text + join_with + block_text
                last_block_idx = b_idx

        # Flush last chunk on the page
        if start_block_idx is not None and current_text.strip():
            chunks.append({
                "chunk_id": chunk_id,
                "page_number": page_number,
                "start_block_index": start_block_idx,
                "end_block_index": last_block_idx,
                "text": current_text.strip(),
                "char_len": len(current_text.strip()),
            })
            chunk_id += 1

    df_chunks = pd.DataFrame(chunks)
    return df_chunks


In [9]:
df_chunks = create_text_chunks(df_blocks, max_chars=600)
print("Number of chunks:", len(df_chunks))
df_chunks.head(10)


Number of chunks: 65


Unnamed: 0,chunk_id,page_number,start_block_index,end_block_index,text,char_len
0,0,0,1,18,"Tables, Charts and Graphs\nUNIT 5\nTABLES, CHA...",594
1,1,0,19,22,"5.1\nINTRODUCTION\nWe have, in the various uni...",434
2,2,0,23,23,"a)\nWhen you decide to use tables, charts and ...",610
3,3,0,24,0,b)\nLet us now try and understand the function...,399
4,4,1,1,3,Writing Skills\n\nThese devices enable you to...,246
5,5,1,4,4,"A table is a collection of figures, facts or o...",403
6,6,1,5,8,\nThey can show a number of specific data is ...,592
7,7,1,9,22,Main Types of Viral Hepatitis\nViral hepatitis...,379
8,8,1,16,12,Virus is present in the faeces\nof infected pe...,595
9,9,1,24,26,"Male homosexuals, people\nwith multiple sexual...",552


In [10]:
chunk_to_view = 0

row = df_chunks.iloc[chunk_to_view]
print(f"Chunk ID: {row['chunk_id']}")
print(f"Page: {row['page_number']}")
print(f"Blocks: {row['start_block_index']} → {row['end_block_index']}")
print("--------- TEXT ---------")
print(row["text"])


Chunk ID: 0
Page: 0
Blocks: 1 → 18
--------- TEXT ---------
Tables, Charts and Graphs
UNIT 5
TABLES, CHARTS AND GRAPHS
Structure
5.0
Objectives
5.1
Introduction
5.2
The Function of Tables, Charts and Graphs
5.3
Tables
5.4
Charts and Graphs
5.4.1
Line Graphs
5.4.2
Bar Charts
5.4.3
Flow-charts
5.5
Let Us Sum Up
5.6
Key Words
5.7
Answers to Check Your Progress
5.0
OBJECTIVES
After going through this unit, you will be able to:

describe the function of tables, charts, and graphs;

interpret and analyse the data presented in the form of tables, charts and graphs; and

apply the knowledge of these devices to communicate information more effectively.


In [11]:
from pathlib import Path

processed_dir = Path("data") / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

output_path = processed_dir / "tables-charts_chunks.csv"
df_chunks.to_csv(output_path, index=False)

output_path, output_path.exists()


(WindowsPath('data/processed/tables-charts_chunks.csv'), True)