In [7]:
import pandas as pd
from pathlib import Path

cwd = Path().resolve()

chunks_path = cwd / "data" / "processed" / "tables-charts_chunks.csv"
if not chunks_path.exists():
    chunks_path = cwd.parent / "data" / "processed" / "tables-charts_chunks.csv"

print("Chunks path:", chunks_path)
print("Exists:", chunks_path.exists())

df_chunks = pd.read_csv(chunks_path)
print("Loaded text chunks:", df_chunks.shape)
df_chunks.head()


Chunks path: C:\Users\SEC\OneDrive\Desktop\docinsight\notebooks\data\processed\tables-charts_chunks.csv
Exists: True
Loaded text chunks: (65, 6)


Unnamed: 0,chunk_id,page_number,start_block_index,end_block_index,text,char_len
0,0,0,1,18,"Tables, Charts and Graphs\nUNIT 5\nTABLES, CHA...",594
1,1,0,19,22,"5.1\nINTRODUCTION\nWe have, in the various uni...",434
2,2,0,23,23,"a)\nWhen you decide to use tables, charts and ...",610
3,3,0,24,0,b)\nLet us now try and understand the function...,399
4,4,1,1,3,Writing Skills\n\nThese devices enable you to...,246


In [1]:
import camelot
from pathlib import Path

# Determine pdf path same as before:
cwd = Path().resolve()
pdf_path = cwd / "data" / "raw" / "tables-charts.pdf"
if not pdf_path.exists():
    pdf_path = cwd.parent / "data" / "raw" / "tables-charts.pdf"

print("PDF path:", pdf_path)
print("Exists:", pdf_path.exists())


PDF path: C:\Users\SEC\OneDrive\Desktop\docinsight\data\raw\tables-charts.pdf
Exists: True


In [2]:
tables = camelot.read_pdf(str(pdf_path), pages="all")
print("Tables detected:", tables.n)


Tables detected: 8


In [3]:
for i in range(tables.n):
    print("==== TABLE", i, "====")
    display(tables[i].df.head())


==== TABLE 0 ====


Unnamed: 0,0
0,Viral hepatitis type B\nViral hepatitis type A...
1,These viruses are present in\nVirus is present...
2,In some parts of Africa and\nWorldwide. In som...
3,"Groups at particular risks\nMale homosexuals, ..."
4,A few weeks to several\nIncubation period\nThe...


==== TABLE 1 ====


Unnamed: 0,0
0,Viral hepatitis type B\nViral hepatitis type A...
1,Screening of blood donors and\nRecommended for...


==== TABLE 2 ====


Unnamed: 0,0,1,2,3,4,5,6
0,Age range,Body\nWt. Kg.,Energy\nKcal,MJ,Protein\ng,Calcium\nmg,Iron\nmg
1,Boys and\nGirls\n0 up to 1 year\n2 up to 4 yea...,7.3\n13.5\n20.5\n31.9\n45.5\n61.0\n33.0\n48.6\...,800\n1400\n1800\n2500\n2800\n3000\n2300\n2300\...,3.3\n5.9\n7.5\n10.5\n11.7\n12.6\n9.6\n9.6\n9.6,20\n35\n45\n63\n70\n75\n58\n58\n58,600\n500\n500\n700\n700\n600\n700\n700\n600,6\n7\n8\n13\n14\n15\n13\n14\n15


==== TABLE 3 ====


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Non\nsmokers,Smokers of,,,,,,,,
1,,Cigars\nonly,Pipes\n only,"Cigarettes &\npipes, cigars",,,Cigarettes\nonly,,,
2,13\n10,13,39 39,98\n67,,,134\n127,,,


==== TABLE 4 ====


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,100,160\n123\n123\n123\n109\n123\n123\n123\n123\n1...,,,123\n206\n123\n123\n154\n123\n123\n123\n123\n1...,,123\n220\n123\n123\n175\n123\n123\n123\n123\n1...,,,,
1,Non\nsmokers,Stopped\nsmoking\nmore than\n10 years,,,Stopped\nsmoking\n1-10 years,,Still\nsmoking,,,,


==== TABLE 5 ====


Unnamed: 0,0,1,2,3
0,,,,
1,Non\nsmokers,1-4,15 or more,Ex\nsmokers
2,,Smokers: No. of\ncigarettes a day,,


==== TABLE 6 ====


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,123\n123\n‘Temperature rise in the ‘greenhouse...,,,,,,,,,,...,,,,,,,,,,


==== TABLE 7 ====


Unnamed: 0,0,1,2
0,,6\nanaerobic\ndigestion\nby microbes,
1,,,


In [4]:
import pandas as pd

table_chunks = []

for i, table in enumerate(tables):
    df = table.df
    
    # Convert DataFrame to markdown-like text
    table_text = df.to_string(index=False)
    
    table_chunks.append({
        "chunk_id": f"table_{i}",
        "chunk_type": "table",
        "text": table_text,
        "page_number": table.parsing_report["page"],  # Camelot stores page index
        "char_len": len(table_text)
    })


In [5]:
df_table_chunks = pd.DataFrame(table_chunks)
df_table_chunks.head()


Unnamed: 0,chunk_id,chunk_type,text,page_number,char_len
0,table_0,table,...,2,4206
1,table_1,table,...,3,1115
2,table_2,table,...,4,1355
3,table_3,table,0 1 2 ...,8,375
4,table_4,table,0 ...,8,812


In [8]:
df_text = df_chunks.copy()
df_text["chunk_type"] = "text"
df_text["chunk_id"] = df_text["chunk_id"].astype(str)

df_master = pd.concat([df_text, df_table_chunks], ignore_index=True)
print(df_master.shape)
df_master.head()


(73, 7)


Unnamed: 0,chunk_id,page_number,start_block_index,end_block_index,text,char_len,chunk_type
0,0,0,1.0,18.0,"Tables, Charts and Graphs\nUNIT 5\nTABLES, CHA...",594,text
1,1,0,19.0,22.0,"5.1\nINTRODUCTION\nWe have, in the various uni...",434,text
2,2,0,23.0,23.0,"a)\nWhen you decide to use tables, charts and ...",610,text
3,3,0,24.0,0.0,b)\nLet us now try and understand the function...,399,text
4,4,1,1.0,3.0,Writing Skills\n\nThese devices enable you to...,246,text


In [9]:
master_path = cwd / "data" / "processed" / "tables-charts_master_chunks.csv"
df_master.to_csv(master_path, index=False)
master_path


WindowsPath('C:/Users/SEC/OneDrive/Desktop/docinsight/notebooks/data/processed/tables-charts_master_chunks.csv')