In [2]:
# import libraries
import pandas as pd
import camelot

## Get tables from pdf file


In [3]:
url_file = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"


def get_table_from_pdf(
    path_file: str = url_file, method="lattice"):
    # Get the tables in the PDF file
    if path_file.endswith(".pdf"):
        try:
            tables = camelot.read_pdf(filepath=path_file, pages="all", flavor=method)
            print(f"{path_file} successfully loaded!")
            if len(tables) > 0:
                print(f"{len(tables)} tables extracted")
                return tables
            else:
                raise Exception("No table extracted!")
        except Exception as e:
            print(e)
            print("Try change -method- parameter to 'stream'!")
            # tables = camelot.read_pdf(filepath=url_file, pages="all", flavor="lattice")
    else:
        raise Exception("File is not a PDF")


In [4]:
# set path to pdf file
pdf_file = "files/DRI_Micronutriments.pdf"

tables = get_table_from_pdf(method="stream")


https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf successfully loaded!
1 tables extracted


## Insight Extracted Tables


In [4]:
def display_tables_info(tables) -> pd.DataFrame:
    assert len(tables) > 0, "No tables Found!"
    table_infos = [
        table.parsing_report | {"n_rows": table.shape[0], "n_cols": table.shape[1]}
        for table in tables
    ]
    return pd.DataFrame(table_infos)


display(display_tables_info(tables))


Unnamed: 0,accuracy,whitespace,order,page,n_rows,n_cols
0,99.73,31.02,1,1,34,22
1,99.8,21.08,1,2,31,15
2,100.0,0.0,1,3,10,1
3,99.43,23.67,1,4,33,16
4,99.5,23.9,1,5,34,8
5,89.76,33.33,1,6,9,3
6,99.43,55.11,1,7,55,16
7,100.0,0.0,1,8,6,1
8,99.73,38.47,1,9,41,22
9,100.0,0.0,1,10,8,1


## View a table as Data Frame


In [6]:
# print the first table as Pandas DataFrame
df_table = tables[0].df
display(df_table)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,F,"ood and Nutrition Board, National Academies of...",,,,,,,,,...,,,,,,,,,,
1,L,,,,V,V,,V,T,Ribo-,...,F,Vit,,I,I,Magnes-,Molyb-,Phos-,Sele-,
2,ife-Stage,,C,P,it A,it C,V,it E,hiamin,flavin,...,olate,B12,Copper,odine,ron,ium,denum,phorus,nium,Zinc
3,,Calcium,HO,rotein,,,it D,,,,...,,,,,,,,,,
4,Group,(mg/d),(g/d),(g/kg/d),(g/d)a,(mg/d),(g/d),(mg/d)b,(mg/d),(mg/d),...,(g/d)d,(g/d),(g/d),(g/d),(mg/d),(mg/d),(g/d),(mg/d),(g/d),(mg/d)
5,Infants,,,,,,,,,,...,,,,,,,,,,
6,0–6 mo,,,,,,,,,,...,,,,,,,,,,
7,7–12 mo,,,1.0,,,,,,,...,,,,,6.9,,,,,2.5
8,Children,,,,,,,,,,...,,,,,,,,,,
9,1–3 y,500,100,0.87,210,13,10,5,0.4,0.4,...,120,0.7,260,65,3.0,65,13,380,17,2.5


## Export tables in csv format into zip file


In [5]:
tables.export(
    "files/data.csv", f="csv", compress=True
)  # json, excel, html, markdown, sqlite
