In [1]:
# Show the result of every top-level expression in a cell,
# not only the last one (IPython's default is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Repo: https://github.com/atlanhq/camelot

**Comparison with other open-source libraries**

https://github.com/camelot-dev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools

### Basic Use

In [2]:
# Install Camelot (uncomment to run; %pip installs into the running kernel's environment)
# %pip install camelot-py

# Install OpenCV to address dependency issues
# %pip install opencv-python

In [3]:
# Camelot: library used throughout this notebook to extract tables from PDFs
import camelot

In [4]:
# Read the PDF file; the result is a collection of the tables Camelot detected
tables = camelot.read_pdf('foo.pdf')

In [5]:
# Number of tables detected in the PDF
tables.n

1

In [6]:
# Export all the tables at once; compress=True bundles the output into a ZIP file.
# f selects the output format: 'csv' here; 'json', 'excel' and 'html' are the alternatives.
tables.export('foo.csv', f='csv', compress=True)

In [7]:
# Export one specific table (here the first) to CSV.
# Sibling methods for the other formats: to_json, to_excel, to_html.
tables[0].to_csv('foo.csv')

### Other useful Methods

API Reference: https://camelot-py.readthedocs.io/en/master/api.html

In [8]:
# Number of tables extracted from foo.pdf
tables.n

1

In [9]:
# Shape of the first table as (rows, columns)
tables[0].shape

(7, 7)

In [10]:
# The first table as a pandas DataFrame
tables[0].df

Unnamed: 0,0,1,2,3,4,5,6
0,Cycle \nName,KI \n(1/km),Distance \n(mi),Percent Fuel Savings,,,
1,,,,Improved \nSpeed,Decreased \nAccel,Eliminate \nStops,Decreased \nIdle
2,2012_2,3.30,1.3,5.9%,9.5%,29.2%,17.4%
3,2145_1,0.68,11.2,2.4%,0.1%,9.5%,2.7%
4,4234_1,0.59,58.7,8.5%,1.3%,8.5%,3.3%
5,2032_2,0.17,57.8,21.7%,0.3%,2.7%,1.2%
6,4171_1,0.07,173.9,58.1%,1.6%,2.1%,0.5%


In [11]:
# Accuracy score for the first table; it also appears (rounded) in parsing_report
tables[0].accuracy

99.02369482929429

In [12]:
# Parsing report for the first table: a dict with accuracy, whitespace, order and page
tables[0].parsing_report

{'accuracy': 99.02, 'whitespace': 12.24, 'order': 1, 'page': 1}

In [13]:
# The table's number on its page
tables[0].order

1

In [14]:
# Number of the PDF page the table is on
tables[0].page

1

In [15]:
# Table contents as a 2-dimensional list of strings (one inner list per row)
tables[0].data

[['Cycle \nName',
  'KI \n(1/km)',
  'Distance \n(mi)',
  'Percent Fuel Savings',
  '',
  '',
  ''],
 ['',
  '',
  '',
  'Improved \nSpeed',
  'Decreased \nAccel',
  'Eliminate \nStops',
  'Decreased \nIdle'],
 ['2012_2', '3.30', '1.3', '5.9%', '9.5%', '29.2%', '17.4%'],
 ['2145_1', '0.68', '11.2', '2.4%', '0.1%', '9.5%', '2.7%'],
 ['4234_1', '0.59', '58.7', '8.5%', '1.3%', '8.5%', '3.3%'],
 ['2032_2', '0.17', '57.8', '21.7%', '0.3%', '2.7%', '1.2%'],
 ['4171_1', '0.07', '173.9', '58.1%', '1.6%', '2.1%', '0.5%']]

### Advanced Usage

Advanced usage guide: https://camelot-py.readthedocs.io/en/master/user/advanced.html

The names of the parsing methods in Camelot (i.e. Lattice and Stream) were inspired by Tabula. **Lattice** is used to parse tables that have demarcated lines between cells, while **Stream** is used to parse tables that use whitespace between cells to simulate a table structure.

The table lines in this PDF are drawn in the background, so Camelot needs to be told to process background lines (`process_background=True`).

In [21]:
# process_background=True asks Camelot to also process lines drawn in the
# background of the page when detecting tables.
tb1 = camelot.read_pdf(
    "https://camelot-py.readthedocs.io/en/master/_static/pdf/background_lines.pdf",
    process_background=True,
)

In [25]:
# Number of tables detected in the background-lines PDF
tb1.n

2

In [26]:
# The second detected table (index 1) as a pandas DataFrame
tb1[1].df

Unnamed: 0,0,1,2,3,4,5,6,7
0,State,Date,Halt \nstations,Halt \ndays,Persons \ndirectly \nreached\n(in lakh),Persons \ntrained,Persons \ncounseled,Persons \ntested\nfor HIV
1,Delhi,1.12.2009,8,17,1.29,3665,2409,1000
2,Rajasthan,2.12.2009 to \n19.12.2009,,,,,,
3,Gujarat,20.12.2009 to \n3.1.2010,6,13,6.03,3810,2317,1453
4,Maharashtra,4.01.2010 to \n1.2.2010,13,26,1.27,5680,9027,4153
5,Karnataka,2.2.2010 to \n22.2.2010,11,19,1.80,5741,3658,3183
6,Kerala,23.2.2010 to \n11.3.2010,9,17,1.42,3559,2173,855
7,Total,,47,92,11.81,22455,19584,10644
