# Extract Text from PDF

In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# (such as the local myrsidea code) are reloaded automatically when they change.
%load_ext autoreload
%autoreload 2

In [2]:
import re
import tempfile

import pytesseract
from ipyfilechooser import FileChooser
from IPython.display import display
from pdf2image import convert_from_path
from tqdm import tqdm
from traiter.util import clean_text

from myrsidea.pylib.const import TESS_CONFIG

## Choose a PDF file

In [3]:
# Interactive picker for the PDF to process, opened on the current directory.
# Later cells read `pdf_file.selected` and `pdf_file.selected_filename`, so a
# file has to be chosen here before they are run.
pdf_file = FileChooser('.')
pdf_file.use_dir_icons = True
display(pdf_file)

FileChooser(path='.', filename='', title='HTML(value='', layout=Layout(display='none'))', show_hidden='False',…

## Choose an output text file

In [4]:
# Picker for the output text file. The suggested file name is the selected
# PDF's file name with '.txt' appended; it stays empty if no PDF had been
# selected when this cell ran.
text_file = FileChooser('.')
name = pdf_file.selected_filename or ''
text_file.default_filename = name + '.txt' if name else ''
text_file.use_dir_icons = True
display(text_file)

FileChooser(path='.', filename='Price_louse.pdf.txt', title='HTML(value='', layout=Layout(display='none'))', s…

## Extract using pytesseract

Pytesseract only reads text from images, so we convert each page of the PDF to an image and run pytesseract on it.

### Convert PDF pages to images and OCR them

In [5]:
# OCR every page of the selected PDF and collect the cleaned text of each page,
# in page order, in `pages`.
pages = []

# Fail early with a clear message: without a selection `pdf_file.selected` is
# empty and the error would otherwise surface from inside pdf2image.
if not pdf_file.selected:
    raise ValueError('No PDF selected: choose a file in the PDF chooser above, then re-run this cell.')

# The page images are written to a temporary directory (`output_folder`), which
# is deleted when the `with` block exits, so the OCR loop stays inside it.
with tempfile.TemporaryDirectory() as temp_dir:
    images = convert_from_path(pdf_file.selected, output_folder=temp_dir)
    for image in tqdm(images):
        page = pytesseract.image_to_string(image, config=TESS_CONFIG)
        # NOTE(review): presumably the `replace` mapping turns the OCR artifact
        # '(*' into '(X' -- confirm against traiter.util.clean_text.
        page = clean_text(page, replace={'(*': '(X'})
        pages.append(page)

100%|██████████| 188/188 [12:02<00:00,  3.84s/it]


In [6]:
# Join the pages with a blank line between them, then unwrap hard line breaks:
# a single newline with non-whitespace on both sides becomes a space, while
# blank lines (paragraph and page breaks) are left alone.
paper = '\n\n'.join(pages)
# Lookarounds match only the newline itself, so runs of one-character lines
# ("a\nb\nc") are joined completely. With capturing groups the character after
# the newline was consumed by the match and the following break was skipped.
paper = re.sub(r'(?<=\S)\n(?=\S)', ' ', paper)

# NOTE(review): no explicit encoding is given, so the platform default is used
# here and again when the normalize step reads the file back; change both
# together if an explicit encoding is wanted.
with open(text_file.selected, 'w') as txt_file:
    txt_file.write(paper)

## Manually edit the text

Remove page headers, footers, and figure captions from the text file written above. For now this has to be done by hand in a text editor.

## Normalize the text

In [23]:
# Re-read the hand-edited text file, unwrap any remaining hard line breaks,
# collapse every run of blank lines to a single newline, and overwrite the
# file in place.
# NOTE: afterwards paragraphs are separated by single newlines, so running this
# cell a second time on its own output would join them all with spaces.
# NOTE(review): the file is opened with the platform default encoding for both
# the read and the write below.
with open(text_file.selected) as txt_file:
    paper = txt_file.read()

# Lookarounds match only the newline itself, so runs of one-character lines
# ("a\nb\nc") are joined completely. With capturing groups the character after
# the newline was consumed by the match and the following break was skipped.
paper = re.sub(r'(?<=\S)\n(?=\S)', ' ', paper)
paper = re.sub(r'\n\n+', '\n', paper)

with open(text_file.selected, 'w') as txt_file:
    txt_file.write(paper)