# Extract Documents

Extract the text from the PDFs and cache it in a pickle file (`texts.pkl`), then parse the text with spaCy and save the parsed documents to disk.


In [None]:
import spacy
import textract
from tqdm.notebook import tqdm
from collections import Counter, defaultdict
import pickle
from pathlib import Path
import random
# Evaluate the installed spaCy version; as the last expression of a notebook
# cell its value is shown as the cell output.
spacy.__version__

In [None]:
from spacy.tokens import DocBin, Doc

# Register a custom `doc._.pdf_path` attribute used to remember which PDF a
# parsed document came from. spaCy raises ValueError when an extension is
# registered twice without force=True, so guard the call to keep this cell
# safe to re-run in the same kernel.
if not Doc.has_extension("pdf_path"):
    Doc.set_extension("pdf_path", default=None)


In [None]:
from pathlib import Path

# Directory tree that holds the source PDFs.
root = '/Users/eric/proj/data-projects/text-classification/source-data/homelessness-contracts/pdfs'

# Recursively collect every PDF below `root`. glob() yields entries in an
# arbitrary, filesystem-dependent order, so sort them to make the
# de-duplication below (and the processing order) reproducible.
files = sorted(Path(root).glob('**/*.pdf'))

# Key by bare file name so a PDF that appears in several sub-directories is
# kept only once; with sorted input the last path in sort order wins.
d = {e.name: e for e in files}

pdfs = list(d.values())
len(pdfs)

In [None]:

# Cache file for the raw text extracted from every PDF.
pth = Path('texts.pkl')

# Extraction is skipped entirely when the cache file is already on disk.
if not pth.exists():
    texts = {}        # pdf path -> value returned by textract.process
    open_errors = []  # (pdf path, exception) for each failed extraction

    for pdf_path in tqdm(pdfs):
        try:
            extracted = textract.process(pdf_path)
        except TypeError as err:
            # Record the failure and move on to the next PDF; any other
            # exception type is not handled here and propagates.
            open_errors.append((pdf_path, err))
        else:
            texts[pdf_path] = extracted

    # Report how many PDFs were extracted and how many failed.
    print(len(texts), len(open_errors))

    with pth.open('wb') as f:
        pickle.dump(texts, f)


        

In [None]:
model = "en_core_web_lg"

# Cache locations: extracted text, parsed docs (written by the DocBin cell)
# and the spaCy pipeline saved after parsing.
text_path  = Path('texts.pkl')
docs_path  = Path(f'docs-{model}.spacy')
nlp_path = Path(f'nlp-{model}.spacy')


nlp = spacy.load(model)
# Raise spaCy's limit on the number of characters accepted in a single text.
nlp.max_length = 3000000

# Defined on both branches so later cells can always inspect them.
nlp_errors = []    # (pdf path, ValueError) raised while parsing
other_errors = []  # (pdf path, any other exception) raised while parsing

# Only a complete cache (saved pipeline AND saved docs) counts as a hit;
# otherwise `docs` could not be restored and must be rebuilt from the texts.
if not (nlp_path.exists() and docs_path.exists()):

    # Read the pickle through this cell's own `text_path` rather than a
    # variable left behind by another cell.
    # NOTE: unpickling runs arbitrary code; only load a texts.pkl that this
    # notebook produced.
    with text_path.open('rb') as f:
        texts = pickle.load(f)

    docs = {}
    for pdf_path, text in tqdm(list(texts.items())):

        try:
            doc = nlp(text.decode('utf-8'))
            # Remember the source file on the doc itself.
            doc._.pdf_path = str(pdf_path)
            docs[pdf_path] = doc
        except ValueError as e:
            nlp_errors.append((pdf_path, e))
        except Exception as e:
            # Best effort: record the failure and keep processing the rest.
            other_errors.append((pdf_path, e))

    nlp.to_disk(nlp_path)
    print(len(docs), len(nlp_errors), len(other_errors))
else:
    nlp = nlp.from_disk(nlp_path)

    # Restore `docs` from the saved DocBin so the following cell works in a
    # fresh kernel. store_user_data=True is required on the reader as well,
    # otherwise the custom `pdf_path` extension value is not restored.
    cached_bin = DocBin(store_user_data=True).from_disk(docs_path)
    docs = {Path(doc._.pdf_path): doc for doc in cached_bin.get_docs(nlp.vocab)}



In [None]:
# Keep only the values of `docs` that are real Doc objects, bundle them into
# one DocBin (user data included) and write it to `docs_path`.
parsed_docs = [candidate for candidate in docs.values() if isinstance(candidate, Doc)]
doc_bin = DocBin(docs=parsed_docs, store_user_data=True)
doc_bin.to_disk(docs_path)
    