-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_processing_demo.py
91 lines (56 loc) · 2.19 KB
/
pdf_processing_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# pdf_processing_demo.py
# -*- coding: utf-8 -*-
# @Author: Sidharth Mishra
# @Date: 2017-03-06 17:14:07
# @Last Modified by: Sidharth Mishra
# @Last Modified time: 2017-04-01 19:14:37
__author__ = 'sidmishraw'
__email__ = 'sidharth.mishra@sjsu.edu'
'''
This is a demo for the PDF processing module.
'''
# Python standard library imports
from pprint import pprint
from json import dumps
from json import loads
from pdb import set_trace
# cs_267_project specific imports
from pdf_processing import extract_pages
from pdf_processing import extract_page_contents
from pdf_processing import get_pdf_contents
from pdf_processing import create_json_file
from pdf_processing import extract_words
from pdf_processing import build_pdf_json
from pdf_processing import cleanse_extracted_words
from pdf_processing import cleansed_pdf_json
# Some constants
from pdf_processing.pdf_processor import TEST_PDF
from pdf_processing.pdf_processor import TEST_PDF_2
def _write_json(path, data):
    """Serialize ``data`` to a JSON string and write it to ``path``.

    The file is opened in ``'w'`` mode, so an existing file at ``path`` is
    overwritten.

    :param path: destination file path.
    :param data: any object that ``json.dumps`` can serialize.
    :raises TypeError: if ``data`` is not JSON serializable.
    :raises OSError: if ``path`` cannot be opened for writing.
    """
    with open(path, 'w') as fp_open:
        fp_open.write(dumps(data))


if __name__ == '__main__':
    # Original author's note: the demo uses the pdf
    # `obscalculi_testing_pdf_conv.pdf` as its sample.
    # NOTE(review): presumably that is what TEST_PDF points at -- confirm in
    # pdf_processing.pdf_processor.
    print('Converting pdfs into JSONs and making your life simpler...')

    # Disabled step, kept for reference: creates the JSON file for the PDF
    # document's pages mapped to their contents.
    # create_json_file('phase1.json')

    # Extracts the text from PDF and converts to a JSON preserving the ordering
    # of the words as they were found from the PDF.
    # NOTE(review): extract_words() takes no arguments, so it presumably reads
    # state prepared by get_pdf_contents(); keep the two calls in this order.
    get_pdf_contents(TEST_PDF)
    ordered_words = extract_words()
    # The raw extraction is written out before cleansing, exactly as in the
    # original flow, so both stages can be inspected side by side.
    _write_json('phase3_1.json', ordered_words)
    cleansed_words = cleanse_extracted_words(ordered_words)
    _write_json('phase3.json', cleansed_words)

    # Extracts the text from the PDF and groups them together according to
    # their fonts, font-weight, size and behavior in the PDF document. Then
    # cleanses them so the PDF found order is not maintained but the grouped
    # order is maintained.
    # NOTE(review): build_pdf_json() likewise takes no arguments and presumably
    # depends on the preceding get_pdf_contents() call.
    get_pdf_contents(TEST_PDF_2)
    grouped_json = build_pdf_json()
    _write_json('phase3_2_1.json', grouped_json)
    cleansed_json = cleansed_pdf_json(grouped_json)
    _write_json('phase3_2.json', cleansed_json)