In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from pdf2image import convert_from_path
from PIL import Image
from watchdog.events import PatternMatchingEventHandler
from watchdog.observers import Observer

import numpy as np
import os
import pandas as pd
import pytesseract
import re
import requests
import string
import sys
import time

# To-Do
1. (DONE) Create a bulk OCR and indexing process.
    - (DONE) Test if FOR loop is working. FOR loop only goes after pdf files within the same directory level. It doesn't drill to subdirectories.
    - (DEBT) Scenario test:
        - what if the OCR is taking too long? Can we batch it up and resume?
        - **(NEXT)** what if indexing is too long?
               - (DONE) Have to use watcher/watchdog or other tools to monitor the addition of files in the folder.
               While the python code is running: 
               - (DONE) add an additional pdf file into the folder and see if it's added.
               -  **modify the filename of the file in the folder, see if only the index is changed.**
                   - Attempt to update the "page" of multiple pages for the one doc.
                   - Use 'GET /test_index_pdf2txt_new/_search' to verify in kibana
               - delete the file, see if it removes the index.
               - IDEA: Consider having one document to one index. This makes managing indices simpler by design and less memory.
               
        - (DEBT) Add parallel CPUs for the CV bit of Tesseract.
    - (DEBT) Can Elasticsearch Analyzer or kibana Discovery show line breaks \n
        - Ans: the \n will go in but suspect that the Analyzer removes them
    - **(NEXT)** How to create a link in Elasticsearch to open the actual pdf file?
        - (NEXT) Use the attachment ingestion instead
        - Find a way to add the path and filename into the Kibana.
        - Find a way to make the path and filename clickable as a hyperref.
        
2. Data enrichment
    - How can user directly correct the index itself?
         - User can correct the corpus itself and annotate the corpus filename.
         - An if-else exception case has to be programmed for when the original PDF content is changed or Tesseract is upgraded.
         - If we break up the process into CV and corpus indexing, ideally there should be two monitors: one at the PDF level and one at the corpus level.
    - How do we tag that this version of the index has been annotated?
    
3. Find out the OCR performance difference for these two codes:
    
    a. <code>pages = convert_from_path(pdf_path, 500, first_page=0, last_page=num_pages, poppler_path=r"poppler-0.68.0_x86/poppler-0.68.0/bin")</code>
    
    <code>pages = convert_from_path(pdf_path, poppler_path=r"poppler-0.68.0_x86/poppler-0.68.0/bin")</code>
    
    b. Apache Tika
    
    c. FSCrawler

4. **(NEXT)** How to productionize this in an .exe file?
 - Build a front end link: https://medium.com/analytics-vidhya/building-a-basic-search-engine-using-elasticsearch-fscrawler-97104c1ea220

5. How to build an index pattern from REST without using Kibana?


Note: Below utilizes the code lessons from
http://brunorocha.org/python/watching-a-directory-for-file-changes-with-python.html

In [2]:
# Folder that is watched for PDFs. Set the PDF_WATCH_DIR environment variable
# to point the notebook at another folder; the fallback is the original
# development folder.
# os.path.join(..., "") guarantees a trailing separator, which the later
# os.path.dirname(path) call relies on to return the folder itself rather
# than its parent.
path = os.path.join(
    os.environ.get("PDF_WATCH_DIR")
    or r"C:\Users\Tian-Yan.Teh\Documents\1 - Projects\Commercial Text Analytics\20200922 code\PDF_Converter\Convert/",
    "",
)
print("Path", path)

Path C:\Users\Tian-Yan.Teh\Documents\1 - Projects\Commercial Text Analytics\20200922 code\PDF_Converter\Convert/


# Computer Vision (OCR)

In [3]:
#************************************************************
# Purpose: Converts one pdf into images, then into text by pages.
# Input: Path of one pdf file
# Output: A list (docs) of dictionaries, one per page, each containing
#         page and text (line breaks included)
#************************************************************
def pdf2txt(pdf_path):
    """OCR every page of a single PDF.

    The PDF is rendered to one image per page with ``convert_from_path`` and
    each image is passed to ``pytesseract.image_to_string``.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to convert.

    Returns
    -------
    list of dict
        One dictionary per page, in page order, with the keys:

        * ``'page'`` -- ``pdf_path`` without its extension, followed by
          ``'_Page_<n>'`` where ``n`` starts at 1.
        * ``'text'`` -- the text returned by Tesseract for that page.
    """
    pytesseract.pytesseract.tesseract_cmd = r'tesseract\tesseract.exe'

    images = convert_from_path(pdf_path, poppler_path=r"poppler-0.68.0_x86/poppler-0.68.0/bin")

    # splitext strips whatever extension is present instead of blindly
    # cutting the last four characters of the path.
    # TO-DO: Take only a filename instead of the fullpath
    page_prefix = os.path.splitext(pdf_path)[0]

    # docs = list of dictionary items, one per page.
    docs = []
    for page_number, image in enumerate(images, start=1):
        docs.append({
            'page': page_prefix + '_Page_' + str(page_number),
            'text': pytesseract.image_to_string(image),
        })

    print("END")
    return docs

In [4]:
#************************************************************
# Purpose: Bulk (batch) converts all pdfs in a folder into images, then into text by pages.
# Input: Path of pdf files
# Output: A dictionary (docs) containing page and text (line break \n included)
#************************************************************
# def pdf2txtbatch(path):
    
#     #files = List of all filenames.
#     files = os.listdir(path)
#     d = {} #why is it in an dictionary?
#     docs = []
    
#     #files_pdf = List of all filenames of PDFs files.
#     files_pdf = [f for f in files if f[-3:] == 'pdf'] 
    
#     pytesseract.pytesseract.tesseract_cmd = r'tesseract\tesseract.exe'
    
#     #files_pdf = List of all filenames of PDFs files.
#     #file_pdf = file names of each PDF.
#     for file_pdf in files_pdf:
#         #Render pdfs to images using Poppler
#         #images = list of document images/pages
#         images = convert_from_path(file_pdf, poppler_path = r"poppler-0.68.0_x86/poppler-0.68.0/bin")
#         count = 0
        
#         for image in images:
#             output = pytesseract.image_to_string(image)
            
#             d['page'] = file_pdf[:-4]+'_Page_'+str((count+1))
#             d['text'] = output
#             docs.append(d.copy()) #docs = list of dictionary items.
#             count += 1
            
#     print("END")
#     return docs

In [6]:
# es = Elasticsearch(['localhost:9200'])

# # TO-DO: To make this into a class.
# # Input: doc-ID, new file name.
# # Output: renamed method. 

# #************************************************************
# # client.update()
# #************************************************************
# # Purpose: Update an Elasticsearch Index
# # Link: https://kb.objectrocket.com/mongo-db/how-to-use-python-to-update-api-elasticsearch-documents-259
# # Input: ?
# # Output: ?
# #************************************************************

# source_to_update = {
#     # ""doc"" is essentially Elasticsearch's ""_source"" field
#     "doc" : {
#         "page" : r"C:\Users\Tian-Yan.Teh\Documents\1 - Projects\Commercial Text Analytics\20200922 code\PDF_Converter\Convert\Renamed_Page_1"
#     }
# }

# response = es.update(index='test_index_pdf2txt_new', doc_type='_doc', id='uq63TncBjQde-dUNKnS_', body=source_to_update)
# print ('response:', response)

In [7]:
# res = requests.get('http://localhost:9200/')
# print(res.content)

In [8]:
# xget = requests.get('http://localhost:9200/_cat/indices?v&pretty')
# print(xget.content)

In [9]:
# Have a use case for 1. modified, 2. created, 3. deleted, 4. moved
class MyHandler(PatternMatchingEventHandler):
    """Watchdog handler that OCRs newly created PDFs and indexes them.

    Every ``on_*`` callback delegates to :meth:`process`, which branches on
    ``event.event_type``:

    * ``'created'``  -- OCR the file with ``pdf2txt`` and bulk-index the pages.
    * ``'deleted'``  -- only logged for now (see TO-DO in the branch).
    * ``'moved'``    -- only logged for now (see TO-DO in the branch).
    * anything else (e.g. ``'modified'``) -- ignored.
    """

    patterns = ["*.pdf"]

    # Elasticsearch connection settings; override on a subclass or instance
    # to target another cluster or index.
    es_hosts = ['localhost:9200']
    index_name = "test_index"

    # Lazily created Elasticsearch client, shared by all events of a handler.
    _es = None

    def __init__(self):
        # Hand the patterns to the base class through its constructor rather
        # than relying on the class attribute alone, and skip directory
        # events: a folder whose name ends in ".pdf" cannot be OCR'd.
        super().__init__(patterns=self.patterns, ignore_directories=True)

    def _client(self):
        """Return the Elasticsearch client, creating it on first use."""
        if self._es is None:
            self._es = Elasticsearch(self.es_hosts)
        return self._es

    def process(self, event):
        """
        Handle one file-system event.

        event.event_type
            'modified' | 'created' | 'moved' | 'deleted'
        event.is_directory
            True | False
        event.src_path
            path/to/observed/file
        """

        # the file will be processed there
        print(event.src_path, event.event_type, type(event.event_type))  # print now only for debugging

        if event.event_type == 'created':

            # Pseudocode:  Pasting new pdfs -> perform OCR -> convert to corpus -> index it
            # write a file_path one-by-one and index one by one without deleting the index.
            docs = pdf2txt(event.src_path)

            ## elastic helper function to bulk index json
            # NOTE(review): raise_on_error=True lets a BulkIndexError propagate
            # out of the handler -- confirm that is the desired failure mode.
            bulk(self._client(), docs, index=self.index_name, doc_type='_doc', raise_on_error=True)
            print("Created")

        elif event.event_type == 'deleted':
            print("Deleted")
            # Deleting existing pdfs / cut and paste elsewhere -> remove index
            # TO-DO: How do I get it to only delete the doc and not the entire index.

            # Code: to delete the entire index.
            # es.indices.delete(index_name)

        elif event.event_type == 'moved':
            print("Renamed")
            # Renamed the pdf -> modify index [old name] -> rename index to [new name]

            # if event.event_type == 'moved': [old name]
            # Retrieve for list of ids with that name.
            # if event.event_type == 'modified': [new name]
            # Bulk update for all those names with this id.

            # source_to_update = {
            #     # "doc" is essentially Elasticsearch's "_source" field
            #     "doc" : {
            #         "page" : r"C:\Users\Tian-Yan.Teh\Documents\1 - Projects\Commercial Text Analytics\20200922 code\PDF_Converter\Convert\Renamed_Again_Page_1"
            #     }
            # }
            #
            # # TO-DO: Un-hardcode the id.
            # response = es.update(index='test_index', doc_type='_doc', id='uq63TncBjQde-dUNKnS_', body=source_to_update)
            # print ('response:', response)

        else:
            # 'modified' (or any other event type): nothing to do yet.
            # Do NOT call exit() here -- a 'modified' event is reported right
            # after a file is created, and exiting would end the watcher
            # after the first indexed PDF.
            return

    def on_created(self, event):  # Executed when a file or a directory is created
        self.process(event)

    def on_deleted(self, event):  # Executed when a file or directory is deleted.
        self.process(event)

    def on_modified(self, event):  # Executed when a file is modified or a directory renamed
        self.process(event)

    def on_moved(self, event):  # Executed when a file or directory is moved
        self.process(event)


In [None]:
#************************************************************
# Watch the folder and hand every PDF event to MyHandler
#************************************************************
# For use only when a .py file is created: take the folder from the command line.
#     if __name__ == '__main__':
#         args = sys.argv[1:]
#         print("args:", args)
#         observer.schedule(MyHandler(), path=args[0] if args else '.')
observer = Observer()

#************************************************************
# dirname
#************************************************************
#*     The following hardcoding is necessary only for testing in .ipynb
dirname = os.path.dirname(path)
#*     In a .py file use this instead (__file__ only works in a .py):
# dirname = os.path.dirname(os.path.abspath(__file__))
#************************************************************

print("Dirname", dirname)
observer.schedule(MyHandler(), dirname)  # Set recursive=True if we want to go to the subdirectories too
observer.start()

try:
    # Keep the cell alive while the observer thread is running. Checking
    # is_alive() lets the cell finish if the observer thread has died,
    # instead of sleeping forever with nothing watching the folder.
    while observer.is_alive():
        time.sleep(1)
except KeyboardInterrupt:
    # Interrupting the kernel is the normal way to stop watching.
    pass
finally:
    # Always release the observer, whatever ended the loop.
    observer.stop()
    observer.join()

Dirname C:\Users\Tian-Yan.Teh\Documents\1 - Projects\Commercial Text Analytics\20200922 code\PDF_Converter\Convert
C:\Users\Tian-Yan.Teh\Documents\1 - Projects\Commercial Text Analytics\20200922 code\PDF_Converter\Convert\28072020_MANUAL_PENGGUNA_(KIOSK_DAN_SISTEM_SMARTBOX)(1 min).pdf deleted <class 'str'>
Deleted
C:\Users\Tian-Yan.Teh\Documents\1 - Projects\Commercial Text Analytics\20200922 code\PDF_Converter\Convert\Shell Overview of Upstream Training (4 sec).pdf deleted <class 'str'>
Deleted
C:\Users\Tian-Yan.Teh\Documents\1 - Projects\Commercial Text Analytics\20200922 code\PDF_Converter\Convert\CA-dated 24.9.2003.pdf created <class 'str'>
END




Created
C:\Users\Tian-Yan.Teh\Documents\1 - Projects\Commercial Text Analytics\20200922 code\PDF_Converter\Convert\CA-dated 24.9.2003.pdf modified <class 'str'>


## Issue

Code: bulk(es, docs, index=index, doc_type='_doc', raise_on_error=True)
Error message: raise BulkIndexError("%i document(s) failed to index." % len(errors), errors)
elasticsearch.helpers.errors.BulkIndexError: ('52 document(s) failed to index.', [{'index': {'_index': 'test_index_pdf2txt', '_type': '_doc', '_id': 'Ba6tTXcBjQde-dUNTlsH', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': "failed to parse field [page] of type [long] in document with id 'Ba6tTXcBjQde-dUNTlsH'. Preview of field's value: 'C:\\Users\\Tian-Yan.Teh\\Documents\\1 - Projects\\Commercial Text Analytics\\20200922 code\\PDF_Converter\\Convert\\GSA dated 22.7.92_Page_1'", 'caused_by': {'type': 'illegal_argument_exception', 'reason': 'For input string: "C:\\Users\\Tian-Yan.Teh\\Documents\\1 - Projects\\Commercial Text Analytics\\20200922 code\\PDF_Converter\\Convert\\GSA dated 22.7.92_Page_1"'}}, 'data': {'page': 'C:\\Users\\Tian-Yan.Teh\\Documents\\1 - Projects\\Commercial Text Analytics\\20200922 code\\PDF_Converter\\Convert\\GSA dated 22.7.92_Page_1', 'text': 'GAS SALES AGREEMENT\n\nBETWEEN\n\nPETROLIAM NASIONAL BERHAD\n\nAND\n\nSARAWAK SHELL BERHAD\n\nFOR\n\nSUPPLY OF GAS TO LUTONG REFINERY\n\nwe\n\x0c'}}}, 

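Findings: The error message shows that the existing index `test_index_pdf2txt` had the `page` field mapped as `long`, so the string values (path + `_Page_N`) were rejected. Indexing into a different index avoided the mapping conflict. Not sure how this problem was solved for the original index in the end.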

## Reference

1. Watching a directory for file changes with Python
http://brunorocha.org/python/watching-a-directory-for-file-changes-with-python.html

# Appendix

## Unused code
## Single pdf converter and indexer

In [None]:
#************************************************************
# Purpose: Bulk converts all pdf in a folder into images, then into text by pages.
# Input: Path of the folder holding the pdf files
# Output: A list (docs) of dictionaries containing page and text (line breaks included)
#************************************************************
def pdf2txtbatch(folder_path):
    """OCR every PDF found directly inside ``folder_path``.

    Only files in the folder itself are considered; subdirectories are not
    searched. Files are processed in sorted name order.

    Parameters
    ----------
    folder_path : str
        Folder whose ``.pdf`` files (extension matched case-insensitively)
        are converted.

    Returns
    -------
    list of dict
        One dictionary per page of every PDF, with the keys:

        * ``'page'`` -- the file name without its extension, followed by
          ``'_Page_<n>'`` where ``n`` starts at 1 for each file.
        * ``'text'`` -- the text returned by Tesseract for that page.
    """
    pytesseract.pytesseract.tesseract_cmd = r'tesseract\tesseract.exe'

    # files_pdf = sorted list of the file names of all PDFs in the folder.
    # Comparing the real extension avoids picking up names that merely end
    # in the letters "pdf" and also accepts ".PDF".
    files_pdf = sorted(
        f for f in os.listdir(folder_path)
        if os.path.splitext(f)[1].lower() == '.pdf'
    )

    docs = []  # docs = list of dictionary items.
    for file_pdf in files_pdf:
        # Render pdfs to images using Poppler.
        # os.listdir returns bare file names, so the folder has to be put
        # back in front before the file can be opened.
        # images = list of document images/pages
        images = convert_from_path(
            os.path.join(folder_path, file_pdf),
            poppler_path=r"poppler-0.68.0_x86/poppler-0.68.0/bin",
        )

        page_prefix = os.path.splitext(file_pdf)[0]
        for page_number, image in enumerate(images, start=1):
            docs.append({
                'page': page_prefix + '_Page_' + str(page_number),
                'text': pytesseract.image_to_string(image),
            })

    print("END")
    return docs

In [None]:
# Sandbox experiment
pdf_path = "test.pdf"
# pdf2txt accepts only the PDF path; a second positional argument raises TypeError.
pdf2txt(pdf_path)

In [None]:
# Main

es = Elasticsearch(['localhost:9200'])
# docs = list with one {'page', 'text'} dictionary per OCR'd page of pdf_path
docs = pdf2txt(pdf_path)
index = "test_index_pdf2txt"

## good practice to delete an index if it already exists and you're overwriting
# The index name is passed by keyword so the calls also work with client
# versions that no longer accept it positionally.
if es.indices.exists(index=index):
    es.indices.delete(index=index)

## elastic helper function to bulk index json
bulk(es, docs, index=index, doc_type='_doc', raise_on_error=True)


# Indexing into Elasticsearch

In [None]:
# pdf2txtbatch requires the folder to scan; `path` is the PDF folder defined
# in the configuration cell near the top of the notebook.
docs = pdf2txtbatch(path)
index = "test_index_pdf2txtbatch"

##good practice to delete an index if it already exists and you're overwriting
# The index name is passed by keyword so the calls also work with client
# versions that no longer accept it positionally.
if es.indices.exists(index=index):
    es.indices.delete(index=index)

## elastic helper function to bulk index json
bulk(es, docs, index=index, doc_type='_doc', raise_on_error=True)