In [None]:
import os
import time
import re
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
import json
import jamspell
from spellchecker import SpellChecker
import boto3
from nltk.tokenize import word_tokenize
from src import PROJECT_ROOT

import nltk
nltk.download('punkt')

In [None]:
def get_image1(file_path):
    """Render the PDF stored at file_path as a list of PIL images, one per page.

    Pages are rasterised at 500 DPI.
    """
    # NOTE(review): thread_count=-1 is passed through unchanged -- confirm how
    # pdf2image treats non-positive values (it may not mean "all cores").
    page_images = convert_from_path(file_path, dpi=500, thread_count=-1)
    return page_images

def get_image2(bytes_object):
    """Render an in-memory PDF (raw bytes) as a list of PIL images, one per page.

    Pages are rasterised at 500 DPI and produced in TIFF format.
    """
    # NOTE(review): thread_count=-1 is passed through unchanged -- confirm how
    # pdf2image treats non-positive values (it may not mean "all cores").
    page_images = convert_from_bytes(
        bytes_object,
        dpi=500,
        fmt="tiff",
        thread_count=-1,
    )
    return page_images

def export_ocr(text, file, extract, out):
    """Write OCR output text to ``<out>/<file stem>_<extract>.txt``.

    Args:
        text: Text to write to disk.
        file: Path of the source document; only its base name without the
            extension is used to build the output file name.
        extract: Name of the extraction method, appended to the file name.
        out: Directory in which the text file is created.
    """
    filename = f'{os.path.splitext(os.path.basename(file))[0]}_{extract}.txt'
    # Explicit UTF-8 so accented characters are written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(os.path.join(out, filename), 'w', encoding='utf-8') as the_file:
        the_file.write(text)

def wrap_pagenum(page_text, page_num):
    """Enclose page_text in a <p> element whose n attribute is page_num."""
    opening_tag = f"<p n={page_num}>"
    closing_tag = "</p>"
    return opening_tag + page_text + closing_tag

def split_paragraphs(doc_text):
    r"""Collapse line breaks in extracted document text, keeping paragraph breaks.

    Every \x0c (page break character) is first replaced by \n. Then each of
    the following is replaced by a single space:

    * one or more \n that are directly preceded by a \n,
    * one or more whitespace characters (\s) directly preceded by a \n,
    * a \n that is not followed by another \n.

    As a result a lone line break becomes a space, while a run of two or more
    line breaks is reduced to one \n followed by a space.

    TODO: add a component to capture headers, footers, titles, and so on
    (like Omdena did).

    Args:
        doc_text: Text of the whole document.

    Returns:
        The text with line breaks collapsed as described above.
    """
    # Raw string: the previous non-raw literal contained "\s", an invalid
    # string escape that newer Python versions warn about.
    pattern = r"(?<=\n)\n+|(?<=\n)\s+|\n(?!\n)"
    return re.sub(pattern, " ", doc_text.replace("\x0c", "\n"))

def set_folder_tag(folder_number):
    """Build the folder tag "sv<folder_number>/" for the given folder number."""
    return f"sv{folder_number!s}/"

## Text extraction from pdfs

In [None]:
## Extraction from google drive folders
input_folder = os.path.join(PROJECT_ROOT, "tasks", "extract_text", "input")
output_folder = os.path.join(PROJECT_ROOT, "tasks", "extract_text", "output")

# Collect every PDF in the input folder. The extension is compared
# case-insensitively so files such as "scan.PDF" are not skipped, and the
# listing is sorted because os.listdir() returns entries in arbitrary order.
filepaths = [
    os.path.join(input_folder, file)
    for file in sorted(os.listdir(input_folder))
    if os.path.splitext(file)[1].lower() == ".pdf"
]
filepaths

In [None]:
# pytesseract extraction
for file in filepaths:
    pages = get_image1(file)  # one PIL image per PDF page
    text = ""  # initialize document text
    for pageNum, imgBlob in enumerate(pages):
        page_text = pytesseract.image_to_string(imgBlob, lang="spa")  # extract text
        text += wrap_pagenum(page_text, pageNum)  # wrap page number
    # split_paragraphs returns a new string, so its result must be kept;
    # otherwise the unprocessed text would be exported.
    text = split_paragraphs(text)  # split document text into paragraphs
    export_ocr(text, file, "pytesseract", output_folder)  # write extracted text to disk

## Spellchecking with pyspellchecker
This model classifies a word as misspelled if it is not contained in a Spanish vocabulary.

Then it uses a Levenshtein Distance algorithm to find permutations within an edit distance of 2 from the original word. It then compares all permutations (insertions, deletions, replacements, and transpositions) to known words in a word frequency list. Those words that are found more often in the frequency list are more likely the correct results.

In [None]:
# The text is Spanish, so use the Spanish tokenizer models instead of
# word_tokenize's default English ones.
words = word_tokenize(text, language="spanish")

In [None]:
# Spell checker backed by the Spanish word list
spell = SpellChecker(language='es')

# Tokens the checker does not know are treated as possible misspellings
misspelled = spell.unknown(words)

for word in misspelled:
    # corr: the single most likely replacement; cand: all likely replacements
    corr, cand = spell.correction(word), spell.candidates(word)
    report_line = f"Mispelled word: {word} - Possible candidates: {cand} - Correction: {corr}"
    print(report_line)

## Spellchecking using JamSpell

- **Training the model is required, as there is no pre-trained Spanish JamSpell model**. Instructions on how to train it are at https://github.com/bakwc/JamSpell#train.

In [None]:
# Example of how to use JamSpell
corrector = jamspell.TSpellCorrector()

# Check if model file exists
modelFile = 'en.bin'
if os.path.exists(modelFile):
    # Load from the same path that was just checked instead of repeating the literal
    corrector.LoadLangModel(modelFile)  # load the trained model

    # Results are printed explicitly: a bare expression nested inside an `if`
    # block is not displayed by the notebook.
    print(corrector.FixFragment('I am the begt spell cherken!'))
    # u'I am the best spell checker!'

    # Candidates for the word at index 3 ('begt')
    print(corrector.GetCandidates(['i', 'am', 'the', 'begt', 'spell', 'cherken'], 3))
    # (u'best', u'beat', u'belt', u'bet', u'bent', ... )

    # Candidates for the word at index 5 ('cherken')
    print(corrector.GetCandidates(['i', 'am', 'the', 'begt', 'spell', 'cherken'], 5))
    # (u'checker', u'chicken', u'checked', u'wherein', u'coherent', ...)
else:
    print("modelFile contains inexistent path!")

## Extraction for S3 Bucket folder

In [None]:
# Connection to AWS
# Credentials are read from environment variables; a missing variable yields None.
# NOTE(review): the access key id is read from a variable named "S3_BUCKET" --
# confirm that this is the intended variable name.
KEY = os.getenv("S3_BUCKET")
SECRET = os.getenv("SECRET_KEY")

s3 = boto3.resource(
    service_name='s3',
    region_name='us-east-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

In [None]:
folder = 1  # Number of the folder in the S3 bucket to process; there are 10, from 1 to 10
bucket_name = 'wri-latin-talent'
folder_tag = set_folder_tag(folder)  # e.g. "sv1/"
source_prefix = "full/" + folder_tag  # key prefix of the documents to process
i = 0  # number of documents processed so far
for obj in s3.Bucket(bucket_name).objects.all().filter(Prefix='full'):
    # Key with the folder prefix removed; it is empty for the folder entry itself
    relative_key = obj.key.replace(source_prefix, "")
    if folder_tag in obj.key and relative_key != "":  # To run only over the desired folder
        print(i, "**", obj.key)
        key = "text-extraction/" + relative_key + ".txt"  # destination key of the extracted text
        file = obj.get()['Body'].read()  # get the file contents from S3 as bytes
        pages = get_image2(file)
        text = ""  # initialize document text
        for pageNum, imgBlob in enumerate(pages):
            page_text = pytesseract.image_to_string(imgBlob, lang="spa")  # extract text
            text += wrap_pagenum(page_text, pageNum)  # wrap page number
        content = split_paragraphs(text)  # split document text into paragraphs
        # Upload the processed text under the "text-extraction/" prefix. The body is
        # encoded explicitly because put() documents bytes (or a file-like object).
        s3.Object(bucket_name, key).put(Body=content.encode("utf-8"))
        i += 1