# Modules for PDF Scraping

In [1]:
#PDF Miner (PDF Scraping)
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import HTMLConverter, TextConverter, XMLConverter
from io import StringIO
from pdfminer.pdfpage import PDFPage


#PyPDF2 (PDF Scraping)
import PyPDF2 as pypdf


#OCR
from pdf2image import convert_from_path
import cv2
import pytesseract
pytesseract.pytesseract.tesseract_cmd=r'Tesseract-OCR\tesseract.exe'


#NLP Preprocessing
import nltk
import re
import regex
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


#NLP Processing TensorFlow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

#NLP Processing Spacy
import spacy
import spacy_transformers
spc = spacy.load('en_core_web_trf')
# spc = spacy.load(en_core_web_lg)
# spc.add_pipe('sentencizer')

#Model Training and Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

#Basic
import numpy as np
import pandas as pd
import pickle
import glob
import os
import docker
import string

# ASCII punctuation characters (same value as string.punctuation).
# preprocessing_2 interpolates this into a regex character class, so the
# backslash before ']' is part of the value and is written as an explicit '\\'.
pc = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'


# Functions for PDF Text Scraping

## PDF-Miner (Unscanned PDF)

In [9]:
def get_pdf_file_content_Text(path_to_pdf):
    '''
    Extract the text of a (non-scanned) PDF file with pdfminer.

    path_to_pdf: path of the PDF file whose content should be extracted.

    Returns the text of all pages as one string. Errors raised while opening
    or parsing the file propagate to the caller; the file, the converter and
    the text buffer are released in every case.
    '''
    # Stores shared resources (such as fonts or images) met while parsing.
    resource_manager = PDFResourceManager(caching=True)

    # In-memory buffer that receives the text representation of the PDF.
    out_text = StringIO()

    # Layout parameters; only line_overlap and detect_vertical are overridden.
    laParams = LAParams(line_overlap=0.3, detect_vertical=True)

    # The converter writes the text of each processed page into out_text.
    text_converter = TextConverter(resource_manager, out_text, laparams=laParams)
    interpreter = PDFPageInterpreter(resource_manager, text_converter)

    def _process_all_pages(fp, password):
        # Feed every page of the document to the interpreter.
        for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password=password, caching=True, check_extractable=True):
            interpreter.process_page(page)

    try:
        with open(path_to_pdf, 'rb') as fp:
            try:
                _process_all_pages(fp, "")
            except Exception:
                # Second attempt with a bytes password (presumably for pdfminer
                # versions that expect bytes -- TODO confirm). Start again from
                # a clean state so text written by the failed attempt is not
                # duplicated in the result.
                fp.seek(0)
                out_text.seek(0)
                out_text.truncate(0)
                _process_all_pages(fp, b"")

        # The content must be read before the buffer is closed.
        return out_text.getvalue()
    finally:
        text_converter.close()
        out_text.close()

## Group PDF Scraping for normal method

In [8]:
def group_read_pdfminer(folder_path):
    '''
    Extract the text of every '*.pdf' file in folder_path with pdfminer.

    folder_path: folder that is searched (non-recursively) for PDF files.

    Returns a dict whose keys are '<name>_<index>' (the trailing word
    characters of the file name plus the position of the file in the sorted
    listing) and whose values are the lower-cased extracted text.
    Files that cannot be processed are reported and skipped.
    '''
    all_extracted_text = {}
    # Sorted so that the index part of each key is reproducible between runs.
    for i, filename in enumerate(sorted(glob.glob(os.path.join(folder_path, '*.pdf')))):
        try:
            key = re.search(r'(\w+)\.pdf$', filename).group(1) + '_' + str(i)
            all_extracted_text[key] = get_pdf_file_content_Text(filename).lower()
        except Exception as error:
            # Best effort: one broken file must not abort the whole batch,
            # but the failure should be visible.
            print('skipped ' + filename + ': ' + repr(error))
    return all_extracted_text

## OCR Tesseract (Scanned Document)

In [None]:
def get_pdf_file_content_Text_ocr(pdf_path, dpi=500, poppler_path=r'C:\Program Files\poppler-0.68.0\bin', output_path='ocr_out/'):
    '''
    Extract the text of a scanned PDF by rendering its pages and running OCR.

    pdf_path: path of the PDF file to read.
    dpi: resolution passed to convert_from_path when rendering the pages.
    poppler_path: folder of the poppler binaries passed to convert_from_path.
    output_path: folder in which the rendered pages are stored as PNG files.

    Returns the recognised text of all pages, joined by single spaces, in page
    order. Pages for which OCR fails are reported and left out.
    '''
    # Convert pdf to images (one image per page)
    images = convert_from_path(pdf_path, dpi, poppler_path=poppler_path)

    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Remember exactly which files belong to this PDF, in page order, so that
    # PNG files left in the folder by earlier calls are never read back.
    page_files = []
    for i, image in enumerate(images):
        fname = os.path.join(output_path, 'image_' + str(i) + '.png')
        image.save(fname, "PNG")
        page_files.append(fname)

    extracted_text = []
    # images to text
    for i, filename in enumerate(page_files):
        try:
            img1 = cv2.imread(filename)
            extracted_text.append(pytesseract.image_to_string(img1))
            print('successful' + str(i))
        except Exception:
            print('unsuccessful' + str(i))

    return ' '.join(extracted_text)

## Group PDF Scraping for OCR method

In [None]:
def group_read_ocr(folder_path):
    '''
    Extract the text of every '*.pdf' file in folder_path with the OCR method.

    folder_path: folder that is searched (non-recursively) for PDF files.

    Returns a dict whose keys are '<name>_<index>' (the trailing word
    characters of the file name plus the position of the file in the sorted
    listing) and whose values are the lower-cased recognised text.
    Files that cannot be processed are reported and skipped.
    '''
    all_extracted_text = {}
    # Sorted so that the index part of each key is reproducible between runs.
    for i, filename in enumerate(sorted(glob.glob(os.path.join(folder_path, '*.pdf')))):
        try:
            key = re.search(r'(\w+)\.pdf$', filename).group(1) + '_' + str(i)
            all_extracted_text[key] = get_pdf_file_content_Text_ocr(filename).lower()
        except Exception as error:
            # Best effort: one broken file must not abort the whole batch,
            # but the failure should be visible.
            print('skipped ' + filename + ': ' + repr(error))
    return all_extracted_text

# Function for Preprocessing 1
### Regex Conditioning for Scientific Text

In [None]:
def preprocessing_1(text_pdfmine):
    '''
    Clean raw text extracted from a scientific PDF with a chain of regex rules.

    text_pdfmine: the extracted text.

    Returns the lower-cased text after, in this order: removing hyphens that
    are followed by whitespace, replacing/removing newlines and bullets,
    rewriting the diaeresis sign and the sharp s, collapsing whitespace runs,
    and removing 'et al' mentions, table/figure references and URLs.
    No e-mail removal is performed.
    '''
    # Hyphen followed by whitespace (e.g. a word broken over two lines)
    text = re.sub(r'(-\n?\s)', '', text_pdfmine.lower())

    # Newlines and bullets: if at least one of them sits directly between two
    # pairs of word characters they all become spaces, otherwise they are
    # all dropped.
    if re.search(r'(?<=(\w\w))[\n•](?=(\w\w))', text):
        text = re.sub(r'[\n•]', ' ', text)
    else:
        text = re.sub(r'[\n•]', '', text)

    # German letters: a stand-alone diaeresis after o/u/a becomes 'e',
    # the sharp s becomes 'ss'.
    text = re.sub(r'(?<=[oua])(¨)', 'e', text)
    text = re.sub(r'ß', 'ss', text)

    # Over-spacing: keep only the last character of each whitespace run
    text = re.sub(r'(\s)(?=\s+)', '', text)

    # Delete '<word> et al' together with the character that follows it
    text = re.sub(r'(\w+\s+et al.)', '', text)

    # Expand the 'fi' ligature, then eliminate table / figure references
    text = re.sub(r'ﬁ', 'fi', text)
    text = re.sub(r'(table|tables|figure|figures|fig.?|figs.)\s+(\d+|\d+.)', '', text)

    # Eliminate websites
    text = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w\s_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', text)

    return text

# Function for Preprocessing 2 
### (Sentence separation (by spaCy) with Conditioning, Stemmer / Lemmatizer, & Stopwords)

In [3]:
def preprocessing_2(text_punew_spdo_ger_ovsp_etal_fig_emai_web):
    '''
    Split cleaned text into sentences with spaCy and normalise each sentence.

    text_punew_spdo_ger_ovsp_etal_fig_emai_web: text to process (the
    parameter name suggests the output of preprocessing_1).

    Steps:
    - run the module-level spaCy pipeline `spc` on the text,
    - take the lemma text of every sentence, remove letter/digit/symbol
      combinations and the names of entities labelled as persons,
    - keep only the sentences between the first 'introduction' sentence and
      the last 'acknowledgment' sentence (or, when there is none, the last
      'reference' sentence),
    - remove the content of the first parenthesised span when that span
      contains a digit, remove the stop words read from
      'gist_stopwords.txt' (comma separated), digits and punctuation, and
      tidy up whitespace,
    - drop sentences that split into four or fewer space-separated tokens.

    Returns the list of remaining sentences.
    '''
    #Splitted into sentences by Spacy
    spc_text = spc(text_punew_spdo_ger_ovsp_etal_fig_emai_web)

    #Read the file Stopwords (entries are trimmed; empty entries are dropped
    #because an empty alternative would make the pattern match everywhere)
    with open("gist_stopwords.txt", "r") as gist_file:
        stop_words = [word.strip() for word in gist_file.read().split(",")]
    stop_words = [word for word in stop_words if word]

    #List all possible name with spaCy
    names = [ent.text for ent in spc_text.ents
             if ent.label_.lower() == 'person' and ent.text.strip()]

    #Names are matched literally, so they are escaped before interpolation
    name_pattern = None
    if names:
        name_pattern = regex.compile(
            r",*(\s*\b(?:{}))\b".format("|".join(regex.escape(n) for n in names)))

    #Pattern for combination of alphabet and digit in word
    pca = r'!"#$%&\*+/:;<=>?@[\]^_`{|}~'
    pattern_comb = regex.compile(r'([a-z]+[\d]+|\d+[a-z]+|\d+[{}]+|[a-z]+[{}]+|[{}]+[a-z]+|[{}]+[\d]+)'.format(pca,pca,pca,pca))

    prepro1 = []
    col_intro = []
    stop_ack_stc = []
    stop_ref_stc = []

    #List of Sentences with application of lemmatization
    for i, sentence in enumerate(spc_text.sents):

        #Eliminate word - number - punc-chars combination
        sent_lem = regex.sub(pattern_comb, '', sentence.lemma_)

        #delete name
        if name_pattern is not None:
            sent_lem = name_pattern.sub('', sent_lem)

        prepro1.append(sent_lem)

        #Search for the start of acknowledgements and references
        if re.search('(acknowledgments|acknowledgment)', sent_lem):
            stop_ack_stc.append(i)

        if re.search('(references|reference)', sent_lem):
            stop_ref_stc.append(i)

        elif re.search('(introduction)', sent_lem):
            col_intro.append(i)

    #Cut parts before Introduction and after the acknowledgments or references.
    #Acknowledgment markers take precedence over reference markers; without
    #any end marker the sentence list is left untouched.
    end_markers = stop_ack_stc if stop_ack_stc else stop_ref_stc
    if end_markers:
        start = col_intro[0] if col_intro else None
        prepro1 = prepro1[start:end_markers[-1]]

    #Pattern for Remove Citation
    pattern1 = regex.compile(r'\(([\w\s\d{}]+)\)'.format(pc))

    #Pattern for Apply the Stopwords (matched literally, hence escaped)
    pattern2 = None
    if stop_words:
        pattern2 = re.compile(
            r",*(\s*\b(?:{}))\b".format("|".join(re.escape(w) for w in stop_words)))

    #Pattern for unused space first n last
    pattern3 = regex.compile(r'((^\s+)(?=.)|(?<=.)(\s+$))')

    #Pattern for Apply over-space
    pattern4 = regex.compile(r'(\s)(?=\s+)')

    #Pattern Additional compiler
    pattern_add = regex.compile(r'(\x01|\x0c|\s[a-z]\s|°|^\b(\w\s)|I|)')

    prepro2 = []
    for sentence in prepro1:
        #Remove the content of the first parenthesised span if it holds a
        #digit. The content is removed as literal text: using it as a regex
        #would misbehave on characters such as '.', '+' or nested brackets.
        citation = pattern1.search(sentence)
        if citation is not None and regex.search(r'\d', citation.group()):
            sentence = sentence.replace(citation.group(1), '')

        #Weird character of ﬂ
        layer = sentence.replace('ﬂ', 'fl')

        #Apply the stopwords
        if pattern2 is not None:
            layer = pattern2.sub('', layer)

        #With Removing number and punctuations
        layer = regex.sub(r'(\d|[^A-Za-zöäüéíáúóðèñæýßôþ\s]|[^\P{P}]+)', ' ', layer)

        #With Removing unused space first n last
        layer = pattern3.sub('', layer)

        #With Removing over-space
        layer = pattern4.sub('', layer)

        #With Removing Additional compiler
        prepro2.append(pattern_add.sub('', layer))

    #Final filtering: keep sentences with more than four space-separated tokens
    return [sentence for sentence in prepro2 if len(sentence.split(' ')) > 4]

# Function to build NER Library

In [7]:
def read_input_entities(folder_path):
    '''
    Read every '*.txt' file in folder_path as a list of entity terms.

    folder_path: folder that is searched (non-recursively) for text files.

    Returns a dict that maps the trailing word characters of each file name
    (without '.txt') to the list of its lower-cased, non-empty lines.
    create_lib_entities looks the library up under the module-level name
    `dict_entities`, so assign the result to that name before calling it.
    '''
    dict_entities = {}
    for filename in glob.glob(os.path.join(folder_path, '*.txt')):
        with open(filename, "r") as gist_file:
            content = gist_file.read().lower()
        key = re.search(r'(\w+)\.txt$', filename).group(1)
        # filter(None, ...) drops the empty strings produced by blank lines
        dict_entities[key] = list(filter(None, content.split("\n")))
    return dict_entities
            
def create_lib_entities(prepro2, entities=None):
    '''
    Tag every sentence with the entity terms it contains.

    prepro2: iterable of sentences (strings).
    entities: dict that maps a list name to a list of terms. When omitted,
        the module-level `dict_entities` is used.

    Returns a dict with three parallel lists: 'text' (the sentence), 'tag'
    (the matched term, including any dots directly in front of it) and
    'entity' (the part of the list name after 'list_', or the whole list
    name when it has no such part). One entry is added per match.
    '''
    if entities is None:
        entities = dict_entities

    # Build one pattern per entity list up front instead of once per sentence.
    compiled = []
    for list_name, terms in entities.items():
        terms = [term for term in terms if term]
        if not terms:
            # An empty alternation would match at every word boundary.
            continue
        label_match = re.search(r'list_(\w+)$', list_name)
        label = label_match.group(1) if label_match else list_name
        # Terms are matched literally, so they are escaped.
        pattern = re.compile(
            r"(\.*\b(?:{}))\b".format("|".join(re.escape(term) for term in terms)))
        compiled.append((label, pattern))

    #Ready for NLP NER
    ready_ner = {'text':[],'tag':[],'entity':[]}
    for sentence in prepro2:
        for label, pattern in compiled:
            # The pattern has a single group, so findall yields plain strings.
            for tag in pattern.findall(sentence):
                ready_ner['text'].append(sentence)
                ready_ner['tag'].append(tag)
                ready_ner['entity'].append(label)
    return ready_ner