# Modules for Scraping

In [62]:
#PDF Miner (PDF Scraping)
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import HTMLConverter, TextConverter, XMLConverter
from io import StringIO
from pdfminer.pdfpage import PDFPage


#PyPDF2 (PDF Scraping)
import PyPDF2 as pypdf


#OCR
from pdf2image import convert_from_path
import cv2
import pytesseract
pytesseract.pytesseract.tesseract_cmd=r'Tesseract-OCR\tesseract.exe'


#NLP Preprocessing
import nltk
import re
import regex
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


#NLP Processing TensorFlow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

#NLP Processing Spacy
import spacy
import spacy_transformers
# spc = spacy.load('en_core_web_trf')
spc = spacy.load('en_core_web_lg')
# spc.add_pipe('sentencizer')

#Model Training and Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

#Basic
import numpy as np
import pandas as pd
import pickle
import glob
import os
import docker
import string

# ASCII punctuation characters (the same 32 characters as string.punctuation).
# The backslash is written as '\\' so the literal no longer contains the
# invalid escape sequence '\]'; the resulting string is unchanged.
pc = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Data Input for Scanned PDF

In [63]:
# Input file for the scanned-PDF (OCR) section
pdf_path = r"PDF Data/1920s/ajs.s5-7.38.81.pdf"

# OCR Tesseract

In [101]:
# Scanned PDF Read
def get_pdf_file_content_Text_ocr(pdf_path, dpi=500,
                                  poppler_path=r'C:\Program Files\poppler-0.68.0\bin',
                                  output_path='ocr_out/'):
    """Return the text of a scanned PDF obtained by OCR.

    Every page is rendered to a PNG file inside ``output_path`` and the PNG
    files are then read back with OpenCV and passed to Tesseract.

    Parameters
    ----------
    pdf_path : path of the PDF file to read.
    dpi : resolution handed to ``convert_from_path`` (default 500).
    poppler_path : folder that contains the poppler binaries.
    output_path : folder that receives the intermediate page images.

    Returns
    -------
    str
        The OCR text of all pages that could be read, joined with a single
        space, in page order.
    """
    # Convert pdf to images
    images = convert_from_path(pdf_path, dpi, poppler_path=poppler_path)

    # image.save() does not create the folder, so make sure it exists
    os.makedirs(output_path, exist_ok=True)

    # Remember the files written for THIS pdf. Globbing the folder instead
    # would also pick up images left over from earlier runs and would return
    # them in lexicographic order (image_10 before image_2).
    page_files = []
    for i, image in enumerate(images):
        fname = os.path.join(output_path, 'image_' + str(i) + '.png')
        image.save(fname, "PNG")
        page_files.append(fname)

    extracted_text = []
    # images to text
    for i, filename in enumerate(page_files):
        try:
            img1 = cv2.imread(filename)
            extracted_text.append(pytesseract.image_to_string(img1))
            print('successful' + str(i))
        except Exception:
            # Best effort: a page that cannot be read is reported and skipped
            print('unsuccessful' + str(i))

    text = ' '.join(extracted_text)

    return text

successful0
successful1
successful2
successful3
successful4
successful5
successful6
successful7
successful8
successful9
successful10
successful11
successful12
successful13
successful14
successful15
successful16
successful17
successful18
successful19
successful20
successful21


In [14]:
# get_pdf_file_content_Text_ocr already returns one string; calling
# ' '.join() on that string would insert a space between every character.
text_pdfmine = get_pdf_file_content_Text_ocr(pdf_path)
text_pdfmine

'THE\n\nAMERICAN JOURNAL OF SCIENCE\n[FIFTH SERIES.]\n\n———_#oe\n\nArt, VI.—The Thermal Conductivity and Compressi-\nbility of several Rocks under High Pressures; by\nP. W. Brineman.\n\nINTRODUCTION.\n\nThe effect of pressure on the thermal conductivity of\nrocks has apparently never been determined, although a\nknowledge of it is important in calculating the thermal\nequilibrium of the earth’s crust. The only measure-\nments we have that can give any idea of the possible\ninfluence of hydrostatic pressure are on the effect of one-\nsided compression on the thermal conductivity of granite\nand marble.t It1is of course well known that a one-sided\ncompression need not be at all like a hydrostatic pres-\nsure in its effects, and the only conclusion that we can\ndraw from the experiments of Lees as applied to condi-\ntions within the crust of the earth is that the effect of\npressure is probably not very large. .\n\nIn connection with my work on the effect of high pres-\nsures on general 

## Group PDF Scraping for OCR method

In [None]:
# #Group Read OCR
# all_extracted_text={}
# folder_path = 'PDF Data/1920s/'
# for i,filename in enumerate(glob.glob(os.path.join(folder_path, '*.pdf'))):
#     try:
#         all_extracted_text[re.search(r'(\w+).pdf$',filename).group(1) +'_'+ str(i)] = get_pdf_file_content_Text_ocr(filename).lower()
#     except:
#         pass

# Data Input for Normal PDF

In [12]:
# Input file for the normal (text-based) PDF section
pdf_path = 'PDF Data/10s/Brehme2016_Article_PermeabilityDistributionInTheL.pdf'

# PDFMiner

In [13]:
def get_pdf_file_content_Text(path_to_pdf):
    """Extract the text of a PDF with PDFMiner and return it as one string.

    Parameters
    ----------
    path_to_pdf : path of the PDF file whose content is extracted.

    Returns
    -------
    str
        Everything the TextConverter wrote while the pages were processed.

    Raises
    ------
    OSError
        If the file cannot be opened.
    Exception
        Whatever PDFMiner raises when the second reading attempt fails too.
    """
    # The PDF file and the in-memory text buffer are closed by the
    # with-statement even when PDFMiner raises.
    with open(path_to_pdf, 'rb') as fp, StringIO() as out_text:
        # Stores shared resources such as fonts or images found in the file
        resource_manager = PDFResourceManager(caching=True)

        # Layout parameters
        laParams = LAParams(line_overlap=0.3, detect_vertical=True)

        # Converter that writes the extracted text into out_text
        text_converter = TextConverter(resource_manager, out_text, laparams=laParams)
        try:
            interpreter = PDFPageInterpreter(resource_manager, text_converter)

            # Process the content of each page of the original PDF file
            try:
                for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password="", caching=True, check_extractable=True):
                    interpreter.process_page(page)
            except Exception:
                # Second attempt with a bytes password
                # (presumably for pdfminer versions that expect bytes - TODO confirm).
                # Start again from the beginning of the file and throw away the
                # partial output so that no page ends up in the result twice.
                fp.seek(0)
                out_text.seek(0)
                out_text.truncate(0)
                for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password=b"", caching=True, check_extractable=True):
                    interpreter.process_page(page)

            # Considering Vertical Written (kept from the original, disabled):
            #         if page.mediabox[2] - page.mediabox[0] > page.mediabox[3] - page.mediabox[1]:
            #             orientation = 'Landscape'
            #         else:
            #             orientation = 'Portrait'

            # Must be read before the StringIO object is closed
            text = out_text.getvalue()
        finally:
            text_converter.close()

    # Return the final variable containing all the text of the PDF
    return text

In [14]:
# Extract the text of the single PDF selected in pdf_path and display it
text_pdfmine = get_pdf_file_content_Text(pdf_path)
text_pdfmine

'Environ Earth Sci (2016) 75:1088\nDOI 10.1007/s12665-016-5878-9\n\nO R I G I N A L A R T I C L E\n\nPermeability distribution in the Lahendong geothermal ﬁeld:\nA blind fault captured by thermal–hydraulic simulation\n\nMaren Brehme1 • Guido Blo¨cher1 • Mauro Cacace1 • Yustin Kamah2 •\nMartin Sauter3 • Gu¨ nter Zimmermann1\n\nReceived: 24 November 2015 / Accepted: 4 July 2016 / Published online: 19 July 2016\nÓ Springer-Verlag Berlin Heidelberg 2016\n\nAbstract Subsurface ﬂuid ﬂow of reservoirs in active\ntectonic regions is mainly controlled by permeability of\nfault zones. Therefore, the characterization of fault zones is\nan important step toward performance assessment of a\nreservoir. The ﬂuid ﬂow is controlled also by pressure and\ntemperature conditions.\nIn this context, we simulated\npressure and temperature ﬁelds to elaborate on the inﬂu-\nence of permeability on subsurface ﬂuid ﬂow in the\nLahendong geothermal reservoir. Thermal–hydraulic sim-\nulation is performed using a ﬁn

## Group PDF Scraping for normal method

In [64]:
#Group Read PDF Miner
# Reads every PDF of the folder; the lower-cased text is stored under
# '<file name stem>_<index>' and all texts are finally joined into one string.
all_extracted_text={}
folder_path='PDF Data/10s/'
for i,filename in enumerate(glob.glob(os.path.join(folder_path, '*.pdf'))):
    # Key stem: the word characters right before '.pdf'. When the name does not
    # match, fall back to the plain file name instead of losing the file.
    stem_match = re.search(r'(\w+)\.pdf$', filename)
    if stem_match:
        key_stem = stem_match.group(1)
    else:
        key_stem = os.path.splitext(os.path.basename(filename))[0]
    try:
        all_extracted_text[key_stem +'_'+ str(i)] = get_pdf_file_content_Text(filename).lower()
    except Exception as err:
        # Best effort: keep going, but say which file was skipped and why
        print('skipped ' + filename + ': ' + repr(err))
text_pdfmine = ' '.join(list(all_extracted_text.values()))
text_pdfmine

"tectonophysics 683 (2016) 124–137\n\ncontents lists available at sciencedirect\n\ntectonophysics\n\nj o u r n a l h o me p a g e : w w w . e l s e v i e r . c o m / l o c a t e / t e c t o\n\nexperimental investigations on the thermal conductivity characteristics\nof beishan granitic rocks for china's hlw disposal\nx.g. zhao a,⁎, j. wang a, f. chen a,b, p.f. li a, l.k. ma a, j.l. xie a, y.m. liu a\n\na cnnc key laboratory on geological disposal of high-level radioactive waste, beijing research institute of uranium geology, beijing 100029, china\nb school of civil and environmental engineering, university of science and technology beijing, beijing 100083, china\n\na r t i c l e\n\ni n f o\n\na b s t r a c t\n\narticle history:\nreceived 28 december 2015\nreceived in revised form 26 april 2016\naccepted 18 june 2016\navailable online 20 june 2016\n\nkeywords:\nthermal conductivity\nwater saturation\ntemperature\naxial stress\nbeishan granite\ngeological disposal\n\ncrystalline rocks are

## Data Preprocessing (Conditioning, Stop Words, Stemmer)

## Pre-Processing 1 : Regex Conditioning for Scientific Text

In [65]:
# Undo hyphenation at line breaks: remove a '-' together with the
# whitespace character (optionally preceded by a newline) that follows it
text_punew = re.sub(pattern=r'(-\n?\s)', repl='', string=text_pdfmine.lower())
text_punew

"tectonophysics 683 (2016) 124–137\n\ncontents lists available at sciencedirect\n\ntectonophysics\n\nj o u r n a l h o me p a g e : w w w . e l s e v i e r . c o m / l o c a t e / t e c t o\n\nexperimental investigations on the thermal conductivity characteristics\nof beishan granitic rocks for china's hlw disposal\nx.g. zhao a,⁎, j. wang a, f. chen a,b, p.f. li a, l.k. ma a, j.l. xie a, y.m. liu a\n\na cnnc key laboratory on geological disposal of high-level radioactive waste, beijing research institute of uranium geology, beijing 100029, china\nb school of civil and environmental engineering, university of science and technology beijing, beijing 100083, china\n\na r t i c l e\n\ni n f o\n\na b s t r a c t\n\narticle history:\nreceived 28 december 2015\nreceived in revised form 26 april 2016\naccepted 18 june 2016\navailable online 20 june 2016\n\nkeywords:\nthermal conductivity\nwater saturation\ntemperature\naxial stress\nbeishan granite\ngeological disposal\n\ncrystalline rocks are

In [66]:
# Remove newlines and bullet characters ('\n' and '•').
# They become a space when at least one of them sits between two pairs of
# word characters; otherwise they are deleted without a replacement.
text_punew_spdo = re.sub(
    r'[\n•]',
    ' ' if regex.search(r'(?<=(\w\w))[\n•](?=(\w\w))', text_punew) else '',
    text_punew,
)

text_punew_spdo

"tectonophysics 683 (2016) 124–137  contents lists available at sciencedirect  tectonophysics  j o u r n a l h o me p a g e : w w w . e l s e v i e r . c o m / l o c a t e / t e c t o  experimental investigations on the thermal conductivity characteristics of beishan granitic rocks for china's hlw disposal x.g. zhao a,⁎, j. wang a, f. chen a,b, p.f. li a, l.k. ma a, j.l. xie a, y.m. liu a  a cnnc key laboratory on geological disposal of high-level radioactive waste, beijing research institute of uranium geology, beijing 100029, china b school of civil and environmental engineering, university of science and technology beijing, beijing 100083, china  a r t i c l e  i n f o  a b s t r a c t  article history: received 28 december 2015 received in revised form 26 april 2016 accepted 18 june 2016 available online 20 june 2016  keywords: thermal conductivity water saturation temperature axial stress beishan granite geological disposal  crystalline rocks are potential host rock types for the 

In [67]:
# German characters: a stand-alone diaeresis sign after o/u/a becomes 'e'
# and the sharp s 'ß' becomes 'ss'
text_punew_spdo_ger = re.sub(pattern=r'(?<=[oua])(¨)', repl='e', string=text_punew_spdo)
text_punew_spdo_ger = text_punew_spdo_ger.replace('ß', 'ss')
text_punew_spdo_ger

"tectonophysics 683 (2016) 124–137  contents lists available at sciencedirect  tectonophysics  j o u r n a l h o me p a g e : w w w . e l s e v i e r . c o m / l o c a t e / t e c t o  experimental investigations on the thermal conductivity characteristics of beishan granitic rocks for china's hlw disposal x.g. zhao a,⁎, j. wang a, f. chen a,b, p.f. li a, l.k. ma a, j.l. xie a, y.m. liu a  a cnnc key laboratory on geological disposal of high-level radioactive waste, beijing research institute of uranium geology, beijing 100029, china b school of civil and environmental engineering, university of science and technology beijing, beijing 100083, china  a r t i c l e  i n f o  a b s t r a c t  article history: received 28 december 2015 received in revised form 26 april 2016 accepted 18 june 2016 available online 20 june 2016  keywords: thermal conductivity water saturation temperature axial stress beishan granite geological disposal  crystalline rocks are potential host rock types for the 

In [68]:
# Collapse over-spacing: every whitespace character that is followed by
# further whitespace is dropped, so one whitespace character remains per run
text_punew_spdo_ger_ovsp = re.compile(r'(\s)(?=\s+)').sub('', text_punew_spdo_ger)
text_punew_spdo_ger_ovsp

"tectonophysics 683 (2016) 124–137 contents lists available at sciencedirect tectonophysics j o u r n a l h o me p a g e : w w w . e l s e v i e r . c o m / l o c a t e / t e c t o experimental investigations on the thermal conductivity characteristics of beishan granitic rocks for china's hlw disposal x.g. zhao a,⁎, j. wang a, f. chen a,b, p.f. li a, l.k. ma a, j.l. xie a, y.m. liu a a cnnc key laboratory on geological disposal of high-level radioactive waste, beijing research institute of uranium geology, beijing 100029, china b school of civil and environmental engineering, university of science and technology beijing, beijing 100083, china a r t i c l e i n f o a b s t r a c t article history: received 28 december 2015 received in revised form 26 april 2016 accepted 18 june 2016 available online 20 june 2016 keywords: thermal conductivity water saturation temperature axial stress beishan granite geological disposal crystalline rocks are potential host rock types for the constructio

In [69]:
# Remove "<word> et al." mentions (the word directly in front is removed too).
# NOTE: the final '.' of the pattern is unescaped, so it matches any character.
text_punew_spdo_ger_ovsp_etal = regex.compile(r'(\w+\s+et al.)').sub('', text_punew_spdo_ger_ovsp)
text_punew_spdo_ger_ovsp_etal

"tectonophysics 683 (2016) 124–137 contents lists available at sciencedirect tectonophysics j o u r n a l h o me p a g e : w w w . e l s e v i e r . c o m / l o c a t e / t e c t o experimental investigations on the thermal conductivity characteristics of beishan granitic rocks for china's hlw disposal x.g. zhao a,⁎, j. wang a, f. chen a,b, p.f. li a, l.k. ma a, j.l. xie a, y.m. liu a a cnnc key laboratory on geological disposal of high-level radioactive waste, beijing research institute of uranium geology, beijing 100029, china b school of civil and environmental engineering, university of science and technology beijing, beijing 100083, china a r t i c l e i n f o a b s t r a c t article history: received 28 december 2015 received in revised form 26 april 2016 accepted 18 june 2016 available online 20 june 2016 keywords: thermal conductivity water saturation temperature axial stress beishan granite geological disposal crystalline rocks are potential host rock types for the constructio

In [70]:
# Replace the 'ﬁ' ligature by plain 'fi', then eliminate references to
# tables and figures ("table 2", "fig. 3", ...)
text_punew_spdo_ger_ovsp_etal = text_punew_spdo_ger_ovsp_etal.replace('ﬁ', 'fi')
text_punew_spdo_ger_ovsp_etal_fig = re.sub(
    pattern=r'(table|tables|figure|figures|fig.?|figs.)\s+(\d+|\d+.)',
    repl='',
    string=text_punew_spdo_ger_ovsp_etal,
)
text_punew_spdo_ger_ovsp_etal_fig

"tectonophysics 683 (2016) 124–137 contents lists available at sciencedirect tectonophysics j o u r n a l h o me p a g e : w w w . e l s e v i e r . c o m / l o c a t e / t e c t o experimental investigations on the thermal conductivity characteristics of beishan granitic rocks for china's hlw disposal x.g. zhao a,⁎, j. wang a, f. chen a,b, p.f. li a, l.k. ma a, j.l. xie a, y.m. liu a a cnnc key laboratory on geological disposal of high-level radioactive waste, beijing research institute of uranium geology, beijing 100029, china b school of civil and environmental engineering, university of science and technology beijing, beijing 100083, china a r t i c l e i n f o a b s t r a c t article history: received 28 december 2015 received in revised form 26 april 2016 accepted 18 june 2016 available online 20 june 2016 keywords: thermal conductivity water saturation temperature axial stress beishan granite geological disposal crystalline rocks are potential host rock types for the constructio

In [71]:
# Eliminate e-mail addresses (matched with the third-party `regex` module)
text_punew_spdo_ger_ovsp_etal_fig_emai = regex.sub(
    pattern=r'\b[\w-.]+?@\w+?.\w+[\w\.\-]+\b',
    repl='',
    string=text_punew_spdo_ger_ovsp_etal_fig,
)
text_punew_spdo_ger_ovsp_etal_fig_emai

"tectonophysics 683 (2016) 124–137 contents lists available at sciencedirect tectonophysics j o u r n a l h o me p a g e : w w w . e l s e v i e r . c o m / l o c a t e / t e c t o experimental investigations on the thermal conductivity characteristics of beishan granitic rocks for china's hlw disposal x.g. zhao a,⁎, j. wang a, f. chen a,b, p.f. li a, l.k. ma a, j.l. xie a, y.m. liu a a cnnc key laboratory on geological disposal of high-level radioactive waste, beijing research institute of uranium geology, beijing 100029, china b school of civil and environmental engineering, university of science and technology beijing, beijing 100083, china a r t i c l e i n f o a b s t r a c t article history: received 28 december 2015 received in revised form 26 april 2016 accepted 18 june 2016 available online 20 june 2016 keywords: thermal conductivity water saturation temperature axial stress beishan granite geological disposal crystalline rocks are potential host rock types for the constructio

In [72]:
# Eliminate web addresses that start with http://, https://, ftp:// or ssh://
text_punew_spdo_ger_ovsp_etal_fig_emai_web = regex.sub(
    pattern=r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w\s_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
    repl='',
    string=text_punew_spdo_ger_ovsp_etal_fig_emai,
)
text_punew_spdo_ger_ovsp_etal_fig_emai_web

"tectonophysics 683 (2016) 124–137 contents lists available at sciencedirect tectonophysics j o u r n a l h o me p a g e : w w w . e l s e v i e r . c o m / l o c a t e / t e c t o experimental investigations on the thermal conductivity characteristics of beishan granitic rocks for china's hlw disposal x.g. zhao a,⁎, j. wang a, f. chen a,b, p.f. li a, l.k. ma a, j.l. xie a, y.m. liu a a cnnc key laboratory on geological disposal of high-level radioactive waste, beijing research institute of uranium geology, beijing 100029, china b school of civil and environmental engineering, university of science and technology beijing, beijing 100083, china a r t i c l e i n f o a b s t r a c t article history: received 28 december 2015 received in revised form 26 april 2016 accepted 18 june 2016 available online 20 june 2016 keywords: thermal conductivity water saturation temperature axial stress beishan granite geological disposal crystalline rocks are potential host rock types for the constructio

## Pre-Processing 2 : Sentences with Conditioning, Stemmer / Lemmatizer, & Stopwords

In [73]:
# Split the conditioned text into sentences with spaCy
spc_text = spc(text_punew_spdo_ger_ovsp_etal_fig_emai_web)

# Result containers
prepro1 = []        # lemmatised sentences
col_intro = []      # indices of sentences that mention "introduction"
stop_ack_stc = []   # indices of sentences that mention "acknowledgment(s)"
stop_ref_stc = []   # indices of sentences that mention "reference(s)"

# Punctuation used for the word/number/punctuation combinations.
# Raw string: same characters as before, without invalid escape sequences.
pca = r'!"#$%&\*+/:;<=>?@[\]^_`{|}~'
pattern_comb = regex.compile(r'([a-z]+[\d]+|\d+[a-z]+|\d+[{}]+|[a-z]+[{}]+|[{}]+[a-z]+|[{}]+[\d]+)'.format(pca,pca,pca,pca))

# List all entities that spaCy labels as a person
name = [ent.text for ent in spc_text.ents if ent.label_.lower() == 'person']

# Build the name pattern once. The names are escaped because they are plain
# text: an unescaped '(' or '[' in a single name used to make the whole
# pattern invalid, and the error was silently ignored for every sentence.
name_pattern = None
if name:
    unique_names = list(dict.fromkeys(name))  # drop duplicates, keep order
    name_pattern = regex.compile(
        r",*(\s*\b(?:{}))\b".format("|".join(re.escape(person) for person in unique_names))
    )

# List of sentences with lemmatization
for i,j in enumerate(spc_text.sents):

    # Eliminate word - number - punctuation combinations
    sent_lem = regex.sub(pattern_comb,'',j.lemma_)

    # Delete person names
    if name_pattern is not None:
        sent_lem = name_pattern.sub('', sent_lem)

    prepro1.append(sent_lem)

    # Search for the start of the acknowledgments and the references
    if re.search('(acknowledgments|acknowledgment)',sent_lem):
        stop_ack_stc.append(i)

    if re.search('(references|reference)',sent_lem):
        stop_ref_stc.append(i)

    # As before, a sentence is only counted as introduction when it does not
    # mention the references as well
    elif re.search('(introduction)',sent_lem):
        col_intro.append(i)

# Cut the parts before the introduction and after the acknowledgments
# (preferred) or the references. Without any end marker nothing is cut,
# and without an introduction marker the text is kept from its beginning.
end_markers = stop_ack_stc if stop_ack_stc else stop_ref_stc
if end_markers:
    first_sentence = col_intro[0] if col_intro else None
    prepro1 = prepro1[first_sentence:end_markers[-1]]

#All results
brute_result = prepro1

In [74]:
prepro1

['introduction deep geological disposal have be the internationally accept approach for the permanent disposal of high level radioactive waste (hlw) generate from nuclear power plant and other nuclear facility.',
 ' hlw repository can be construct inhost rockdepth of several hundred meter below the ground surface.',
 'the design of hlw repository often rely onmulti-barrier system, which typically consist of the natural geological barrier and engineer barrier system.',
 ' the last defense to the biosphere, the natural geological barrier (i.e., the host rock) playcritical role in ensure the long-term safety of the hlw repository.',
 'because crystalline rock such granite and diorite have low permeability, high solidity, and good excavation stability, they have be consider potential hlw repository formation in some country (,  wang, 2014).',
 "site selection for china's hlw repository start in 1985 (wang, 2010).",
 'the effort have be focus on potential hlw repository site locate within g

In [82]:
# Read the stopword file (comma separated).
# NOTE: `stopwords` rebinds the name imported from nltk.corpus at the top.
with open("gist_stopwords.txt", "r") as gist_file:
    content = gist_file.read()

# Strip stray whitespace (e.g. the newline at the end of the file) and drop
# empty entries; an empty alternative would match at every word boundary.
stopwords = [word.strip() for word in content.split(",") if word.strip()]

prepro2=[]

# Pattern for citations: one parenthesised group of word characters,
# whitespace and ASCII punctuation. The parentheses themselves are left out
# of the character class, otherwise one match could stretch from the first
# '(' to the last ')' of a sentence.
pattern1 = regex.compile(r'\(([\w\s\d{}]+)\)'.format(re.escape(pc.replace('(', '').replace(')', ''))))

# Pattern for the stopwords (escaped, because they are plain words)
pattern2 = r",*(\s*\b(?:{}))\b".format("|".join(re.escape(word) for word in stopwords))
stopword_re = re.compile(pattern2) if stopwords else None

# Pattern for unused space at the start and the end
pattern3 = regex.compile(r'((^\s+)(?=.)|(?<=.)(\s+$))')

# Pattern for over-spacing
pattern4 = regex.compile(r'(\s)(?=\s+)')

# Additional clean-up: control characters, degree sign, a single leading
# character and the capital letter I
pattern_add = regex.compile(r'(\x01|\x0c|°|^\b(\w\s)|I)')

# A single letter standing alone between two whitespace characters. Only the
# leading whitespace and the letter are consumed, so the words on both sides
# stay separated (deleting ' a ' completely used to glue them together).
pattern_single_letter = regex.compile(r'\s[a-z](?=\s)')


def _drop_numeric_parenthetical(match):
    """Return '' for a parenthetical that contains a digit, else keep it."""
    return '' if regex.search(r'\d', match.group(1)) else match.group(0)


for sentence in prepro1:
    # Remove citations, i.e. every parenthetical that contains a digit
    layer = pattern1.sub(_drop_numeric_parenthetical, sentence)

    # Replace the 'ﬂ' ligature
    layer = layer.replace('ﬂ', 'fl')

    # Apply the stopwords
    if stopword_re is not None:
        layer = stopword_re.sub('', layer)

    # Replace numbers and punctuation by a space
    layer = regex.sub(r'(\d|[^A-Za-zöäüéíáúóðèñæýßôþ\s]|[^\P{P}]+)',' ', layer)

    # Remove unused space at the start and the end
    layer = regex.sub(pattern3,'',layer)

    # Remove over-spacing
    layer = regex.sub(pattern4,'',layer)

    # Remove stand-alone letters and the additional characters
    layer = pattern_single_letter.sub('', layer)
    layer = regex.sub(pattern_add,'',layer)

    # The removals above can leave double or outer spaces behind; tidy up so
    # that the word count below is not inflated by empty tokens
    layer = regex.sub(pattern4,'',layer)
    layer = regex.sub(pattern3,'',layer)

    prepro2.append(layer)

# Final filtering: keep only sentences with more than four
# space-separated tokens
prepro2 = [sentence for sentence in prepro2 if len(sentence.split(' ')) > 4]

In [83]:
prepro2

['introduction deep geological disposal internationally accept approach permanent disposal high level radioactive waste hlw generate nuclear power plant nuclear facility',
 'hlw repository construct inhost rockdepth meter ground surface',
 'design hlw repository rely onmulti barrier typically consist natural geological barrier engineer barrier',
 'defense biosphere natural geological barrier host rock playcritical role ensure long term safety hlw repository',
 'crystalline rock granite diorite low permeability high solidity good excavation stability potential hlw repository formation country',
 'site selection china hlw repository start',
 'effort focus potential hlw repository site locate granite intrusion mainland china',
 'institute uranium geology briug perform site characterization study beishan china',
 'beishan potential candidate china hlw repository',
 'engineering property crystalline rockpotential hlw repository site thermal conductivity parameter design consideration havedi

In [90]:
# Scratch check: applies the number/punctuation pattern (here with an extra 'ð' in the allowed letters) to one sample sentence
regex.sub(r'(\d|[^A-Za-zöäüéíáúóðèñæýßôþ\sð]|[^\P{P}]+)',' ','work require explain discrepancy define water saturation thermal conductivity function porosity rock express ksat kdry ð þ') 

'                                                                                                                          '

## NER Library

In [77]:
# Load every entity list (*.txt) of the folder into dict_entities.
# Key: the word characters right before the file extension.
# Value: the lower-cased, non-empty lines of the file.
dict_entities={}
path='C:/Users/Ryan Bobby A/Documents/RWTH AACHEN/KULIAH/THESIS_BISMILLAH/Thesis Progress/list_entities/'
for filename in glob.glob(os.path.join(path, '*.txt')):
    with open(filename, "r") as gist_file:
        content = gist_file.read().lower()
        dict_entities[re.search(r'(\w+).txt$',filename).group(1)] = [
            term for term in content.split("\n") if term
        ]

In [78]:
# Build the NER table: one row (text, tag, entity) for every term of an
# entity list that is found in a pre-processed sentence.
ready_ner = {'text':[],'tag':[],'entity':[]}

# Compile the pattern and derive the label of every entity list once,
# instead of once per sentence.
entity_patterns = []
for list_name, terms in dict_entities.items():
    if not terms:
        # An empty alternation would match at every word boundary and
        # produce rows with an empty tag
        continue
    label_match = re.search(r'list_(\w+)$', list_name)
    # Label is the part after 'list_'; a key without that prefix is used as is
    label = label_match.group(1) if label_match else list_name
    entity_patterns.append((label, re.compile(r"(\.*\b(?:{}))\b".format("|".join(terms)))))

for sentence in prepro2:
    for label, entity_re in entity_patterns:
        for found in entity_re.findall(sentence):
            # findall returns tuples when a term of the list brings its own
            # capturing groups; every non-empty group is then stored as a tag
            tags = [part for part in found if part] if isinstance(found, tuple) else [found]
            for tag in tags:
                ready_ner['text'].append(sentence)
                ready_ner['tag'].append(tag)
                ready_ner['entity'].append(label)

In [79]:
ready_ner['text']

['introduction deep geological disposal internationally accept approach permanent disposal high level radioactive waste hlw generate nuclear power plant nuclear facility',
 'crystalline rock granite diorite low permeability high solidity good excavation stability potential hlw repository formation country',
 'crystalline rock granite diorite low permeability high solidity good excavation stability potential hlw repository formation country',
 'crystalline rock granite diorite low permeability high solidity good excavation stability potential hlw repository formation country',
 'crystalline rock granite diorite low permeability high solidity good excavation stability potential hlw repository formation country',
 'crystalline rock granite diorite low permeability high solidity good excavation stability potential hlw repository formation country',
 'site selection china hlw repository start',
 'effort focus potential hlw repository site locate granite intrusion mainland china',
 'effort f

In [80]:
ready_ner['tag']

['high',
 'permeability',
 'formation',
 'granite',
 'diorite',
 'high',
 'china',
 'china',
 'granite',
 'china',
 'china',
 'thermal conductivity',
 'copper',
 'iron',
 'heat',
 'temperature',
 'temperature',
 'deposit',
 'temperature',
 'thermal conductivity',
 'lead',
 'thermal conductivity',
 'lead',
 'high',
 'thermal conductivity',
 'temperature',
 'high',
 'thermal conductivity',
 'thermal conductivity',
 'thermal conductivity',
 'porosity',
 'density',
 'mineral',
 'composition',
 'porosity',
 'thermal conductivity',
 'mineral',
 'composition',
 'porosity',
 'thermal conductivity',
 'volcanic',
 'sedimentary',
 'high',
 'porosity',
 'heat',
 'velocity',
 'thermal conductivity',
 'temperature',
 'thermal conductivity',
 'thermal conductivity',
 'temperature',
 'saturation',
 'thermal conductivity',
 'porosity',
 'thermal conductivity',
 'stress',
 'pressure',
 'thermal conductivity',
 'thermal conductivity',
 'pressure',
 'pressure',
 'thermal conductivity',
 'rate',
 'pore',
 

In [81]:
ready_ner['entity']

['petrography',
 'methods',
 'methods',
 'petrography',
 'petrography',
 'petrography',
 'countries',
 'countries',
 'petrography',
 'countries',
 'countries',
 'methods',
 'petrography',
 'petrography',
 'methods',
 'methods',
 'methods',
 'petrography',
 'methods',
 'methods',
 'geolocation',
 'methods',
 'petrography',
 'petrography',
 'methods',
 'methods',
 'petrography',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'petrography',
 'petrography',
 'methods',
 'methods',
 'petrography',
 'petrography',
 'methods',
 'methods',
 'petrography',
 'petrography',
 'petrography',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'methods',
 'petrography',
 'petrography',
 'geolocation',
 'me