In [49]:
#PDF Miner (PDF Scraping)
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import HTMLConverter, TextConverter, XMLConverter
from io import StringIO
from pdfminer.pdfpage import PDFPage

#NLP Preprocessing
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


#NLP Processing TensorFLow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

#NLP Processing Spacy
import spacy
spc = spacy.load("en_core_web_lg")
spc.add_pipe('sentencizer')

#Model Training and Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

#Basic
import numpy as np
import pandas as pd

## P3 Data (Stratigraphy)

In [242]:
df = pd.read_excel('C:/Users/Ryan Bobby A/Documents/RWTH AACHEN/KULIAH/THESIS_BISMILLAH/Thesis Progress/Target Data/Stratigraphy_P3.xlsx')

In [243]:
df.head()

Unnamed: 0,Stratigraphic ID,Stratigraphic parent ID,Eonothem / Eon ID,Erathem / Era ID,Period ID,System ID,Series / Epoch ID,Stage / Age ID,Eonothem / Eon,Erathem / Era,...,Series / Epoch,Stage / Age,Numerical_Age Lower Boundary [Ma],Num_Age Plus_Minus [Ma],Chronostratigraphical Unit,Stratigraphic ID.1,Rank,R,G,B
0,1.0,,1.0,,,,,,Precambrium,,...,,,4600.0,,Precambrium,1.0,1.0,239.0,103.0,130.0
1,2.0,1.0,1.0,2.0,,,,,Precambrium,Hadean,...,,,4600.0,,Hadean,2.0,2.0,178.0,20.0,141.0
2,3.0,1.0,1.0,3.0,,,,,Precambrium,Archean,...,,,4000.0,,Archean,3.0,2.0,234.0,1.0,140.0
3,4.0,3.0,1.0,3.0,4.0,,,,Precambrium,Archean,...,,,4000.0,,Eoarchean,4.0,3.0,224.0,58.0,149.0
4,5.0,3.0,1.0,3.0,5.0,,,,Precambrium,Archean,...,,,3600.0,,Paleoarchean,5.0,3.0,240.0,132.0,182.0


In [244]:
df_chrono = df[['Eonothem / Eon', 'Erathem / Era','Chronostratigraphical Unit']]
df_chrono.head()

Unnamed: 0,Eonothem / Eon,Erathem / Era,Chronostratigraphical Unit
0,Precambrium,,Precambrium
1,Precambrium,Hadean,Hadean
2,Precambrium,Archean,Archean
3,Precambrium,Archean,Eoarchean
4,Precambrium,Archean,Paleoarchean


In [245]:
list_chrono = df['Chronostratigraphical Unit'].values.tolist()
temp_eon = df['Eonothem / Eon'].values.tolist()
temp_era = df['Erathem / Era'].values.tolist()

In [246]:
list_chrono

['Precambrium',
 'Hadean',
 'Archean',
 'Eoarchean',
 'Paleoarchean',
 'Mesoarchean',
 'Neoarchean',
 'Proterozoic',
 'Paleoproterozoic',
 'Siderian',
 'Rhyacian',
 'Orosirian',
 'Satherian',
 'Mesoproterozoic',
 'Calymmian',
 'Ectasian',
 'Stenian',
 'Neoproterozoic',
 'Tonian',
 'Cryogenian',
 'Ediacaran',
 'Phanerozoic',
 'Paleozoic',
 'Cambrian',
 'Terreneuvian',
 'Fortunian',
 'Cambrian Stage 2',
 'Cambrian Series 2',
 'Cambrian Stage 3',
 'Cambrian Stage 4',
 'Cambrian Series 3',
 'Cambrian Stage 5',
 'Drumian',
 'Guzhangian',
 'Furongian',
 'Paibian',
 'Jingshanian',
 'Stage 10',
 'Ordovician',
 'Lower Ordovician',
 'Tremadocian',
 'Floian',
 'Middle Ordovician',
 'Dapingian',
 'Darriwilian',
 'Upper Ordovician',
 'Sandbian',
 'Katian',
 'Hirnantian',
 'Silurian',
 'Llandovery',
 'Rhuddanian',
 'Aeronian',
 'Telychian',
 'Wenlock',
 'Sheinwoodian',
 'Homerian',
 'Ludlow',
 'Gorstian',
 'Ludfordian',
 'Pridoli',
 'Devonian',
 'Lower Devonian',
 'Lochkovian',
 'Pragian',
 'Emsian'

In [790]:
with open ('list_chrono.txt', 'w') as f:
    for item in list_chrono:
        f.write(str(item) + '\n')

In [586]:
list_eon = []
for i in temp_eon:
    if i in list_eon:
        pass
    else:
        list_eon.append(i)
add = ['Archaean','Proterozoic']
for i in add:
    list_eon.append(i)
list_eon.remove(np.nan)
list_eon

['Precambrium', 'Phanerozoic', 'Archaean', 'Proterozoic']

In [791]:
with open ('list_eon.txt', 'w') as f:
    for item in list_eon:
        f.write(str(item) + '\n')

In [587]:
list_era = []
for i in temp_era:
    if i in list_era:
        pass
    else:
        list_era.append(i)
list_era.remove(np.nan)
list_era

['Hadean', 'Archean', 'Proterozoic', 'Paleozoic', 'Mesozoic', 'Cenozoic']

In [792]:
with open ('list_era.txt', 'w') as f:
    for item in list_era:
        f.write(str(item) + '\n')

## P3 Data (Petrography)

In [769]:
df = pd.read_excel('C:/Users/Ryan Bobby A/Documents/RWTH AACHEN/KULIAH/THESIS_BISMILLAH/Thesis Progress/Target Data/Petrography_P3.xlsx')

In [770]:
df.head()

Unnamed: 0,Petrographic ID,Petrographic parental ID,1,2,3,4,5,6,7,8,...,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Petrographic term.1,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34
0,,,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,...,,,,,,,,,,m magma
1,10101.0,,10101.0,,,,,,,,...,,,,,,,,,,
2,10102.0,,10102.0,,,,,,,,...,,,,,Consolidated rock,,,,,
3,10104.0,10102.0,10102.0,10104.0,,,,,,,...,American Geological Institute (1987),,,,Magmatic rock,,,,,
4,10105.0,10104.0,10102.0,10104.0,10105.0,,,,,,...,IUGS (1989),,,,Plutonic rock,,,,,


In [771]:
list_petro = df['Petrographic term.1'].values.tolist()
list_petro = [i for i in list_petro if type(i) == str ]

In [772]:
for i,j in enumerate(list_petro):
    if bool(re.search(r' rock',j)) == True:
        list_petro[i] = re.sub(r' rock','',j)
list_petro

['Consolidated',
 'Magmatic',
 'Plutonic',
 'Plutonic, modal, field description',
 'Very- Quartz rich plutonic',
 'Quartz- rich granitic plutonic',
 'Quartz- rich plutonic',
 'Granitic plutonic',
 'Quartz- bearing to Foid- bearing plutonic',
 'Syenitic plutonic',
 'Dioritic plutonic',
 'Gabbrotic plutonic',
 'Foid- rich plutonic',
 'Foid- syenitic plutonic',
 'Foid- dioritic plutonic',
 'Foid- gabbrodic plutonic',
 'Very- Foid- rich plutonic/ Foidolite',
 'Foidolitic plutonic',
 'Ultra- mafic plutonic (field classification)',
 'Plutonic, modal (QAPF)',
 'Quartzolite (QAPF)',
 'Granite (QAPF)',
 'Alkali- Feldspar- Granite',
 'Syenogranite',
 'Monzogranite',
 'Granodiorite (QAPF)',
 'Tonalite (QAPF)',
 'Quartz-Alkali-Feldspar-Syenite (QAPF)',
 'Quartz-Syenite (QAPF)',
 'Quartz-Monzonite (QAPF)',
 'Quartz-Monzodiorite (QAPF)',
 'Quartz-Monzogabbro (QAPF)',
 'Quartz-Diorite (QAPF)',
 'Quartz-Gabbro (QAPF)',
 'Quartz-Anorthosite (QAPF)',
 'Alkali-Feldspar-Syenite (QAPF)',
 'Syenite (QAPF)',

In [778]:
for i in list_petro:
    print(i)

Consolidated
Magmatic
Plutonic
Plutonic, modal, field description
Very- Quartz rich plutonic
Quartz- rich granitic plutonic
Quartz- rich plutonic
Granitic plutonic
Quartz- bearing to Foid- bearing plutonic
Syenitic plutonic
Dioritic plutonic
Gabbrotic plutonic
Foid- rich plutonic
Foid- syenitic plutonic
Foid- dioritic plutonic
Foid- gabbrodic plutonic
Very- Foid- rich plutonic/ Foidolite
Foidolitic plutonic
Ultra- mafic plutonic (field classification)
Plutonic, modal (QAPF)
Quartzolite (QAPF)
Granite (QAPF)
Alkali- Feldspar- Granite
Syenogranite
Monzogranite
Granodiorite (QAPF)
Tonalite (QAPF)
Quartz-Alkali-Feldspar-Syenite (QAPF)
Quartz-Syenite (QAPF)
Quartz-Monzonite (QAPF)
Quartz-Monzodiorite (QAPF)
Quartz-Monzogabbro (QAPF)
Quartz-Diorite (QAPF)
Quartz-Gabbro (QAPF)
Quartz-Anorthosite (QAPF)
Alkali-Feldspar-Syenite (QAPF)
Syenite (QAPF)
Monzonite (QAPF)
Monzodiorite (QAPF)
Monzogabbro (QAPF)
Diorite (QAPF)
Gabbro (QAPF)
Gabbronorite
Norite
Troctolite
Olivine- Gabbro
Olivine- Gabbro

ash
block
tuff
lapilli
tuff
ash
tuff
air
fall
ash
flow
surge
ash
fine
ash
coarse
ash
crystal
ash
lapilli
ash
bomb
ash
block
ash
lapilli
ash
lapilli
bomb
lapilli
block
lapilli
bomb
lapilli
bomb
ash
bomb
block
lapilli
block
ash
block
tuff
lapilli
tuff
ash
tuff
air
fall
ash
flow
surge
ash
fine
ash

ash
crystal
ash
lapilli
ash
bomb
ash
block
ash
lapilli
ash
lapilli
bomb
lapilli
block
lapilli
bomb
lapilli
bomb
ash
bomb
block
lapilli
block
ash
block
tuff
lapilli
tuff
ash
tuff
air
flow
ash
flow
surge
ash
fine
ash
coarse
ash
crystal
ash
lapilli
ash
bomb
ash
block
ash
lapilli
ash
lapilli
bomb
lapilli
block
lapilli
bomb
lapilli
bomb
ash
bomb
block
lapilli
block
ash
block
tuff
lapilli
tuff
ash
tuff
air
flow
ash
flow
surge
ash
fine
ash
coarse
ah
ash
lapilli
ash
bomb
ash
block
ash
lapilli
lapilli
ash
bomb
lapilli
block
lapilli
bomb
lapilli
bomb
ash
bomb
block
lapilli
block
ash
block
tuff
lapilli
tuff
ash
tuff
air
fall
ash
flow
surge
ash
fine
ash
coarse
ash
ash
lapilli
ash
bomb
ash
block
ash
lapilli

In [786]:
for i in list_petro:
    for j in i.split():
        if j.lower() not in list_petro:
            list_petro.append(re.sub(r'-','',j.lower()))
        else:
            pass

list_petro_det=[]

for i in list_petro:
    if re.sub(r'[/,"0-9)(]','',i.lower()) not in list_petro_det:
        list_petro_det.append(re.sub(r'[/,"0-9)(]','',i.lower()))
    else:
        pass
    
remove = ["bora",'ah', 'capped','after','educt','chemistry','fabric','content','pillow','black','ribbon','alteration','composite', 'dunham',
          'modal', 'field', 'description', 'very' ,'rich', 'bearing', 'to' ,'foid', 'eg.' ,'chemically' ,'tas', 'k' ,'medium', 'low' ,
          'or', 'slightly', 'fine', 'air', 'fall' ,'flow', ' ' ,'top' ,'bottom' ,'and' ,'of' ,'free', 'milky', 'sorted' ,'two' ,'&' ,
          'embry', 'klovan', 'supported' ,'fresh', 'water', 'mixed', 'humic', 'horizon', 'ortho', 'pitch', 'flaming' ,'gas' ,
          'cooking', 'steam' ,'lean' ,'semi', 'artificial', 'fen' ,'stronly', 'blocky', 'strong' ,'pure' ,'normal', 'detrital' ,
          'moistured', 'dry', 'layering', 'strongly','allothonous', 'folk','fossile','dull','bright','oil','non','decomposed','lowland','upland']
for i in remove:
    try:
        list_petro_det.remove(i)
    except:
        pass
    
for i,j in enumerate(list_petros):
    list_petro_det[i] = re.sub(r'(- |-|qapf|plutonic affinity|field classification|tas|modal|modal/normativ|chemically|epiclastic)',' ', j)

In [787]:
list_petro_det = list(filter(lambda a: a != ' ', list_petro_det))

In [788]:
list_petro_det

['consolidated',
 'magmatic',
 'plutonic',
 'plutonic   field description',
 'very quartz rich plutonic',
 'quartz rich granitic plutonic',
 'quartz rich plutonic',
 'granitic plutonic',
 'quartz bearing to foid bearing plutonic',
 'syenitic plutonic',
 'dioritic plutonic',
 'gabbrotic plutonic',
 'foid rich plutonic',
 'foid syenitic plutonic',
 'foid dioritic plutonic',
 'foid gabbrodic plutonic',
 'very foid rich plutonic foidolite',
 'foidolitic plutonic',
 'ultra mafic plutonic  ',
 'plutonic    ',
 'quartzolite  ',
 'granite  ',
 'alkali feldspar granite',
 'syenogranite',
 'monzogranite',
 'granodiorite  ',
 'tonalite  ',
 'quartz alkali feldspar syenite  ',
 'quartz syenite  ',
 'quartz monzonite  ',
 'quartz monzodiorite  ',
 'quartz monzogabbro  ',
 'quartz diorite  ',
 'quartz gabbro  ',
 'quartz anorthosite  ',
 'alkali feldspar syenite  ',
 'syenite  ',
 'monzonite  ',
 'monzodiorite  ',
 'monzogabbro  ',
 'diorite  ',
 'gabbro  ',
 'gabbronorite',
 'norite',
 'troctolite'

In [793]:
with open ('list_petrography.txt', 'w') as f:
    for item in list_petro_det:
        f.write(str(item) + '\n')

## P3 Data (Method)

In [557]:
df = pd.read_excel('C:/Users/Ryan Bobby A/Documents/RWTH AACHEN/KULIAH/THESIS_BISMILLAH/Thesis Progress/Target Data/Method_P3.xlsx')

In [558]:
df.head()

Unnamed: 0.1,Unnamed: 0,Reference,Unnamed: 2,Unnamed: 3,Sampling Point,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 277,Unnamed: 278,Unnamed: 279,Quality indice Measurement Parameter,Unnamed: 281,Unnamed: 282,Unnamed: 283,Unnamed: 284,Value Quality index,Class Quality index
0,Sample ID,Primary Reference (Bibtexkey),Secondary Reference,Date of Input,Location Type,Original Location Type,Location name,Country,State/Region,Name Shapefile,...,help indice for saturation,indice for measurement conditions,Indice Measurement conditions,Indice Measurement parameter,help indice values,N = Single (0) or Mean (1),help indice values 1 (SD),help indice values 2 (MinMax),Quality index values,Quality index


In [559]:
list_method = df.loc[0,:].values.tolist()
list_method

['Sample ID',
 'Primary Reference (Bibtexkey)',
 'Secondary Reference',
 'Date of Input',
 'Location Type',
 'Original Location Type',
 'Location name',
 'Country',
 'State/Region',
 'Name Shapefile',
 'Location longitude (decimaldegree.minutes)',
 'Location latitude (decimaldegree.minutes)',
 'Location elevation Z (m a.s.l.)',
 'Radius of Uncertainty (km)',
 'Sample type',
 'Original sample ID',
 'International Geo Sample Number (IGSN)',
 'Sample length (m)',
 'Sample height (m)',
 'Sample width (m)',
 'Sample diameter (for cores)\n(m)',
 ' Sample longitude (decimaldegrees.minutes)',
 ' Sample latitude (decimaldegrees.minutes)',
 ' Sample depth Z (MD) (m below location elevation)',
 'Sample depth Z (TVD) (m below location elevation)',
 'Rock Classification ID',
 'Rock Classification Parent ID',
 'Rock Classification ID 2nd Level',
 'Rock Classification ID Rank',
 'Petrographic term',
 'Petrography (in detail, original)',
 'Petrographic methods',
 'Sample texture',
 'Homogeneous / Inho

In [560]:
list_methods = []
for i in list_method:
    new_text = re.sub(r'[]\n[]', ' ', i)
    for j in new_text.split():
        if re.sub(r'[)(]', '', j).lower() not in list_methods:
            list_methods.append(re.sub(r'[)(]', '', j).lower())
        else:
            pass
list_methods = [i for i in list_methods if len(i) !=1]

In [570]:
for i in df.columns.values.tolist():
    if bool(re.search(r'Unnamed: \d+',i)) != True:
        list_methods.append(i)

In [797]:
list_methods_det = ['longitude',
 'decimaldegree.minutes',
 'latitude',
 'elevation',
 'a.s.l.',
 'radius',
 'uncertainty',
 'km',
 'igsn',
 'length',
 'height',
 'width',
 'diameter',
 'decimaldegrees.minutes',
 'depth',
 'md',
 'tvd',
 'petrography',
 'stratigraphy',
 'temperature',
 'pressure',
 'pa',
 'saturating',
 'saturation',
 'sigma1',
 'mpa',
 'sigma2',
 'sigma3',
 'pore',
 'strain',
 'rate',
 'kn/s',
 'mpa/s',
 'mm/s',
 '1/s',
 'value',
 'grain',
 'density',
 'kg/m³',
 'standard deviation',
 'min',
 'max',
 'porosity',
 'permeability',
 'm²',
 'thermal',
 'conductivity',
 'w/m·k',
 'inhomogenity',
 'specific heat capacity',
 'heat',
 'capacity',
 'j/kg·k',
 'volumetric',
 'j/m³·k',
 'diffusivity',
 'm²/s',
 'radiogenic',
 'w/m³',
 'p-wave',
 'velocity',
 'm/s',
 'frequency',
 'khz',
 's-wave',
 'dyn',
 'youngs modulus',
 'modulus',
 'gpa',
 'stat',
 'shear',
 'lame',
 'cohesion',
 'friction',
 'coefficient',
 'confining',
 'porefluid',
 'differential',
 'stress',
 'mean',
 'compactive',
 'yield',
 'critical',
 'onset',
 'inelastic',
 'hydrostatic',
 'compaction',
 'shear-enhanced',
 'enhanced',
 'poisson',
 'ratio',
 'dynamic',
 'static',
 'ucs',
 'tensile',
 'strength',
 's/m',
 'formation',
 'resistivity',
 'chronostratigraphy',
 'sampling point',
 'petrography',
 'stratigraphy',
 'grain density',
 'bulk density',
 'total porosity',
 'effective porosity',
 'apparent permeability',
 'intrinsic permeability',
 'bulk thermal conductivity',
 'matrix thermal conductivity',
 'specific heat capacity',
 'volumetric heat capacity',
 'thermal diffusivity',
 'radiogenic heat production',
 'p-wave velocity',
 's-wave velocity',
 'youngs modulus',
 'shear modulus',
 'bulk modulus',
 'lamé modulus',
 'cohesion',
 'friction coefficient',
 'triaxial experiments',
 'poisson ratio',
 'uniaxial compressive strength',
 'tensile strength',
 'electrical conductivity',
 'magnetic susceptibility',
 'quality indice geographic uncertatiny',
 'qualitiy indice petrography',
 'quality indice stratigraphy',
 'qualitiy indice measurement conditions',
 'quality indice measurement parameter',
 'value quality index',
 'class quality index'
]

In [798]:
with open ('list_methods.txt', 'w') as f:
    for item in list_methods_det:
        f.write(str(item) + '\n')

## P3 Data (Geolocation)

### Geolocation

In [740]:
df = pd.read_excel('C:/Users/Ryan Bobby A/Documents/RWTH AACHEN/KULIAH/THESIS_BISMILLAH/Thesis Progress/Target Data/Geolocation_P3.xlsx')

In [741]:
df.head()

Unnamed: 0,Location name
0,
1,
2,
3,
4,


In [742]:
temp_loc= df['Location name'].values.tolist()
temp_loc= [i.lower() for i in list_loc if type(i) == str ]

list_geoloc = []
for i in list_loc:
    if i in list_geoloc:
        pass
    else:
        list_geoloc.append(i)

list_geolocation = []
list_geolocation_det = []

#Conditioning
for i in list_geoloc:
    for j in i.split(sep='/'):
        if j.strip() not in list_geolocation:
            list_geolocation.append(j.strip())

#Conditioning
for i in list_geolocation:
    for j in re.split(', |,|/', i):
        if re.sub(r' \d+','',j.strip()) not in list_geolocation_det and len(re.search(r'\w+', j).group()) !=1:
            list_geolocation_det.append(re.sub('( \d+\w?| (ca. km))','',j.strip()))

#Conditioning over-spaced
for j,i in enumerate(list_geolocation_det):
    try:
        if len(re.search(' +',i).group()) > 1:
            list_geolocation_det[j] = ' '.join(i.split())
    except:
        pass
    
#Conditioning Core Well Data
for j, i in enumerate(list_geolocation_det):
    list_geolocation_det[j] = re.sub('\d+ m \w+´ ','',i)

In [743]:
list_geolocation_det

['kraka massif',
 'southern urals',
 'kuznetsk alatau',
 'berikul region',
 'klyuchovsky volcano',
 'kamchatka',
 'rockport',
 'belokurikhinsky massif',
 'altai',
 'val verde',
 'porterville',
 'frederick',
 'maryland',
 'westerly',
 'westfield',
 'peninsula station',
 'ont.',
 'vinal haven',
 'maine',
 'barre',
 'farm doornspruit',
 'sylmar',
 'french creek',
 'stillwater igneous complex',
 'new glasgow',
 'mellen',
 'pilansberg',
 'balsam gap',
 'allentown',
 'bethlehem',
 'nazareth',
 'pennsylvania',
 'pelham',
 'proctor',
 'solnhofen',
 'offshore louisiana',
 'texas',
 'kyushu',
 'mie',
 'aichi',
 'kyoto',
 'oklahoma',
 'fukushima',
 'colorado',
 'iwate',
 'utah',
 'wyoming',
 'hokkaido',
 'oregon',
 'twin sisters mountain',
 'madneuli',
 'shaori reservoir',
 'akhalkalaki highlands',
 'karatsu',
 'ichinomegata',
 'horoman',
 'chiatura',
 'central asia',
 'dzirula massif',
 'jraber',
 'upper rhine graben',
 'southern part of upper rhine graben',
 'upper rhine graben between black fo

In [745]:
list_geoloc_det = []
with open ('ryan_.txt', 'r') as f:
    for item in f:
        list_geoloc_det.append(f.read())

In [746]:
for i in list_geoloc_det:
    print(i)

southern urals
kuznetsk alatau
berikul region
klyuchovsky volcano
kamchatka
rockport
belokurikhinsky massif
altai
val verde
porterville
frederick
maryland
westerly
westfield
peninsula station
ont.
vinal haven
maine
barre
farm doornspruit
sylmar
french creek
stillwater igneous complex
new glasgow
mellen
pilansberg
balsam gap
allentown
bethlehem
nazareth
pennsylvania
pelham
proctor
solnhofen
offshore louisiana
texas
kyushu
mie
aichi
kyoto
oklahoma
fukushima
colorado
iwate
utah
wyoming
hokkaido
oregon
twin sisters mountain
madneuli
shaori reservoir
akhalkalaki highlands
karatsu
ichinomegata
horoman
chiatura
central asia
dzirula massif
jraber
upper rhine graben
black forest
vosges
upper rhine graben
koytas
kandygatay mts.
kalbinsky massif
shindinsky massif
southwest german molasse basin
south arne
north sea
near freiburg
black forest
near offenburg
tyger-tysh ridge
ulen-tuimsky massif
uybatsky massif
karymsky volcano
kamtchatka
mid-atlantic ocean ridge
hawaii
cerro prieto
mexicali
ohio
ken

In [799]:
with open ('list_geolocation.txt', 'w') as f:
    for item in list_geoloc_det:
        f.write(str(item) + '\n')

### Countries

In [767]:
list_countries_det = ['Afghanistan', 'Aland Islands', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia, Plurinational State of', 'Bonaire, Sint Eustatius and Saba', 'Bosnia and Herzegovina', 'Botswana', 'Bouvet Island', 'Brazil', 'British Indian Ocean Territory', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island', 'Cocos (Keeling) Islands', 'Colombia', 'Comoros', 'Congo', 'Congo, The Democratic Republic of the', 'Cook Islands', 'Costa Rica', "Côte d'Ivoire", 'Croatia', 'Cuba', 'Curaçao', 'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Falkland Islands (Malvinas)', 'Faroe Islands', 'Fiji', 'Finland', 'France', 'French Guiana', 'French Polynesia', 'French Southern Territories', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Gibraltar', 'Greece', 'Greenland', 'Grenada', 'Guadeloupe', 'Guam', 'Guatemala', 'Guernsey', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Heard Island and McDonald Islands', 'Holy See (Vatican City State)', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran, Islamic Republic of', 'Iraq', 'Ireland', 'Isle of Man', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jersey', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', "Korea, Democratic People's Republic of", 'Korea, Republic of', 'Kuwait', 'Kyrgyzstan', "Lao People's Democratic Republic", 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macao', 'Macedonia, Republic of', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Martinique', 'Mauritania', 'Mauritius', 'Mayotte', 'Mexico', 'Micronesia, Federated States of', 'Moldova, Republic of', 'Monaco', 'Mongolia', 'Montenegro', 'Montserrat', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Caledonia', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Niue', 'Norfolk Island', 'Northern Mariana Islands', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Palestinian Territory, Occupied', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Pitcairn', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Réunion', 'Romania', 'Russian Federation', 'Rwanda', 'Saint Barthélemy', 'Saint Helena, Ascension and Tristan da Cunha', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Martin (French part)', 'Saint Pierre and Miquelon', 'Saint Vincent and the Grenadines', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore', 'Sint Maarten (Dutch part)', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Georgia and the South Sandwich Islands', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'South Sudan', 'Svalbard and Jan Mayen', 'Swaziland', 'Sweden', 'Switzerland', 'Syrian Arab Republic', 'Taiwan, Province of China', 'Tajikistan', 'Tanzania, United Republic of', 'Thailand', 'Timor-Leste', 'Togo', 'Tokelau', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Turks and Caicos Islands', 'Tuvalu', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'United States Minor Outlying Islands', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela, Bolivarian Republic of', 'Viet Nam', 'Virgin Islands, British', 'Virgin Islands, U.S.', 'Wallis and Futuna', 'Yemen', 'Zambia', 'Zimbabwe']

In [800]:
with open ('list_countries.txt', 'w') as f:
    for item in list_countries_det:
        f.write(str(item) + '\n')

### State

In [748]:
df = pd.read_excel('C:/Users/Ryan Bobby A/Documents/RWTH AACHEN/KULIAH/THESIS_BISMILLAH/Thesis Progress/Target Data/State_P3.xlsx')

In [765]:
list_state = []
for i in df.values.tolist():
    if i[0] not in list_state:
        list_state.append(i[0])
    else:
        pass

list_state = list_state[0:list_state.index('BR')]
list_state = [str(i).lower() for i in list_state]

In [766]:
list_state_det = ['massachusetts',
 'california',
 'rhode island',
 'vermont',
 'transvaal',
 'pennsylvania',
 'montana',
 'quebec',
 'wisconsin',
 'north carolina',
 'bavaria',
 'louisiana',
 'texas',
 'oklahoma',
 'colorado',
 'utah',
 'wyoming',
 'oregon',
 'washington',
 'kyushu',
 'akita',
 'hokkaido',
 'atlantic ocean',
 'hawaii',
 'java',
 'brandenburg',
 'schleswig-holstein',
 'western namibia',
 'southeastern australia',
 'western south africa',
 'bahia',
 'michigan',
 'new hampshire',
 'iberian peninsula',
 'queensland',
 'ontario',
 'corsica',
 'north taiwan',
 'central',
 'ávila',
 'fars',
 'inyo county',
 'chhattisgarh',
 'scotland',
 'northern territory',
 'central zimbabwe',
 'idaho',
 'singida',
 'southern india',
 'otjozondjupa',
 'central zambia',
 'khémisset',
 'dawson strait',
 'baden-württemberg',
 'manitoba',
 'karnataka',
 'jalisco',
 'east greenland',
 'sierra nevada',
 'bretagne',
 'southern france',
 'central northern nepal',
 'karas',
 'jharkand',
 'alaska',
 'southern finland',
 'eastern south africa',
 'scottland',
 'andhra pradesh',
 'pyrenees',
 'western australia',
 'central south africa',
 'bodrum',
 'parinacota',
 'antofagasta',
 'northeastern spain',
 'moresby strait',
 'central eastern china',
 'ivrea-verbano',
 'salta',
 'virunga',
 'eastern australia',
 'tarapacá',
 'khomas',
 'vianden',
 'western russia',
 'galicia',
 'central nepal',
 'australia',
 'bismarck archipelago',
 'manyara',
 'québec',
 'erongo',
 'tamil nadu',
 'eastern canada',
 'arusha',
 'sud',
 'iguape',
 'rhineland-palatinate',
 'north west',
 'jiangsu',
 'dodoma',
 'heard island',
 'mcdonald islands',
 'labrador',
 'iowa',
 'banks peninsula, christchurch',
 'nova scotia',
 'garratoxa',
 'kansas',
 'tristan da cunha',
 'gough',
 'northern tanzania',
 'chile & argentina',
 'vogelsaug',
 'mashonaland',
 'apennines',
 'kastamonu',
 'piemonte',
 'mutasa',
 'oberbergau',
 'ascension',
 'masvingo',
 'southern brazil',
 'northwest territories',
 'western iran',
 'st. helena',
 'kilimanjaro region',
 'kunene',
 'bouvet',
 'viti levu, tavua',
 'kamchatka',
 "sana'a and amran",
 'victoria',
 'bougainville',
 'norther pacific',
 'new hebrides',
 'kgalagadi',
 'niuatoputapu',
 'north-east',
 'western cordillera',
 'montechristi',
 'new britain',
 'santa isabel',
 'malaita',
 'gorgona island',
 'gansu province',
 'michoacán ',
 'puebla',
 'tushka',
 'campanian',
 'pozzuoli/ campanian',
 'roman',
 'red sea area',
 'sinai',
 'western desert',
 'akranes',
 'hvalfjarðarsveit',
 'borgarbyggð',
 'grundarfjörður',
 'grundarfjarðarbær',
 'gullbringusýsla',
 'west tushka',
 'hornafjörður',
 'chelungpu fault',
 'hvalfjarðarstrandarhreppur',
 'hveragerði',
 'antalya',
 'icel',
 'mugla',
 'konya',
 'burgur',
 'nigde',
 'diayarbakir',
 'bursa',
 'aksaray',
 'kayseri',
 'ayfonkarahisar',
 'afyonkarahisar',
 'kutahya',
 'izmir',
 'izmir or balikesir',
 'kirsehir',
 'sivas',
 'marmara island',
 'provinz pontevedra',
 'mosfellsbær',
 'snæfellsbær',
 'reykjavík',
 'skorradalshreppur',
 'skútustaðahreppur',
 'kerala',
 'vesturland',
 'nevada',
 'north german basin',
 'alberta',
 'hessen',
 'tirol',
 'ile de france',
 'campanian area',
 'rome',
 'sicily',
 'messina',
 'rhode island',
 'niedersachsen',
 'east midlands',
 'south iceland',
 'lahendong',
 'washington state',
 'british columbia',
 'suðurnes',
 'kyūshū',
 'waikato',
 'colima',
 'jalisco ',
 'giresun',
 'gümüshane',
 'lower saxony'
 'niedersachsen',
 'baden würtemberg',
 'rheinland pfalz',
 'bengal basin']

In [803]:
with open ('list_states.txt', 'w') as f:
    for item in list_state_det:
        try:
            f.write(str(item) + '\n')
        except:
            pass

## PDF Data Reader

In [2]:
def get_pdf_file_content_Text(path_to_pdf):
    
    '''
    path_to_pdf: is the parameter that will give access to the PDF File 
    we want to extract the content.
    '''
    
    '''
    PDFResourceManager is used to store shared resources such as fonts or images that 
    we might encounter in the files. 
    '''
    
    resource_manager = PDFResourceManager(caching=True)
    
    '''
    create a string object that will contain the final text the representation of the pdf. 
    '''
    out_text = StringIO()
    
    '''
    UTF-8 is one of the most commonly used encodings, and Python often defaults to using it.
    In our case, we are going to specify in order to avoid some encoding errors.
    '''
    codec = 'utf-8'
    
    """
    LAParams is the object containing the Layout parameters with a certain default value. 
    """
    laParams = LAParams()
    
    '''
    Create a TextConverter Object, taking :
    - ressource_manager,
    - out_text 
    - layout parameters.
    '''
    text_converter = TextConverter(resource_manager, out_text, laparams=laParams)
    fp = open(path_to_pdf, 'rb')
    
    '''
    Create a PDF interpreter object taking: 
    - ressource_manager 
    - text_converter
    '''
    interpreter = PDFPageInterpreter(resource_manager, text_converter)

    '''
    We are going to process the content of each page of the original PDF File
    '''
    for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password="", caching=True, check_extractable=True):
        interpreter.process_page(page)

    '''
    Retrieve the entire contents of the “file” at any time 
    before the StringIO object’s close() method is called.
    '''
    text = out_text.getvalue()

    '''
    Closing all the ressources we previously opened
    '''
    fp.close()
    text_converter.close()
    out_text.close()
    
    '''
    Return the final variable containing all the text of the PDF
    '''
    return text

In [3]:
pdf_path = 'C:/Users/Ryan Bobby A/Documents/RWTH AACHEN/KULIAH/THESIS_BISMILLAH/Thesis Progress/PDF Data/Brehme2016_Article_PermeabilityDistributionInTheL.pdf'

In [4]:
text = get_pdf_file_content_Text(pdf_path)
print(text)

Environ Earth Sci (2016) 75:1088
DOI 10.1007/s12665-016-5878-9

O R I G I N A L A R T I C L E

Permeability distribution in the Lahendong geothermal ﬁeld:
A blind fault captured by thermal–hydraulic simulation

Maren Brehme1
Martin Sauter3

• Guido Blo¨cher1
• Gu¨ nter Zimmermann1

• Mauro Cacace1

• Yustin Kamah2

•

Received: 24 November 2015 / Accepted: 4 July 2016 / Published online: 19 July 2016
Ó Springer-Verlag Berlin Heidelberg 2016

Abstract Subsurface ﬂuid ﬂow of reservoirs in active
tectonic regions is mainly controlled by permeability of
fault zones. Therefore, the characterization of fault zones is
an important step toward performance assessment of a
reservoir. The ﬂuid ﬂow is controlled also by pressure and
temperature conditions.
In this context, we simulated
pressure and temperature ﬁelds to elaborate on the inﬂu-
ence of permeability on subsurface ﬂuid ﬂow in the
Lahendong geothermal reservoir. Thermal–hydraulic sim-
ulation is performed using a ﬁnite element approach.

In [5]:
text

'Environ Earth Sci (2016) 75:1088\nDOI 10.1007/s12665-016-5878-9\n\nO R I G I N A L A R T I C L E\n\nPermeability distribution in the Lahendong geothermal ﬁeld:\nA blind fault captured by thermal–hydraulic simulation\n\nMaren Brehme1\nMartin Sauter3\n\n• Guido Blo¨cher1\n• Gu¨ nter Zimmermann1\n\n• Mauro Cacace1\n\n• Yustin Kamah2\n\n•\n\nReceived: 24 November 2015 / Accepted: 4 July 2016 / Published online: 19 July 2016\nÓ Springer-Verlag Berlin Heidelberg 2016\n\nAbstract Subsurface ﬂuid ﬂow of reservoirs in active\ntectonic regions is mainly controlled by permeability of\nfault zones. Therefore, the characterization of fault zones is\nan important step toward performance assessment of a\nreservoir. The ﬂuid ﬂow is controlled also by pressure and\ntemperature conditions.\nIn this context, we simulated\npressure and temperature ﬁelds to elaborate on the inﬂu-\nence of permeability on subsurface ﬂuid ﬂow in the\nLahendong geothermal reservoir. Thermal–hydraulic sim-\nulation is perform

## Data Preprocessing (Conditioning, Stop Words, Stemmer)

### Regex Conditioning & NLTK StopWords

In [6]:
#Remove '\n' & '•'
new_text = re.sub(r'[\n•]', ' ', text)
new_text

'Environ Earth Sci (2016) 75:1088 DOI 10.1007/s12665-016-5878-9  O R I G I N A L A R T I C L E  Permeability distribution in the Lahendong geothermal ﬁeld: A blind fault captured by thermal–hydraulic simulation  Maren Brehme1 Martin Sauter3    Guido Blo¨cher1   Gu¨ nter Zimmermann1    Mauro Cacace1    Yustin Kamah2     Received: 24 November 2015 / Accepted: 4 July 2016 / Published online: 19 July 2016 Ó Springer-Verlag Berlin Heidelberg 2016  Abstract Subsurface ﬂuid ﬂow of reservoirs in active tectonic regions is mainly controlled by permeability of fault zones. Therefore, the characterization of fault zones is an important step toward performance assessment of a reservoir. The ﬂuid ﬂow is controlled also by pressure and temperature conditions. In this context, we simulated pressure and temperature ﬁelds to elaborate on the inﬂu- ence of permeability on subsurface ﬂuid ﬂow in the Lahendong geothermal reservoir. Thermal–hydraulic sim- ulation is performed using a ﬁnite element approach

In [7]:
#Remove '- ' and '  '
new_text = re.sub(r'- ', '', new_text)
new_text = re.sub(r'  ', ' ', new_text)
new_text

'Environ Earth Sci (2016) 75:1088 DOI 10.1007/s12665-016-5878-9 O R I G I N A L A R T I C L E Permeability distribution in the Lahendong geothermal ﬁeld: A blind fault captured by thermal–hydraulic simulation Maren Brehme1 Martin Sauter3  Guido Blo¨cher1  Gu¨ nter Zimmermann1  Mauro Cacace1  Yustin Kamah2   Received: 24 November 2015 / Accepted: 4 July 2016 / Published online: 19 July 2016 Ó Springer-Verlag Berlin Heidelberg 2016 Abstract Subsurface ﬂuid ﬂow of reservoirs in active tectonic regions is mainly controlled by permeability of fault zones. Therefore, the characterization of fault zones is an important step toward performance assessment of a reservoir. The ﬂuid ﬂow is controlled also by pressure and temperature conditions. In this context, we simulated pressure and temperature ﬁelds to elaborate on the inﬂuence of permeability on subsurface ﬂuid ﬂow in the Lahendong geothermal reservoir. Thermal–hydraulic simulation is performed using a ﬁnite element approach. Adjusting the p

In [8]:
#Apply the Stopwords from NLTK
nltk.download('stopwords')
pattern = r",*(\s*\b(?:{}))\b".format("|".join(stopwords.words('english')))
text_stw = re.sub(pattern,'', new_text)
text_stw

[nltk_data] Downloading package stopwords to C:\Users\Ryan Bobby
[nltk_data]     A\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'Environ Earth Sci (2016) 75:1088 DOI 10.1007/s12665-016-5878-9 O R I G I N A L A R T I C L E Permeability distribution Lahendong geothermal ﬁeld: A blind fault captured thermal–hydraulic simulation Maren Brehme1 Martin Sauter3  Guido Blo¨cher1  Gu¨ nter Zimmermann1  Mauro Cacace1  Yustin Kamah2   Received: 24 November 2015 / Accepted: 4 July 2016 / Published online: 19 July 2016 Ó Springer-Verlag Berlin Heidelberg 2016 Abstract Subsurface ﬂuid ﬂow reservoirs active tectonic regions mainly controlled permeability fault zones. Therefore characterization fault zones important step toward performance assessment reservoir. The ﬂuid ﬂow controlled also pressure temperature conditions. In context simulated pressure temperature ﬁelds elaborate inﬂuence permeability subsurface ﬂuid ﬂow Lahendong geothermal reservoir. Thermal–hydraulic simulation performed using ﬁnite element approach. Adjusting permeability 370 different cases, modeling results converged observed data within misﬁt range 0–7 %.

In [93]:
#Check Chrono
pattern = r",*(\s*\b(?:{}))\b".format("|".join(map(str,list_chrono)))
grab = re.findall(pattern, new_text)
for i in range(len(grab)):
    grab[i] = re.sub(' ','',grab[i])
grab

['Pleistocene']

### SPACY Conditioning (Parsing)

In [109]:
#Splitted into sentences by Spacy
spc_text = spc(text_stw)

#Apply Lemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('goes')

ready = []
for j in spc_text.sents:
    #Conditioning for Splitting & Apply Lemmatizer
    text_1 = ' '.join([lemmatizer.lemmatize(i.text) for i in spc(j.text.lower()) if len(i)!=1 or i.is_punct or i.is_digit])
    if len(text_1) <=1 or bool(re.search('[A-Za-z]', text_1)) == False:
        pass
    else:
        ready.append(text_1)
    
    #No Acknowledgements
    if bool(re.search('(acknowledgments)', text_1)) == True:
        break
    else:
        continue
    
    #No References
    if bool(re.search('(references|reference)', text_1)) == True:
        break
    else:
        continue
        
#Ready for NLP NER
ready_nlp = {'Text':[],'Tag':[],'Entity':[]}
for i in ready:
    for j in grab:
        if bool(re.search(j.lower(), i)) == True:
            ready_nlp['Text'].append(i)
            ready_nlp['Tag'].append(j.lower())
            ready_nlp['Entity'].append('Chrono')

In [113]:
ready

['environ earth sci ( 2016 ) 75:1088 doi 10.1007 / s12665 - 016-',
 'permeability distribution lahendong geothermal ﬁeld : blind fault captured thermal – hydraulic simulation maren brehme1 martin sauter3 guido blo¨cher1 gu¨ nter zimmermann1 mauro cacace1 yustin kamah2    received : 24 november 2015 / accepted : 4 july 2016 / published online : 19 july 2016 springer - verlag berlin heidelberg 2016',
 'abstract subsurface ﬂuid ﬂow reservoirs active tectonic regions mainly controlled permeability fault zones .',
 'therefore characterization fault zones important step toward performance assessment reservoir .',
 'the ﬂuid ﬂow controlled also pressure temperature conditions .',
 'in context simulated pressure temperature ﬁelds elaborate inﬂuence permeability subsurface ﬂuid ﬂow lahendong geothermal reservoir .',
 'thermal – hydraulic simulation performed using ﬁnite element approach .',
 'adjusting permeability 370 different cases , modeling results converged observed data within misﬁt rang

In [110]:
ready_nlp

{'Text': ['lithology lahendong area mainly characterized pre- , postand tondano formation plioto pleistocene age ( koestono et al . 2010 ) .'],
 'Tag': ['pleistocene'],
 'Entity': ['Chrono']}

## One Hot Encoding

In [None]:
voc_size = 10000
onehot_repr = [one_hot(words, voc_size) for words in text_stw_sent]
onehot_repr

## Add Pading

In [None]:
sent_length = 20
embedded_docs = pad_sequences(onehot_repr, padding = 'pre', maxlen=sent_length)
print(embedded_docs)

## OHE to Vector + Model

In [None]:
embedding_vector_features=40
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length = sent_length))
model.add(LSTM(100)) #LSTM with 100 neurons
model.add(Dense(32, activation = 'relu')) #Our Output is is Classesy. SO it's better relu
model.compile(optimizer='adam', metrics=['accuracy'])
print(model.summary())