# Required Modules

In [13]:
from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json
    
## Load configuration
# A context manager guarantees the file handle is released even when
# json.load raises on malformed JSON.
with open("config.json") as con_file:
    config = json.load(con_file)

## Initialize client
# Credentials are read from config.json rather than hard-coded in the notebook.
client = ElsClient(config['apikey'])
client.inst_token = config['insttoken']

import re
import regex

# NLP processing with spaCy
import spacy
# import spacy_transformers
# spc = spacy.load('en_core_web_trf')
# Load the large English pipeline; `spc` is the module-level object that
# preprocessing_v2 calls to split cleaned text into sentences.
spc = spacy.load('en_core_web_lg')
# config = {"punct_chars": [".", "?"]}
# Append the rule-based sentencizer to the loaded pipeline.
# NOTE(review): the component is added after the pipeline's existing
# components; presumably it should drive sentence boundaries - confirm that it
# actually takes effect alongside whatever the loaded model already provides.
spc.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x20e7e307c40>

# Define the Required Functions

In [2]:
def start_end(b):
    """Return the main body of an article, trimming front and back matter.

    The start of the body is, in order of preference:
      1. the end of the LAST occurrence of "1 introduction",
      2. the end of the FIRST occurrence of "keywords",
      3. the beginning of the text.

    The end of the body is the start of the LAST occurrence of the first
    marker found among, in priority order: "credit authorship contribution
    statement", "declaration of competing interest", "acknowledgments",
    "references". If none is present the text runs to its end.

    Matching is case-insensitive and is performed directly on ``b``.
    Searching a lower-cased copy and slicing the original is unsafe because
    ``str.lower()`` can change the string length (for example U+0130),
    which shifts every offset after such a character.

    Parameters
    ----------
    b : str
        Full text of the article.

    Returns
    -------
    str
        The slice of ``b`` between the detected start and end markers
        (empty if the start marker lies after the end marker).
    """
    # Start of the body: last "1 introduction", else first "keywords", else 0.
    intro_ends = [m.end() for m in re.finditer(r'1 introduction', b, flags=re.IGNORECASE)]
    if intro_ends:
        awal_ind = intro_ends[-1]
    else:
        keyword = re.search(r'keywords', b, flags=re.IGNORECASE)
        awal_ind = keyword.end() if keyword else 0

    # End of the body: first marker (by priority) that occurs at all; its last
    # occurrence is used. None means "slice to the end of the text".
    end_markers = (
        r'credit authorship contribution statement',
        r'declaration of competing interest',
        r'acknowledgments',
        r'references',
    )
    akhir_ind = None
    for marker in end_markers:
        starts = [m.start() for m in re.finditer(marker, b, flags=re.IGNORECASE)]
        if starts:
            akhir_ind = starts[-1]
            break

    return b[awal_ind:akhir_ind]

def preprocessing_v2(text_pdfmine):
    """Clean raw article text and split it into a list of sentences.

    The text is normalised (hyphenation, newlines/bullets, umlaut remnants,
    abbreviations, e-mails, URLs, repeated whitespace), split into sentences
    with the module-level spaCy pipeline ``spc``, stripped of parenthesised
    and bracketed spans, and finally filtered so that only sentences with
    more than four space-separated tokens remain.

    Parameters
    ----------
    text_pdfmine : str
        Raw text of one document.

    Returns
    -------
    list of str
        Cleaned sentences, in document order.
    """
    # Drop a hyphen followed by an optional newline and one whitespace
    # character (re-joins words broken across lines).
    layers = re.sub(r'(-\n?\s)','',text_pdfmine)

    # Newlines and bullets: if at least one of them separates two runs of word
    # characters, all of them become spaces; otherwise they are simply removed.
    if regex.search(r'(?<=(\w\w))[\n•](?=(\w\w))', layers):
        layers = re.sub(r'[\n•]', ' ', layers)
    else:
        layers = re.sub(r'[\n•]', '', layers)

    # German spelling: a detached diaeresis after o/u/a becomes "e"; eszett -> "ss".
    layers = re.sub(r'(?<=[oua])(¨)','e',layers)
    layers = re.sub(r'ß','ss',layers)

    # Replace the abbreviation "et al." so its period is not taken as a
    # sentence end. The word boundary and escaped dot keep ordinary words
    # such as "yet also" or "set all" from being rewritten.
    layers = regex.sub(r'\bet al\.', 'with colleagues', layers)

    # Expand "Fig"/"Figs" (with any trailing dots) to "Figure". The boundary
    # sits before the dots so the abbreviation's period is consumed as well.
    layers = regex.sub(r'\b[Ff]igs*\b\.*', 'Figure', layers)
    # Remove the number (and trailing dots) that follows "Table"/"Tables".
    # The boundary and the letter look-ahead stop the pattern from firing
    # inside words such as "vegetables" or between "Table" and its plural "s".
    layers = regex.sub(r'(?<=\b[Tt]ables*)(?![A-Za-z])[\s+]*[\d+]*\.*', ' ', layers)

    # Remove e-mail addresses (the dot before the domain suffix is literal).
    layers = regex.sub(r'\b[\w.-]+?@\w+?\.\w+[\w\.\-]+\b', ' ', layers)

    # Remove URLs.
    layers = regex.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w\s_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',' ',layers)

    # Collapse runs of whitespace down to their final character.
    layers = regex.sub(r'(\s)(?=\s+)','',layers)

    # Geology abbreviation: "Fm"/"Fm." -> "Formation". Only the abbreviation
    # and its optional dot are consumed, so the following character is kept.
    layers = regex.sub(r'\b[Ff]m\b\.?', 'Formation', layers)

    # Sentence segmentation with spaCy.
    spc_text = spc(layers)

    # A sentence starting with a comma is a continuation of the previous one;
    # if there is no previous sentence it is kept as its own entry instead of
    # raising IndexError.
    prepro2 = []
    for sent in spc_text.sents:
        if prepro2 and sent.text.startswith(','):
            prepro2[-1] = prepro2[-1] + sent.text
        else:
            prepro2.append(sent.text)

    # Parenthesised spans: "( ... )".
    pattern1 = regex.compile(r'\([^\)]+\)')

    # Bracketed spans: "[ ... ]".
    pattern2 = regex.compile(r'\[[^\]]+\]')

    # Leading / trailing whitespace.
    pattern3 = regex.compile(r'((^\s+)(?=.)|(?<=.)(\s+$))')

    # Repeated whitespace.
    pattern4 = regex.compile(r'(\s)(?=\s+)')

    for idx, sentence in enumerate(prepro2):
        layer = pattern1.sub(' ', sentence)
        layer = pattern2.sub(' ', layer)
        layer = pattern3.sub('', layer)
        prepro2[idx] = pattern4.sub('', layer)

    # Keep only sentences with more than four space-separated tokens.
    prepro2 = [sentence for sentence in prepro2 if len(sentence.split(' ')) > 4]

    return prepro2

# DOI Scraping

In [4]:
focus_geology = ['tectonic','petrology','remote sensing','structural', 'palaeontology','sedimentology','geomorphology','stratigraphy','geochronology','geochemistry']

# Text appended to every topic to form the search query, and the maximum
# number of DOIs kept per topic.
SEARCH_SUFFIX = " geology energy"
MAX_DOI_PER_TOPIC = 250

## Initialize doc search object using ScienceDirect and execute search,
#   retrieving all results
list_result = {}
for topic in focus_geology:
    query = topic + SEARCH_SUFFIX
    doc_srch = ElsSearch(query, 'sciencedirect')
    doc_srch.execute(client, get_all = True)
    print (f"doc_srch for {query} has:", len(doc_srch.results), "results.")
    list_result[query] = doc_srch.results

# Collect up to MAX_DOI_PER_TOPIC DOIs per topic. Slicing tolerates queries
# that return fewer results, and entries without a 'prism:doi' field are
# skipped instead of raising KeyError.
list_doi = []
for topic in focus_geology:
    for entry in list_result[topic + SEARCH_SUFFIX][:MAX_DOI_PER_TOPIC]:
        doi = entry.get('prism:doi')
        if doi:
            list_doi.append(doi)

doc_srch for tectonic geology energy has: 500 results.
doc_srch for petrology geology energy has: 500 results.
doc_srch for remote sensing geology energy has: 500 results.
doc_srch for structural geology energy has: 500 results.
doc_srch for palaeontology geology energy has: 500 results.
doc_srch for sedimentology geology energy has: 500 results.
doc_srch for geomorphology geology energy has: 500 results.
doc_srch for stratigraphy geology energy has: 500 results.
doc_srch for geochronology geology energy has: 500 results.
doc_srch for geochemistry geology energy has: 500 results.


# Text Scraping

In [5]:
## ScienceDirect (full-text) document example using DOI
# `success` holds the DOIs whose body text was written to the output file,
# in the same order as the records in that file; `failed` holds every DOI
# that could not be read or processed.
success = []
failed = []
with open('extracted_elsevier_2.txt', 'w', encoding='utf-8') as f:
    for i in list_doi:
        doi_doc = FullDoc(doi = i)
        try:
            if doi_doc.read(client):
                a = start_end(doi_doc.data["originalText"])
                print('yes')
                # One record = body text followed by the END/NEW separator.
                f.write(a + '\n----END----\n----NEW----\n')
                success.append(i)
                # NOTE(review): presumably persists the raw document via
                # elsapy - confirm where it is written.
                doi_doc.write()
            else:
                print ("Read document failed.")
                # Record the DOI so it is not silently lost.
                failed.append(i)
        except Exception:
            # Catch Exception rather than using a bare except, so that
            # Ctrl-C / kernel interrupt can still stop this long loop.
            failed.append(i)
            print('no')
    # No explicit close needed: the `with` block closes the file.

no
no
no
yes
no
yes
no
no
no
no
no
no
yes
yes
no
no
no
no
no
no
no
no
no
yes
yes
yes
no
no
no
yes
no
no
yes
no
no
yes
no
no
yes
no
no
no
no
no
no
no
no
yes
no
no
yes
yes
no
no
yes
no
yes
no
no
no
no
no
no
no
no
yes
no
no
no
no
no
no
no
no
yes
no
no
yes
no
no
no
no
yes
no
no
no
no
no
yes
yes
no
no
no
no
no
yes
no
no
no
no
yes
no
yes
no
no
no
no
no
no
no
no
no
no
yes
no
no
no
no
yes
no
no
no
yes
no
yes
yes
yes
yes
yes
no
yes
no
no
no
yes
no
no
no
no
no
no
no
no
no
no
no
yes
yes
no
yes
yes
yes
no
no
no
no
no
yes
no
no
no
no
no
no
no
no
yes
yes
no
no
no
no
yes
yes
no
no
no
no
no
no
no
no
no
no
no
no
no
no
yes
no
yes
no
no
no
no
yes
no
no
no
yes
yes
no
no
no
no
no
no
no
yes
yes
no
no
no
no
no
no
no
no
no
no
yes
no
no
yes
no
no
no
no
no
yes
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
yes
no
no
no
yes
yes
no
no
no
yes
no
no
no
no
no
no
no
no
no
yes
no
no
no
no
no
yes
no
yes
yes
no
yes
no
no
no
no
no
no
no
no
no
no
no
no
yes
no
no
yes
no
yes
no
no
no
no
no
yes
no
no

no
yes
no
no
no
no
no
no
no
no
no
yes
no
no
no
no
yes
no
no
no
no
no
no
no
no


# Save the Successful DOIs and Preprocess the Text

In [6]:
# Write one successfully scraped DOI per line; the `with` block closes the
# file, so no explicit close() is needed.
with open('elsevier_success_2.txt', 'w', encoding='utf-8') as f:
    f.writelines(doi + '\n' for doi in success)

In [11]:
# Number of DOIs whose text was written to the extracted-text file.
len(success)

771

In [12]:
# Number of DOIs recorded in `failed` by the scraping loop.
len(failed)

1729

In [None]:
# Separator written after every document in the extracted-text file.
DOC_SEPARATOR = '\n----END----\n----NEW----\n'

with open('extracted_elsevier_2.txt', 'r', encoding='utf-8') as f:
    lines = f.read()
c = lines.split(DOC_SEPARATOR)

# The file ends with the separator, so split() yields a trailing empty chunk
# that is not a document. Only that last chunk is dropped: empty documents in
# the middle are kept so records stay aligned with the list of scraped DOIs.
if c and c[-1] == '':
    c.pop()

# NOTE(review): `cc` is not used in this cell; kept in case a later cell reads it.
cc = []
with open('text_elsevier_ready_2.txt', 'w', encoding='utf-8') as f:
    for document in c:
        # One cleaned sentence per line, then the END/NEW separator lines.
        for sentence in preprocessing_v2(document):
            f.write(sentence + '\n')
        f.write('----END----\n----NEW----\n')