In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import urllib
import pandas as pd
import re
import datetime
import numpy as np
import pickle
import requests
from time import sleep
import pandas as pd

## Changelog
 + 1.0.1 -- Added journal eISSN and categories

In [2]:
# Version tag of this notebook (see the changelog above); it is interpolated
# into the name of every output CSV file in the main loop.
version = "1.0.1"

# Get the article list

In [3]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import ui
from selenium.webdriver.support.ui import WebDriverWait, Select

from selenium.webdriver.firefox.options import Options
# Start a Firefox session without a visible window.
# The "-headless" command-line flag is used instead of the `options.headless`
# attribute: newer Selenium 4 releases deprecated and later removed that
# attribute, after which assigning to it silently has no effect.
options = Options()
options.add_argument("-headless")
# Module-level driver shared by the scraping functions below.
driver = webdriver.Firefox(options=options)

In [4]:
def gedjournalarticles(journal, filename):
    """Collect the article URLs of one journal that are not yet stored in ``filename``.

    The listing page at ``journal['address']`` is opened with the module-level
    Selenium ``driver``. Every link found inside an ``<h5>`` element is recorded,
    then the pagination link whose text contains "Next" is clicked; this repeats
    until no such link is found.

    Parameters
    ----------
    journal : dict
        Must provide the keys 'address', 'title', 'eissn' and 'category'.
    filename : str
        Path of a "|"-separated CSV written by a previous run. If it cannot be
        read, every article found is treated as new.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df`` holds one row per new article URL (sorted) with the columns
        'url', 'journal_title', 'journal_eissn' and 'category'; ``olddf`` is the
        previously stored data (or an empty placeholder frame).
    """
    # Local imports keep this cell self-contained.
    from urllib.parse import urljoin
    from selenium.webdriver.common.by import By

    base_url = "https://www.cambridge.org"

    # Load the data of the previous run, if any.
    # NOTE(review): the placeholder carries a 'journal' column while new rows use
    # 'journal_title'; kept unchanged so the returned frame keeps its shape --
    # confirm whether the extra column is still wanted.
    olddf = pd.DataFrame({'url': [], 'journal': [] })
    try:
        olddf = pd.read_csv(filename, sep="|")
        print(olddf.tail())
    except FileNotFoundError:
        # First run for this journal: nothing stored yet.
        pass
    except Exception as err:
        # Stay best-effort, but do not hide the problem: the caller will later
        # overwrite this file with the freshly scraped rows only.
        print("Could not read", filename, "(" + str(err) + "); treating every article as new")

    driver.get(journal['address'])
    print(driver.current_url)

    # Iterate over the listing pages to build the article list.
    articles = []
    def pagefind():
        """Record the article links of the current page; return True if a next page was opened."""
        soup = BeautifulSoup(driver.page_source, features="lxml")
        for h5 in soup.find_all("h5"):
            # href=True skips anchors that carry no link target at all.
            for a in h5.find_all("a", href=True):
                # urljoin yields the same result as plain concatenation for
                # "/path" hrefs and also copes with absolute URLs.
                articles.append(urljoin(base_url, a['href']))

        # find_elements(By...) is available in both Selenium 3 and 4, unlike the
        # find_elements_by_* helpers which were removed in Selenium 4.
        for ulp in driver.find_elements(By.CLASS_NAME, 'pagination'):
            for lis in ulp.find_elements(By.TAG_NAME, 'a'):
                if re.search('Next', lis.text):
                    # Go to the next page and give it time to load.
                    lis.click()
                    sleep(2)
                    return True
        return False

    nextpage = True
    while nextpage:
        nextpage = pagefind()
        print(driver.current_url, end='\r')

    # Filter out the articles already stored; sorting makes the row order
    # reproducible between runs.
    new_urls = sorted(set(articles).difference(olddf['url']))
    print(len(new_urls), "new article found!")

    df = pd.DataFrame({'url': new_urls,
                       'journal_title': journal['title'],
                       'journal_eissn': journal['eissn'],
                       'category': journal['category']
                      })

    return df, olddf

# Process new articles

In [5]:
def getarticledetails(df, olddf, filename):
    """Download every article page listed in ``df``, extract its metadata and save it.

    For each URL the page is fetched and parsed. Pages that cannot be fetched or
    that contain no abstract are skipped; their rows keep ``None`` in all detail
    columns. The columns 'title', 'doi', 'abstract', 'writer', 'publishdate' and
    'keyword' are added to ``df`` in place, the result is concatenated with
    ``olddf`` and written to ``filename`` as a "|"-separated CSV, which is then
    read back and its tail printed as a sanity check.

    Field formats:
      * writer      -- authors joined with "#"; an institution is appended to its
                       author with "--", an ORCID with "---".
      * publishdate -- ``citation_online_date`` converted from YYYY/MM/DD to
                       YYYY-MM-DD; left as ``None`` when it cannot be parsed.
      * keyword     -- keyword link texts joined with "#".

    Parameters
    ----------
    df : pandas.DataFrame
        New articles; must contain a 'url' column.
    olddf : pandas.DataFrame
        Previously stored articles, appended after the new rows.
    filename : str
        Output CSV path; a missing parent directory is created.
    """
    import os

    total = len(df)
    titles = [None] * total
    abstracts = [None] * total
    writers = [None] * total
    dates = [None] * total
    dois = [None] * total
    keywords = [None] * total

    for idx, url in enumerate(df['url'].values):
        # Print the percentage of the process.
        print(str(np.round(100*idx/total, 2)) + "%", end='\r')

        try:
            request = urllib.request.Request(url)
            # NOTE(review): 'utf-8' is a charset, not a content-coding; the header
            # is kept as-is so the server response is unchanged -- confirm it is needed.
            request.add_header('Accept-Encoding', 'utf-8')
            # The timeout keeps one stalled connection from hanging the whole run.
            with urlopen(request, timeout=60) as response:
                page_content = response.read().decode('utf-8')
        except Exception:
            # Best-effort: skip pages that fail to download or decode.
            # `Exception` (not a bare except) so the loop can still be interrupted.
            continue

        # page_content is already decoded text, so no from_encoding is passed
        # (BeautifulSoup ignores it for str input and emits a warning).
        page_soup = BeautifulSoup(page_content, features="lxml")

        # Abstract: all paragraphs of the abstract blocks, flattened to one line.
        abstract = ""
        for abstdiv in page_soup.find_all("div", {'class': ['abstract']}):
            for ap in abstdiv.find_all("p"):
                abstract = abstract + (" ".join(ap.get_text().splitlines()).replace("\xa0", " "))
        if len(abstract) == 0:
            # If there is no abstract we ignore the page.
            continue
        abstracts[idx] = abstract

        # Title: the text before the first "|" (the last <title> tag wins).
        for t in page_soup.find_all('title'):
            titles[idx] = t.get_text().split("|")[0]

        # Writers, publish date and DOI come from the citation_* meta tags.
        writter = []
        for m in page_soup.find_all("meta"):
            if not m.has_attr("name"):
                continue
            content = m.get('content')
            if content is None:
                # A meta tag without content carries nothing to record.
                continue
            name = m['name']
            if name == "citation_author":
                writter.append(content)
            elif name == "citation_author_institution":
                # Attach the institute to the most recent author, if there is one.
                if writter:
                    writter[-1] = writter[-1] + "--" + content
            elif name == "citation_author_orcid":
                # Attach the ORCID to the most recent author, if there is one.
                if writter:
                    writter[-1] = writter[-1] + "---" + content
            elif name == "citation_online_date":
                # Change to ISO date format; an unexpected format must not abort
                # the whole scraping run.
                try:
                    dates[idx] = datetime.datetime.strptime(content, "%Y/%m/%d").strftime('%Y-%m-%d')
                except ValueError:
                    pass
            elif name == "citation_doi":
                dois[idx] = content
        if len(writter) > 0:
            writers[idx] = "#".join(writter)

        # Keywords
        key = [keya.text for keya in page_soup.find_all("a", {'class': ['keywords']})]
        if len(key) > 0:
            keywords[idx] = "#".join(key)

    # Extend the df with the scraped columns.
    df['title'] = titles
    df['doi'] = dois
    df['abstract'] = abstracts
    df['writer'] = writers
    df['publishdate'] = dates
    df['keyword'] = keywords

    # Merge with the old df.
    df = pd.concat([df, olddf])

    # Save the data; create the target directory first so a long scrape is not
    # lost because the folder is missing.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(filename, sep="|", index=False)

    # Read the file back as a sanity check.
    test = pd.read_csv(filename, sep="|")
    print(test.tail())

# Main

In [6]:
# Journals to scrape. Each entry gives the title used in the output file name,
# the journal eISSN, its category and the listing URL where paging starts.
journals = [
    dict(
        title="Annals_of_Glaciology",
        eissn="1727-5644",
        category="Earth Science",
        address="https://www.cambridge.org/core/journals/annals-of-glaciology/listing?q=&searchWithinIds=DD7BD3FCBF82BF09609E5A0E1888ED7B&aggs[productTypes][filters]=JOURNAL_ARTICLE",
    ),
    dict(
        title="Journal_of_Glaciology",
        eissn="1727-5652",
        category="Earth Science",
        address="https://www.cambridge.org/core/journals/journal-of-glaciology/listing?q=&searchWithinIds=FE8284B2577ADBCD299821FDE4E752F1&aggs[productTypes][filters]=JOURNAL_ARTICLE",
    ),
]


In [7]:
# Scrape every configured journal: first find the article URLs that are not yet
# stored, then download their details and save the merged result.
# NOTE(review): the Selenium driver is never quit in the visible cells; consider
# calling driver.quit() once all scraping is finished.
for journal in journals:
    # One CSV per journal and notebook version.
    filename = 'data/journal_' + journal['title'] + '_' + version + '.csv'

    # Search for articles.
    df, olddf = gedjournalarticles(journal, filename)

    # Get the article details.
    getarticledetails(df, olddf, filename)

https://www.cambridge.org/core/journals/annals-of-glaciology/listing?q=&searchWithinIds=DD7BD3FCBF82BF09609E5A0E1888ED7B&aggs[productTypes][filters]=JOURNAL_ARTICLE
3855 new article found!rg/core/journals/annals-of-glaciology/listing?aggs%5BproductTypes%5D%5Bfilters%5D=JOURNAL_ARTICLE&pageNum=193&searchWithinIds=DD7BD3FCBF82BF09609E5A0E1888ED7B
0.0%



                                                    url         journal_title  \
3850  https://www.cambridge.org/core/journals/annals...  Annals_of_Glaciology   
3851  https://www.cambridge.org/core/journals/annals...  Annals_of_Glaciology   
3852  https://www.cambridge.org/core/journals/annals...  Annals_of_Glaciology   
3853  https://www.cambridge.org/core/journals/annals...  Annals_of_Glaciology   
3854  https://www.cambridge.org/core/journals/annals...  Annals_of_Glaciology   

     journal_eissn       category  \
3850     1727-5644  Earth Science   
3851     1727-5644  Earth Science   
3852     1727-5644  Earth Science   
3853     1727-5644  Earth Science   
3854     1727-5644  Earth Science   

                                                  title  \
3850  Assessment of the surface mass balance along t...   
3851  The simulation of Antarctic sea ice in the Had...   
3852  In situ stress tensor measured in an Alaskan g...   
3853  Initial experiments on the effects of particle..



                                                    url  \
5497  https://www.cambridge.org/core/journals/journa...   
5498  https://www.cambridge.org/core/journals/journa...   
5499  https://www.cambridge.org/core/journals/journa...   
5500  https://www.cambridge.org/core/journals/journa...   
5501  https://www.cambridge.org/core/journals/journa...   

              journal_title journal_eissn       category  \
5497  Journal_of_Glaciology     1727-5652  Earth Science   
5498  Journal_of_Glaciology     1727-5652  Earth Science   
5499  Journal_of_Glaciology     1727-5652  Earth Science   
5500  Journal_of_Glaciology     1727-5652  Earth Science   
5501  Journal_of_Glaciology     1727-5652  Earth Science   

                                                  title  \
5497                                                NaN   
5498  A 5 year record of surface energy and mass bal...   
5499  Borehole imagery of meteoric and marine ice la...   
5500  Progress in glacial geology during the las