In [1]:
import datetime
import os
import pickle
import random
import re
import urllib
from time import sleep
from urllib.parse import urljoin
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

## Changelog
 + 1.0.1 -- Add journal eISSN and categories

In [2]:
# Version tag of the scraped dataset. It is part of every output CSV file name
# ('data/journal_<title>_<version>.csv'), and that same file is read back as the
# list of already scraped articles.
version = "1.0.2"

# Get the article list

In [3]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import ui
from selenium.webdriver.support.ui import WebDriverWait, Select

from selenium.webdriver.firefox.options import Options

# Start the single Firefox session used by the rest of the notebook;
# gedjournalarticles() drives it through the module-level name `driver`.
options = Options()
# Pass the Firefox command-line flag directly instead of assigning to
# `options.headless`: that setter was deprecated and later removed in
# Selenium 4, after which the assignment silently does nothing and a visible
# browser window is opened. `add_argument` works on old and new versions alike.
options.add_argument("-headless")
driver = webdriver.Firefox(options=options)

In [4]:
def gedjournalarticles(journal, filename ):
    """Collect the article URLs of one journal and keep the ones not scraped yet.

    The journal's listing page is opened in the shared Selenium ``driver``. The
    function then walks through the result pages by clicking the pagination
    link whose text contains "Next" and gathers every link found inside an
    ``<h5>`` element.

    Parameters
    ----------
    journal : dict
        Journal description with the keys ``'address'`` (listing URL),
        ``'title'``, ``'eissn'`` and ``'category'``.
    filename : str
        Path of the ``|``-separated CSV written by an earlier run. A missing or
        empty file means that nothing has been scraped yet.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df`` holds one row per article URL that is not in the old file, with
        the columns ``url``, ``journal_title``, ``journal_eissn`` and
        ``category``, in the order the articles were discovered. ``olddf`` is
        the previously saved data (just an empty ``url`` column when there was
        none).

    Raises
    ------
    Any error from reading an existing, non-empty ``filename`` other than
    "file not found" is propagated rather than hidden.
    """
    base_url = "https://www.cambridge.org"

    # Load the history of earlier runs. Only "no usable file yet" is treated as
    # an empty history: carrying on after any other read error would end with
    # the existing file being overwritten by the new rows alone.
    try:
        olddf = pd.read_csv(filename, sep="|")
        print(olddf.tail())
    except (FileNotFoundError, pd.errors.EmptyDataError):
        olddf = pd.DataFrame({'url': [] })

    driver.get(journal['address'])
    print(driver.current_url)

    articles = []

    def collect_links():
        """Append the article links of the page currently loaded in the driver."""
        soup = BeautifulSoup(driver.page_source, features="lxml")
        for heading in soup.find_all("h5"):
            # href=True skips anchors without a target instead of raising KeyError
            for anchor in heading.find_all("a", href=True):
                articles.append(urljoin(base_url, anchor['href']))

    def goto_next_page():
        """Click the pagination "Next" link; return False when there is none."""
        # find_elements(By...) replaces the find_elements_by_* helpers, which
        # were removed from Selenium in version 4.3.
        for pager in driver.find_elements(By.CLASS_NAME, 'pagination'):
            for link in pager.find_elements(By.TAG_NAME, 'a'):
                if re.search('Next', link.text):
                    link.click()
                    # give the next result page time to load
                    sleep(2)
                    return True
        return False

    # Iterate over the listing pages until there is no "Next" link any more.
    while True:
        collect_links()
        has_next = goto_next_page()
        print(driver.current_url, end='\r')
        if not has_next:
            break

    # Filter out already known articles; dict.fromkeys() removes duplicates
    # while keeping the order of discovery, so the result is reproducible.
    known_urls = set(olddf['url'])
    articles = [url for url in dict.fromkeys(articles) if url not in known_urls]
    print(len(articles), "new article found!")

    # One row per new article; the journal attributes are repeated on each row.
    df = pd.DataFrame({'url': articles,
                       'journal_title': journal['title'],
                       'journal_eissn': journal['eissn'],
                       'category': journal['category']
                      })

    return df, olddf

# Process new articles

In [5]:
def _fetch_page(url, timeout):
    """Download one article page and return it decoded as UTF-8 text."""
    request = urllib.request.Request(url)
    # NOTE(review): 'utf-8' is a character set, not an HTTP content coding; the
    # header value is kept as it was to avoid changing what the server is sent.
    request.add_header('Accept-Encoding', 'utf-8')
    # the context manager closes the connection even when reading fails
    with urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8')


def _normalise_title(raw_title):
    """Cut the part after the first "|" and de-capitalise ALL-CAPS titles."""
    title = raw_title.split("|")[0]
    # str.isupper() looks at cased characters only, so spaces, digits and
    # punctuation no longer prevent a fully capitalised title from being found.
    if title.isupper():
        title = title.lower().title()
    return title


def _split_keywords(text):
    """Split the text of one keyword link into individual keywords."""
    # Some keywords carry a classification code, e.g.
    #  "13A35: Characteristic $p$ methods (Frobenius endomorphism) and reduction to characteristic $p$; tight closure"
    # Only the code in front of the FIRST colon is dropped, so a keyword that
    # itself contains a colon is kept complete.
    if re.search('.:.', text):
        text = text.split(":", 1)[1]
    return [part.replace('$', '') for part in text.split(";")]


def _parse_article(page_soup):
    """Extract the metadata of one parsed article page.

    Returns ``None`` when the page has no abstract text (such pages are
    ignored), otherwise a dict with the keys ``title``, ``doi``, ``abstract``,
    ``writer``, ``publishdate`` and ``keyword``; values that are not found are
    ``None``.
    """
    # abstract: only the first abstract block is used, because a page may carry
    # several translations of the abstract
    abstract_div = page_soup.find("div", {'class':['abstract']})
    if abstract_div is None:
        return None
    abstract = "".join(
        " ".join(paragraph.get_text().splitlines()).replace("\xa0", " ")
        for paragraph in abstract_div.find_all("p")
    )
    if len(abstract) == 0:
        return None

    # title
    title_tag = page_soup.find('title')
    title = None if title_tag is None else _normalise_title(title_tag.get_text())

    # writers, publish date and DOI come from the citation_* meta tags
    authors = []
    publish_date = None
    doi = None
    for meta in page_soup.find_all("meta"):
        if not meta.has_attr("name"):
            continue
        content = meta.get('content')
        if content is None:
            continue
        name = meta['name']
        if name == "citation_author":
            authors.append(content.title())
        elif name == "citation_author_institution":
            # attach the institute to the most recent author, if there is one
            if authors:
                authors[-1] = authors[-1] + "--" + content
        elif name == "citation_author_orcid":
            # attach the ORCID to the most recent author, if there is one
            if authors:
                authors[-1] = authors[-1] + "---" + content
        elif name == "citation_online_date":
            # change to ISO date format; an unexpected format leaves the date
            # empty instead of aborting the whole run
            try:
                publish_date = datetime.datetime.strptime(content, "%Y/%m/%d").strftime('%Y-%m-%d')
            except ValueError:
                pass
        elif name == "citation_doi":
            doi = content

    # keywords
    keywords = []
    for link in page_soup.find_all("a", {'class':['keywords']}):
        keywords.extend(_split_keywords(link.text))

    return {
        'title': title,
        'doi': doi,
        'abstract': abstract,
        'writer': "#".join(authors) if authors else None,
        'publishdate': publish_date,
        'keyword': "#".join(keywords) if keywords else None,
    }


def getarticledetails(df, olddf, filename, timeout=30):
    """Download the details of every new article and save them with the old data.

    For each URL in ``df`` the article page is fetched and parsed. Pages that
    cannot be downloaded, or that have no abstract, keep ``None`` in all detail
    columns.

    Parameters
    ----------
    df : pandas.DataFrame
        New articles, one per row, with a ``url`` column. The columns
        ``title``, ``doi``, ``abstract``, ``writer``, ``publishdate`` and
        ``keyword`` are added to this frame in place.
    olddf : pandas.DataFrame
        Rows saved by earlier runs; they are appended after the new rows.
    filename : str
        Target path of the ``|``-separated CSV file.
    timeout : float, optional
        Seconds to wait for one page before the article is skipped.

    Notes
    -----
    Writers are joined with ``#``; an institution is attached to its writer
    with ``--`` and an ORCID with ``---``. Keywords are joined with ``#``.
    The saved file is read back and its last rows are printed as a check.
    """
    columns = ('title', 'doi', 'abstract', 'writer', 'publishdate', 'keyword')
    total = len(df)
    details = {column: [None] * total for column in columns}

    for idx, url in enumerate(df['url'].values):
        # print percentage of the process
        print( str(np.round(100*idx/total,2))+"%" , end='\r')

        try:
            page_content = _fetch_page(url, timeout)
        except Exception:
            # Best effort: an article that cannot be downloaded is skipped.
            # Catching Exception (not everything) keeps Ctrl-C / kernel
            # interrupt working during a long run.
            continue

        record = _parse_article(BeautifulSoup(page_content, features="lxml"))
        if record is None:
            # if no abstract we ignore the page
            continue
        for column in columns:
            details[column][idx] = record[column]

    # extend the df
    for column in columns:
        df[column] = details[column]

    # merge with old df
    df = pd.concat([df, olddf])

    # save data
    df.to_csv(filename, sep="|", index=False)

    # test: read the file back and show its last rows
    test = pd.read_csv(filename, sep="|")
    print(test.tail())

# Main

In [6]:
# Journals to scrape. Every entry provides:
#   'title'    - journal name; also used to build the output CSV file name
#   'eissn'    - copied into the journal_eissn column of each article row
#   'category' - copied into the category column of each article row
#   'address'  - listing URL that the Selenium driver opens first
# NOTE(review): several addresses carry a `_csrf=` query parameter that looks
# like a token copied from a browser session -- confirm whether the listing
# pages actually need it.
journals = [    
    { 
        'title': 'Journal_of_Glaciology',
        'eissn': '1727-5652', 
        'category': 'Earth Science', 
        'address': 'https://www.cambridge.org/core/journals/journal-of-glaciology/listing?q=&searchWithinIds=FE8284B2577ADBCD299821FDE4E752F1&aggs[productTypes][filters]=JOURNAL_ARTICLE'
    },
    { 
        'title': 'Annals_of_Glaciology', 
        'eissn': '1727-5644', 
        'category': 'Earth Science', 
        'address': 'https://www.cambridge.org/core/journals/annals-of-glaciology/listing?q=&searchWithinIds=DD7BD3FCBF82BF09609E5A0E1888ED7B&aggs[productTypes][filters]=JOURNAL_ARTICLE'
    }, 
    { 
        'title': 'High Power Laser Science and Engineering', 
        'eissn': '2052-3289', 
        'category': 'Physics', 
        'address': 'https://www.cambridge.org/core/journals/high-power-laser-science-and-engineering/listing?q=&_csrf=ti7jcDO3-AxKimC_iYt0RDFF5syy68KMK0hI&searchWithinIds=D30FF81AE5FAEE26735889C8553C99DD'
    },  
    { 
        'title': 'Primary Health Care Research & Development', 
        'eissn': '1477-1128', 
        'category': 'Medicine', 
        'address': 'https://www.cambridge.org/core/journals/primary-health-care-research-and-development/listing?q=&_csrf=qay3IsJl-DywfWQ2JUFDOvSLEA6nCPV73z04&searchWithinIds=32C9D453D5FCE02D20FE22C0CF2F0970'
    },  
    { 
        'title': 'Netherlands Journal of Geosciences', 
        'eissn': '1573-9708', 
        'category': 'Earth Science', 
        'address': 'https://www.cambridge.org/core/journals/netherlands-journal-of-geosciences/listing?q=&_csrf=TXboDrRA-h-4v-k0nf-bkCHcFm1Joyh-d4cg&searchWithinIds=8C3B5CF0FE961741867DC17FA6BDB2CB'
    },  
    { 
        'title': 'Global Mental Health', 
        'eissn': '2054-4251', 
        'category': 'Medicine', 
        'address': 'https://www.cambridge.org/core/journals/global-mental-health/listing?q=&_csrf=L1mTDoYq-31VcHm0Bxycx_KhYoLucPk_R9vA&searchWithinIds=0ED89DDEA3AE758E19BA3F5FA74ECDE4&fts=yes'
    },  
    { 
        'title': 'Genetics Research', 
        'eissn': '1469-5073', 
        'category': 'Biology', 
        'address': 'https://www.cambridge.org/core/journals/genetics-research/listing?q=&_csrf=zzy8w5ol-29o9LFI9G4WCPPTCPAZY6Z-0Zho&searchWithinIds=0D1E541454FA09167A8FA4A8D0CEBA16'
    },  
    { 
        'title': 'Forum of Mathematics, Sigma', 
        'eissn': '2050-5094', 
        'category': 'Mathematics', 
        'address': 'https://www.cambridge.org/core/journals/forum-of-mathematics-sigma/listing?q=&_csrf=nXfofpf4-UOsPOmBj8_sgGUHEE2hFObofEAA&searchWithinIds=FFA2827E377ED5D335BA35E30804D5A3'
    },  
    { 
        'title': 'Forum of Mathematics, Pi', 
        'eissn': '2050-5086', 
        'category': 'Mathematics', 
        'address': 'https://www.cambridge.org/core/journals/forum-of-mathematics-pi/listing?q=&_csrf=aiEo8OfS-Dy7S0KF_IsztxL2ynSRMKh0rl-Q&searchWithinIds=B1E45CD3455435D58D82530624D73954'
    },  
    { 
        'title': 'European Psychiatry', 
        'eissn': '1778-3585', 
        'category': 'Medicine', 
        'address': 'https://www.cambridge.org/core/journals/european-psychiatry/listing?q=&_csrf=rLcBO1az-jbyjRYF0q89ctp24CKNdnAiX7qI&searchWithinIds=6971C5AE9A4D702D7062BB035666A014%2C7620ABABD5F68A2888B2B8D57AF0EEEF%2CEC14C12B308EF39D62154DD70A2C8B72'
    },
    { 
        'title': 'Epidemiology and Psychiatric Sciences', 
        'eissn': '2045-7979', 
        'category': 'Medicine', 
        'address': 'https://www.cambridge.org/core/journals/epidemiology-and-psychiatric-sciences/listing?q=&_csrf=EGDyUg2p-CgV_fV7h_zEo59RGJKAnWZpyMR4&searchWithinIds=8DF8693744B64155A5D367EE22D818E4'
    }, 
    { 
        'title': 'Epidemiology & Infection', 
        'eissn': '1469-4409', 
        'category': 'Medicine', 
        'address': 'https://www.cambridge.org/core/journals/epidemiology-and-infection/listing?q=&_csrf=lJAoW8qy-OoxGqIAPj6_XrkTcRaRP7hB05Ik&searchWithinIds=6BD9A1C3FE054EFD604833E60853FD77&fts=yes'
    }, 
    { 
        'title': 'BJPsych Open', 
        'eissn': '2056-4724', 
        'category': 'Medicine', 
        'address': 'https://www.cambridge.org/core/journals/bjpsych-open/listing?q=&productId=487612A06240672671A5E3C738A37C86&context=%2Fjournals%2Fbjpsych-open&type=journal'
    }
]


In [None]:
# All output CSV files live in data/. Create the directory up front: without it
# the first save fails only after a whole journal has already been scraped.
os.makedirs('data', exist_ok=True)

# Visit the journals in random order. A shuffled copy is iterated so that the
# `journals` list itself keeps its order when this cell is run again.
for journal in random.sample(journals, k=len(journals)):
    filename = 'data/journal_'+ journal['title'] +'_'+version+'.csv'

    # search for articles that are not in the saved file yet
    df, olddf = gedjournalarticles(journal, filename )

    # get the articles details and save them together with the old rows
    getarticledetails(df, olddf, filename)

https://www.cambridge.org/core/journals/journal-of-glaciology/listing?q=&searchWithinIds=FE8284B2577ADBCD299821FDE4E752F1&aggs[productTypes][filters]=JOURNAL_ARTICLE
5543 new article found!rg/core/journals/journal-of-glaciology/listing?aggs%5BproductTypes%5D%5Bfilters%5D=JOURNAL_ARTICLE&pageNum=277&searchWithinIds=FE8284B2577ADBCD299821FDE4E752F1
                                                    url  \
5538  https://www.cambridge.org/core/journals/journa...   
5539  https://www.cambridge.org/core/journals/journa...   
5540  https://www.cambridge.org/core/journals/journa...   
5541  https://www.cambridge.org/core/journals/journa...   
5542  https://www.cambridge.org/core/journals/journa...   

              journal_title journal_eissn       category  \
5538  Journal_of_Glaciology     1727-5652  Earth Science   
5539  Journal_of_Glaciology     1727-5652  Earth Science   
5540  Journal_of_Glaciology     1727-5652  Earth Science   
5541  Journal_of_Glaciology     1727-5652  Earth Science