In [1]:
# Version label of this notebook (not referenced by the other cells shown here).
version = "1.0.0"

# Read the journals we are searching for


In [2]:
import pandas as pd

# Load the pipe-separated journal list and print its first five rows.
journals = pd.read_csv('doaj_journals.csv', sep="|")
print(journals.head(n=5))

                                   title      eiisn           category  \
0                     Biomedical Journal  2320-2890           Medicine   
1  African Journal of Emergency Medicine  2211-4203           Medicine   
2          Ain Shams Engineering Journal  2090-4495  Civil Engineering   
3         Alexandria Engineering Journal  2090-2670  Civil Engineering   
4      Anais Brasileiros de Dermatologia  1806-4841           Medicine   

                              publisher      eissn      pissn  \
0                              Elsevier  2320-2890  2319-4170   
1                              Elsevier  2211-419X        NaN   
2                              Elsevier        NaN  2090-4479   
3                              Elsevier        NaN  1110-0168   
4  Sociedade Brasileira de Dermatologia        NaN  0365-0596   

                             doajid  \
0  d1e768538ffa4f82826f04f163a7fa38   
1  8150ee089d8b4f099f6c30bd8aaf0fcc   
2  5622b8878c204fd2b88130fbeea2f2d7   
3  40ae3

# Get the journal details from DOAJ


In [3]:
def findjournaldata(results, target_title=None):
    """Pick the journal record whose title exactly matches the wanted title.

    The comparison is case-insensitive and ignores surrounding whitespace.
    Only the first matching record is used.

    Parameters
    ----------
    results : iterable of dict
        Entries of the ``results`` list of a DOAJ journal-search response.
    target_title : str, optional
        Title to look for. When omitted, the title of row ``idx`` of the
        global ``journals`` frame is used, which is what calls that pass only
        ``results`` rely on.

    Returns
    -------
    tuple
        ``(notfound, publisher, eissn, pissn, title, doajid, categories,
        language)``. ``notfound`` is ``True`` and every other entry ``None``
        when no record matches. ``categories`` and ``language`` are
        ``#``-joined strings (empty when the record lists none).
    """
    if target_title is None:
        # Backward-compatible fallback on the notebook's loop state.
        target_title = journals.iloc[idx]['title']
    wanted = target_title.lower().strip()

    for result in results:
        bibjson = result.get('bibjson') or {}
        candidate = bibjson.get('title') or ''
        if candidate.lower().strip() != wanted:
            continue

        # Optional fields are read defensively so a sparse record does not
        # abort the whole enrichment run with a KeyError.
        publisher = (bibjson.get('publisher') or {}).get('name')
        eissn = bibjson.get('eissn')
        pissn = bibjson.get('pissn')
        categories = "#".join(
            s['term'].strip() for s in bibjson.get('subject') or [] if 'term' in s
        )
        language = "#".join(s.strip() for s in bibjson.get('language') or [])
        return False, publisher, eissn, pissn, candidate, result.get('id'), categories, language

    return True, None, None, None, None, None, None, None

In [4]:
import requests
import urllib.parse
from time import sleep

# Enrich every journal row with DOAJ metadata (publisher, ISSNs, DOAJ id,
# subject categories, languages) and write the table back to the CSV.
publishers = []
eissns = []
pissns = []
titles = []
doajids = []
categoriess = []
languages = []
pagesize = 10  # results requested per search page

for idx in range(len(journals)):
    row = journals.iloc[idx]
    found = None

    # pandas loads empty CSV cells as NaN, never None, so an `is None` test
    # cannot detect a missing id; pd.isna covers both.
    missing_id = 'doajid' not in journals.columns or pd.isna(row['doajid'])
    # A row without a title cannot be searched for.
    has_title = not pd.isna(row['title'])

    if missing_id and has_title:
        searchterm = urllib.parse.quote('bibjson.title:"'+row['title']+'"')
        searchterm = 'https://doaj.org/api/v2/search/journals/'+searchterm

        # Walk the result pages until an exact title match is found or the
        # reported total number of hits has been covered.
        page = 1
        while True:
            pageurl = searchterm+"?page="+str(page)+"&pageSize="+str(pagesize)
            if page > 1:
                print(pageurl)
            x = requests.get(pageurl)
            if x.status_code != 200:
                break
            jsonresult = x.json()
            results = jsonresult.get('results')
            if not results:
                break
            # findjournaldata compares against journals.iloc[idx]['title'].
            notfound, publisher, eissn, pissn, title, doajid, categories, language = findjournaldata(results)
            if not notfound:
                found = (publisher, eissn, pissn, title, doajid, categories, language)
                break
            if page * pagesize >= int(jsonresult.get('total', 0)):
                break
            page += 1

        if found is None:
            print("Error:", searchterm)
        sleep(3)

    if found is None:
        # No lookup needed, or the lookup failed: keep what the CSV already
        # holds. In particular the title must survive, otherwise the row
        # becomes unsearchable once the file is rewritten below.
        found = (
            row.get('publisher'),
            row.get('eissn'),
            row.get('pissn'),
            row['title'],
            row.get('doajid'),
            row.get('categories'),
            row.get('language'),
        )

    for column, value in zip(
        (publishers, eissns, pissns, titles, doajids, categoriess, languages), found
    ):
        column.append(value)

# enrich journals
journals['publisher'] = publishers
journals['eissn'] = eissns
journals['pissn'] = pissns
journals['title'] = titles
journals['doajid'] = doajids
journals['categories'] = categoriess
journals['language'] = languages

journals.to_csv('doaj_journals.csv', sep="|", index=False)


In [5]:

# Preview the first rows of the journal table.
journals.head()

Unnamed: 0,title,eiisn,category,publisher,eissn,pissn,doajid,categories,language
0,Biomedical Journal,2320-2890,Medicine,Elsevier,2320-2890,2319-4170,d1e768538ffa4f82826f04f163a7fa38,Medicine (General)#Biology (General),EN
1,African Journal of Emergency Medicine,2211-4203,Medicine,Elsevier,2211-419X,,8150ee089d8b4f099f6c30bd8aaf0fcc,Medicine#Medicine (General),EN
2,Ain Shams Engineering Journal,2090-4495,Civil Engineering,Elsevier,,2090-4479,5622b8878c204fd2b88130fbeea2f2d7,Engineering (General). Civil engineering (Gene...,EN
3,Alexandria Engineering Journal,2090-2670,Civil Engineering,Elsevier,,1110-0168,40ae382df340487c8266f817e3a03683,Engineering (General). Civil engineering (Gene...,EN
4,Anais Brasileiros de Dermatologia,1806-4841,Medicine,Sociedade Brasileira de Dermatologia,,0365-0596,bbbc2abf7b4a4a74a81784b75472ca54,Dermatology,EN#PT


# Get the articles for each journal

In [6]:
_ARTICLE_COLUMNS = ('url', 'title', 'doi', 'abstract', 'writer', 'publishdate', 'keyword')


def _article_record(result):
    """Turn one DOAJ article entry into a dict with one value per output column.

    Fields are filled in column order. An article without a full-text link,
    a title or an abstract is not of interest, so filling stops there and the
    remaining columns stay ``None``.
    """
    record = dict.fromkeys(_ARTICLE_COLUMNS)
    bibjson = result.get('bibjson') or {}

    # url: first link of type "fulltext"
    fulltext = next(
        (link for link in bibjson.get('link') or []
         if link.get('type') == "fulltext" and 'url' in link),
        None,
    )
    if fulltext is None:
        return record
    record['url'] = fulltext['url'].replace("\n", "")

    # title
    if "title" not in bibjson:
        return record
    record['title'] = bibjson["title"].replace("\n", "")

    # doi: first identifier whose type is "doi" (case-insensitive)
    for ide in bibjson.get('identifier') or []:
        if ide.get("type", "").lower() == "doi" and 'id' in ide:
            record['doi'] = ide['id'].replace("\n", "")
            break

    # abstract: line breaks are kept as <br>
    if "abstract" not in bibjson:
        return record
    record['abstract'] = bibjson["abstract"].replace("\n", '<br>')

    # writer: "#name[--affiliation][---orcid]" per author, concatenated
    writer = ""
    for w in bibjson.get("author") or []:
        writer = writer + "#" + w.get('name', '').replace("\n", "")
        if 'affiliation' in w:
            writer = writer + "--" + w['affiliation'].replace("\n", "")
        if 'orcid_id' in w:
            writer = writer + "---" + w['orcid_id'].replace("\n", "")
    record['writer'] = writer

    # publishdate
    if "created_date" in result:
        record['publishdate'] = result["created_date"].replace("\n", "")

    # keywords: joined with "#", the separator used for the other multi-value
    # fields. Every keyword is kept: the former regex test on "|" matched any
    # string, so it never excluded anything.
    if "keywords" in bibjson:
        record['keyword'] = "#".join(k.replace("\n", "") for k in bibjson["keywords"])

    return record


def findarticledata(results):
    """Convert the ``results`` list of a DOAJ article search into a DataFrame.

    Parameters
    ----------
    results : iterable of dict
        Article entries as returned by the DOAJ search API.

    Returns
    -------
    pandas.DataFrame
        One row per entry, in input order, with the columns ``url``,
        ``title``, ``doi``, ``abstract``, ``writer``, ``publishdate`` and
        ``keyword``. Values that are missing or were skipped are ``None``.
    """
    # One record per article guarantees that all columns have the same
    # length, whichever fields an article is missing.
    records = [_article_record(result) for result in results]
    return pd.DataFrame({
        column: [record[column] for record in records]
        for column in _ARTICLE_COLUMNS
    })

In [None]:
import csv
import os

import numpy as np

# Download the articles of every journal from DOAJ and store them as one
# pickled DataFrame per journal under data/.
articlecolumns = [
    'url', 'title', 'doi', 'abstract', 'writer', 'publishdate', 'keyword',
    'journal_title', 'journal_eissn', 'journal_pissn', 'category',
]
articlepagesize = 100

# The pickles are written into data/, so make sure the directory exists.
os.makedirs('data', exist_ok=True)

# iterate over the journals
for idx in range(len(journals)):
    row = journals.iloc[idx]
    journaltitle = row['title']
    if pd.isna(journaltitle):
        # No title means no search term and no file name; skip the row.
        continue

    print("load: " + journaltitle + " Ready: " + str(np.round(100*idx/len(journals),2))+"%" , end='\r')
    searchterm = urllib.parse.quote('journal:"'+journaltitle+'" AND _exists_:doi AND _exists_:abstract  AND _exists_: "bibjson.author"  ')
    searchterm = 'https://doaj.org/api/v2/search/articles/'+searchterm
    print(searchterm)
    x = requests.get(searchterm +'?page=1&pageSize='+str(articlepagesize))

    # Pages are collected and concatenated once at the end instead of
    # growing a DataFrame inside the loop.
    pages = []
    if x.status_code == 200:
        jsonresult = x.json()
        if 'total' in jsonresult.keys():
            # Later pages are not assumed to repeat 'total'; keep the value
            # reported by the first page.
            total = int(jsonresult['total'])
            # Ceiling division: an exact multiple of the page size must not
            # trigger a request for an extra, empty page.
            pagenumber = max(1, (total + articlepagesize - 1)//articlepagesize)
            if total > 1000:
                print("Can not collect all of the Article for", journaltitle, "Total:", total)
            for page in range(pagenumber):
                if 'results' in jsonresult.keys():
                    thispage = findarticledata(jsonresult['results'])
                    thispage['journal_title'] = journaltitle
                    thispage['journal_eissn'] = row['eissn']
                    thispage['journal_pissn'] = row['pissn']
                    thispage['category'] = row['category']
                    pages.append(thispage)
                # go to the next page
                if page != pagenumber-1:
                    sleep(3)
                    x = requests.get(searchterm+"?page="+str(page+2)+"&pageSize="+str(articlepagesize))
                    if x.status_code == 200:
                        jsonresult = x.json()
                    else:
                        break

    if pages:
        journaldf = pd.concat(pages)
    else:
        # Nothing could be fetched: store an empty frame with the usual columns.
        journaldf = pd.DataFrame({column: [] for column in articlecolumns})

    # save journaldf; "/" is replaced as well because it would otherwise be
    # read as a directory separator in the file name.
    filename = journaltitle.replace(" ", "_").replace("&", "and").replace("/", "_")
    journaldf.to_pickle('data/journal_'+filename+".pandas" )
    sleep(60)

https://doaj.org/api/v2/search/articles/journal%3A%22Biomedical%20Journal%22%20AND%20_exists_%3Adoi%20AND%20_exists_%3Aabstract%20%20AND%20_exists_%3A%20%22bibjson.author%22%20%20
https://doaj.org/api/v2/search/articles/journal%3A%22African%20Journal%20of%20Emergency%20Medicine%22%20AND%20_exists_%3Adoi%20AND%20_exists_%3Aabstract%20%20AND%20_exists_%3A%20%22bibjson.author%22%20%20
