In [None]:
import os

In [None]:
# Output locations, relative to the notebook's working directory.
# `datadir` receives the per-journal pickles written by the download cell;
# `archivedir` is not referenced by the cells visible in this notebook.
datadir = "data/"
archivedir = "data/Archive"

# Tag appended to the name of every per-journal file written to `datadir`.
version = "1.0.2"

# Read the journals we are searching for


In [None]:
import pandas as pd

# Load the pipe-separated journal list. The same file is rewritten by the
# enrichment cell below, so it may already carry the enriched columns.
journals = pd.read_csv('doaj_journals.csv', sep="|")
# Exact duplicate rows would only cause repeated API lookups.
journals.drop_duplicates(inplace=True)
import re
# Show the last rows as the cell output. `tail` must be called: the bare
# attribute only displays the bound method's repr, not the data.
journals.tail()

# Get the journal details from DOAJ


In [None]:
def findjournaldata(results, wanted_title=None):
    """Pick the journal record whose title exactly matches the wanted title.

    Titles are compared case-insensitively after stripping surrounding
    whitespace; the first matching record wins.

    Parameters
    ----------
    results : iterable of dict
        Journal records as found under ``results`` in a DOAJ search response.
        Each record is expected to carry ``id`` and a ``bibjson`` mapping.
    wanted_title : str, optional
        Title to look for. When omitted, the title of the journal currently
        being processed is read from the module-level ``journals`` frame and
        ``idx`` loop variable, so existing one-argument calls keep working.

    Returns
    -------
    tuple
        ``(notfound, publisher, eissn, pissn, title, doajid, categories,
        language)``. ``notfound`` is True when nothing matched, in which case
        every other element is None. ``categories`` and ``language`` are
        "#"-joined strings; any field missing from the record is None.
    """
    if wanted_title is None:
        # Backward-compatible fallback to the notebook's global loop state.
        wanted_title = journals.iloc[idx]['title']
    wanted = wanted_title.lower().strip()

    for result in results:
        bibjson = result.get('bibjson') or {}
        candidate = bibjson.get('title')
        if not isinstance(candidate, str) or candidate.lower().strip() != wanted:
            continue

        # Optional parts of the record are looked up defensively so that an
        # incomplete record yields None instead of aborting the whole run.
        publisher = (bibjson.get('publisher') or {}).get('name')
        categories = None
        if 'subject' in bibjson:
            categories = "#".join(s['term'].strip() for s in bibjson['subject'])
        language = None
        if 'language' in bibjson:
            language = "#".join(s.strip() for s in bibjson['language'])

        return (False, publisher, bibjson.get('eissn'), bibjson.get('pissn'),
                candidate, result.get('id'), categories, language)

    return True, None, None, None, None, None, None, None

In [None]:
import requests
import urllib.parse
from time import sleep

# Enrich every journal with its DOAJ metadata.
# One value per journal is collected in each list, in row order, and the lists
# are written back as columns once the loop has finished.
publishers = []
eissns = []
pissns = []
titles = []
doajids = []
categoriess = []
languages = []
for idx in range(len(journals)):
    row = journals.iloc[idx]

    # Rows that already carry a DOAJ id (from an earlier run of this cell) are
    # copied through unchanged, without another API round trip.
    if 'doajid' in journals.columns and isinstance(row['doajid'], str):
        publishers.append(row['publisher'])
        eissns.append(row['eissn'])
        pissns.append(row['pissn'])
        titles.append(row['title'])
        doajids.append(row['doajid'])
        categoriess.append(row['categories'])
        languages.append(row['language'])
        continue

    searchterm = urllib.parse.quote('bibjson.title:"'+row['title']+'"')
    searchterm = 'https://doaj.org/api/v2/search/journals/'+searchterm
    x = requests.get(searchterm)
    notfound = True
    if x.status_code == 200:
        jsonresult = x.json()
        if 'total' in jsonresult.keys():
            # `total` is the number of hits, not of pages. Follow-up requests
            # ask for 10 hits per page; the first request is assumed to use
            # the same default page size -- TODO confirm against the API docs.
            pagecount = (int(jsonresult['total']) + 9) // 10
            for page in range(1, pagecount + 1):
                if 'results' not in jsonresult.keys():
                    break
                # `jsonresult` holds page number `page` at this point.
                notfound, publisher, eissn, pissn, title, doajid, categories, language = findjournaldata(jsonresult['results'])
                if not notfound or page == pagecount:
                    break
                # Exact title not on this page: fetch the NEXT page (the
                # current one has already been inspected).
                nexturl = searchterm+"?page="+str(page + 1)+"&pageSize=10"
                x = requests.get(nexturl)
                print(nexturl)
                if x.status_code != 200:
                    break
                jsonresult = x.json()

    if notfound:
        # Keep the row, with its original title, so the columns stay aligned
        # with the journal table.
        print("Error:", searchterm)
        publisher = eissn = pissn = doajid = categories = language = None
        title = row['title']

    publishers.append(publisher)
    eissns.append(eissn)
    pissns.append(pissn)
    titles.append(title)
    doajids.append(doajid)
    categoriess.append(categories)
    languages.append(language)
    # Pause between journals to stay polite to the public API.
    sleep(3)

# enrich journals
journals['publisher'] = publishers
journals['eissn'] = eissns
journals['pissn'] = pissns
journals['title'] = titles
journals['doajid'] = doajids
journals['categories'] = categoriess
journals['language'] = languages

# Persist the enriched table over the input file so a later run can skip the
# journals that were already resolved.
journals.to_csv('doaj_journals.csv', sep="|", index=False)


In [None]:
# Sanity check of the enriched table: the first rows are printed as plain
# text, the last rows are shown as the cell's own output.
print(journals.head(n=5))
journals.tail(n=5)

# Get the articles for each journal

In [None]:
def _first_fulltext_url(bibjson):
    """Return the URL of the first 'fulltext' link (newlines removed), or None."""
    for link in bibjson.get("link") or []:
        if link.get('type') == "fulltext" and isinstance(link.get('url'), str):
            return link['url'].replace("\n", "")
    return None


def _first_doi(bibjson):
    """Return the first identifier of type 'doi' (case-insensitive), or None."""
    for identifier in bibjson.get('identifier') or []:
        if (str(identifier.get("type", "")).lower() == "doi"
                and isinstance(identifier.get('id'), str)):
            return identifier['id'].replace("\n", "")
    return None


def _join_authors(bibjson):
    """Encode the authors as '#name--affiliation---orcid' chunks, or None if empty."""
    writer = ""
    for author in bibjson.get("author") or []:
        try:
            writer = writer + "#" + author['name'].replace("\n", "")
            if 'affiliation' in author:
                writer = writer + "--" + author['affiliation'].replace("\n", "")
            if 'orcid_id' in author:
                writer = writer + "---" + author['orcid_id'].replace("\n", "")
        except (KeyError, AttributeError, TypeError):
            # Best effort: a malformed author entry keeps whatever was already
            # appended for it and the remaining authors are still processed.
            continue
    return writer or None


def _join_keywords(bibjson):
    """Encode the keywords as '#kw1#kw2...', or None when absent or unusable."""
    keywords = bibjson.get("keywords")
    # Records with 15 or more keywords are treated as uninformative, and so
    # are individual keywords of 100 characters or more.
    if not isinstance(keywords, list) or len(keywords) >= 15:
        return None
    keytext = ""
    for keyword in keywords:
        if isinstance(keyword, str) and len(keyword) < 100:
            keytext = keytext + "#" + keyword.replace("\n", "")
    return keytext or None


def findarticledata(results):
    """Flatten DOAJ article records into one DataFrame row per record.

    Parameters
    ----------
    results : iterable of dict
        Article records as found under ``results`` in a DOAJ search response.

    Returns
    -------
    pandas.DataFrame
        Columns ``url, title, doi, abstract, writer, publishdate, keyword``,
        one row per input record, in input order.

    Notes
    -----
    Fields are filled in the column order above. As soon as a required field
    (full-text URL, title, abstract, at least one author) is missing, the
    remaining fields of that row stay None; fields filled earlier are kept.
    Missing or null parts of a record never raise.
    """
    columns = ['url', 'title', 'doi', 'abstract', 'writer', 'publishdate', 'keyword']
    rows = []
    for result in results:
        # Every record gets a row up front, so an early `continue` leaves the
        # not-yet-filled fields as None.
        row = dict.fromkeys(columns)
        rows.append(row)
        bibjson = result.get('bibjson') or {}

        row['url'] = _first_fulltext_url(bibjson)
        if row['url'] is None:
            continue

        title = bibjson.get("title")
        if not isinstance(title, str):
            continue
        row['title'] = title.replace("\n", "")

        # The DOI is optional: its absence does not stop processing.
        row['doi'] = _first_doi(bibjson)

        abstract = bibjson.get("abstract")
        if not isinstance(abstract, str):
            continue
        # Line breaks are kept as HTML breaks so the abstract stays one line.
        row['abstract'] = abstract.replace("\n", '<br>')

        row['writer'] = _join_authors(bibjson)
        if row['writer'] is None:
            continue

        created = result.get("created_date")
        if isinstance(created, str):
            row['publishdate'] = created.replace("\n", "")

        row['keyword'] = _join_keywords(bibjson)

    # Built from per-column lists so dtypes match a dict-of-lists construction.
    return pd.DataFrame({column: [row[column] for row in rows] for column in columns})

In [None]:
def itterate_over_results(searchterm, jsonresult, journals, journaldf, desc=True, journal_idx=None):
    """Page through one article search and append every page to ``journaldf``.

    Parameters
    ----------
    searchterm : str
        Search URL without query string; ``?page=..&pageSize=100&sort=..`` is
        appended for each follow-up page.
    jsonresult : dict
        Already decoded response for page 1 of this search.
    journals : pandas.DataFrame
        Journal table; needs the columns ``title``, ``eissn``, ``pissn`` and
        ``categories``.
    journaldf : pandas.DataFrame
        Articles collected so far. It is not modified in place.
    desc : bool, default True
        Sort order (by ``bibjson.year``) requested for the follow-up pages;
        should match the order page 1 was requested with.
    journal_idx : int, optional
        Positional row of the current journal in ``journals``. When omitted,
        the module-level loop variable ``idx`` is used, so existing calls keep
        working.

    Returns
    -------
    pandas.DataFrame
        ``journaldf`` followed by the rows of all pages that could be fetched,
        each tagged with the journal's title, ISSNs and category.
    """
    if 'total' not in jsonresult.keys():
        return journaldf

    # Derive the page count from THIS response rather than from a global that
    # may belong to another query. Always look at page 1, even when empty.
    pagecount = max(1, -(-int(jsonresult['total']) // 100))
    sortorder = "desc" if desc else "asc"

    collected = []
    for page in range(pagecount):
        if 'results' in jsonresult.keys():
            if journal_idx is None:
                # Backward-compatible fallback to the notebook's loop variable.
                journal_idx = idx
            journal = journals.iloc[journal_idx]
            thispage = findarticledata(jsonresult['results'])
            thispage['journal_title'] = journal['title']
            thispage['journal_eissn'] = journal['eissn']
            thispage['journal_pissn'] = journal['pissn']
            thispage['category'] = journal['categories']
            collected.append(thispage)

        if page == pagecount - 1:
            break

        # Pause between page requests to stay polite to the public API.
        sleep(3)
        # `page` is 0-based and page 1 is already in hand, hence page + 2.
        x = requests.get(searchterm+"?page="+str(page+2)+"&pageSize=100&sort=bibjson.year:"+sortorder)
        if x.status_code != 200:
            # Stop paging; keep what has been collected so far.
            break
        jsonresult = x.json()

    if not collected:
        return journaldf
    # Concatenate once instead of growing the frame page by page.
    return pd.concat([journaldf] + collected)

In [None]:
import csv
import numpy as np

# iterate over the journals
for idx in range(len(journals)):
    journal = journals.iloc[idx]
    print("load:", journal['title'], "Ready: ", str(np.round(100*idx/len(journals),2))+"%", end='\r')

    # Only articles with an abstract and an author are requested; the journal
    # is pinned by title and, when known, by its e-ISSN (else its p-ISSN).
    searchtext = 'journal:"'+journal['title']+'" AND _exists_:abstract  AND _exists_: "bibjson.author" '
    if isinstance(journal['eissn'], str):
        searchtext = searchtext + ' AND issn: "'+journal['eissn']+'"'
    elif isinstance(journal['pissn'], str):
        searchtext = searchtext + ' AND issn: "'+journal['pissn']+'"'
    searchterm = 'https://doaj.org/api/v2/search/articles/'+urllib.parse.quote(searchtext)
    x = requests.get(searchterm +'?page=1&pageSize=100&sort=bibjson.year:desc')

    journaldf  = pd.DataFrame({
        'url': [],
        'title': [],
        'doi': [],
        'abstract': [],
        'writer': [],
        'publishdate': [],
        'keyword': [],
        'journal_title': [],
        'journal_eissn': [],
        'journal_pissn': [],
        'category': []
                      })

    if x.status_code == 200:
        # More than 1000 hits: one sort order cannot reach them all, so the
        # search is repeated in ascending order to collect the other end.
        reruninasc = False
        # More than 2000 hits: the years in between are fetched one by one.
        middlepart = False
        middlestart = None

        jsonresult =  x.json()
        if 'total' in jsonresult.keys() :
            # Module-level page count for this query; the paging helper may
            # rely on it, so it is refreshed before every call.
            pagenumber = int(jsonresult['total']//100)+1
            reruninasc = jsonresult['total'] > 1000
            middlepart = jsonresult['total'] > 2000
            journaldf = itterate_over_results(searchterm, jsonresult, journals, journaldf, desc=True)
            # Oldest year reached by the descending pass. Only string dates are
            # considered, and a journal without any dated article yields None
            # instead of crashing on min() of an empty list.
            # NOTE(review): the year comes from `created_date`, while the
            # queries below filter on `bibjson.year` -- confirm they agree.
            years = [ int(d.split("-")[0])
                      for d in journaldf['publishdate'].values.tolist()
                      if isinstance(d, str) ]
            middlestart = min(years) if years else None

        if reruninasc:
            sleep(3)
            x = requests.get(searchterm+"?page=1&pageSize=100&bibjson.title&sort=bibjson.year:asc")
            if x.status_code == 200:
                jsonresult =  x.json()
                if 'total' in jsonresult.keys() :
                    pagenumber = int(jsonresult['total']//100)+1
                journaldf = itterate_over_results(searchterm, jsonresult, journals, journaldf, desc=False)

        # Walk backwards year by year until a year without any article.
        # Not complete if a single year holds more articles than one query
        # can page through.
        if middlepart and middlestart is not None:
            year = middlestart
            while True:
                yearsearchtext = searchtext + ' AND bibjson.year: "'+str(year)+'"'
                yearterm = 'https://doaj.org/api/v2/search/articles/'+urllib.parse.quote(yearsearchtext)
                yearurl = yearterm+'?page=1&pageSize=100&bibjson.title&sort=bibjson.year:asc'
                # Pause before every year request to stay polite to the API.
                sleep(3)
                x = requests.get(yearurl)
                if x.status_code != 200:
                    # Give up on this journal's remaining years rather than
                    # looping forever on a failing endpoint.
                    print("Error:", yearurl)
                    break
                jsonresult =  x.json()
                if 'total' not in jsonresult.keys() or jsonresult['total'] == 0:
                    break
                if jsonresult['total'] > 1000 :
                    # Within one year every request sorts identically, so a
                    # second pass would return the same articles again.
                    print("Can not download all article:", yearurl )
                pagenumber = int(jsonresult['total']//100)+1
                # Follow-up pages must use the properly quoted year query.
                journaldf = itterate_over_results(yearterm, jsonresult, journals, journaldf, desc=False)
                year = year-1

    # drop duplicates (the passes above can overlap)
    journaldf.drop_duplicates(inplace=True)

    if len(journaldf) > 0:
        # save journaldf; path separators in the title would otherwise be
        # interpreted as sub-directories of `datadir`
        safetitle = journal['title'].replace(" ", "_").replace("&", "and").replace("/", "_")
        journaldf.to_pickle(os.path.join(
            datadir,
            'journal_'+safetitle+"_"+version+".pandas" )
                       )
    print("Downloaded", len(journaldf), "article for ", journal['title'] ,"!")
    sleep(10)