# Process Data from REST API
## Get List of data sources

In [None]:
import requests, os
from requests.utils import requote_uri
# Ask the listDatasources endpoint for the names of all available data sources.
urlString = 'http://api.scholexplorer.openaire.eu/v1/listDatasources'
# A timeout keeps the notebook from hanging forever on an unresponsive server.
r = requests.get(urlString, timeout=60)
# Fail here on an HTTP error instead of trying to parse an error page as JSON.
r.raise_for_status()

# Create a URL-encoded list of data sources, skipping names that are blank.
dataSources = [requote_uri(ds) for ds in r.json() if ds.strip()]

# Print each data source next to its index in dataSources.
for count, ds in enumerate(dataSources):
    print('{}. {}'.format(count, ds))

print('We found {} data sources.'.format(len(dataSources)))

## Download files for the selected data sources

In [None]:
# Indices into dataSources of the data sources to work on.
SelectedDataSources={5,6,7,8,9,10,11,12,13,14,15}

# Make sure every selected data source has a local folder named after it.
for index in SelectedDataSources:
    folder = dataSources[index]
    if os.path.exists(folder):
        print ('We found a local folder for: {}'.format(folder))
        continue
    print ('Creating new folder: {}'.format(folder))
    os.makedirs(folder)

In [None]:
# Download every result page of each selected data source into its local folder.
downloadFiles = True
maximumPages = 1000000
requestTimeout = 60  # seconds to wait on one request before giving up

if downloadFiles:
    for i in SelectedDataSources:
        ds = dataSources[i]
        page = 0
        statusCode = 200
        # Keep requesting pages until the server stops answering 200
        # or the page limit is reached.
        while statusCode == 200 and page < maximumPages:
            urlString = 'https://api-dliservice-prototype-dli.d4science.org/v1/linksFromDatasource?datasource={}&page={}'.format(ds, page)
            r = requests.get(urlString, timeout=requestTimeout)
            statusCode = r.status_code
            if statusCode != 200:
                # Do not store an error body as a numbered .json page;
                # the later cells feed every *.json file to json.load.
                break
            # 1000000 + page gives every file name the same number of digits.
            fileName = '{}.json'.format(1000000 + page)
            # The with-block guarantees the file is flushed and closed.
            with open('./{}/{}'.format(ds, fileName), 'w') as myfile:
                myfile.write(r.text)
            page = page + 1
            if page % 10 == 0:
                print('We have downloaded {} files for {}'.format(page, ds))
    print('Download is complete')
else:
    print('Download is disabled')

## Create CSV from JSON files

### Read Schemas

In [None]:
import glob, json, csv, pprint

nodes = {}
relations = []
# Distinct identifier schema names, in the order they are first seen.
schemas = []


def addToSchemas(identifiers):
    """Record and print every schema name in *identifiers* not yet in ``schemas``.

    identifiers: iterable of mappings, each with a 'schema' key.
    """
    for entry in identifiers:
        name = entry['schema']
        if name in schemas:
            continue
        print(name)
        schemas.append(name)

# Scan every downloaded page of the selected data sources and collect the
# identifier schemas used by the source and target of each link.
for i in SelectedDataSources:
    ds = dataSources[i]
    # Result is unused; os.listdir raises if this data source has no local folder.
    os.listdir(ds)
    for fname in glob.glob('{}/*.json'.format(ds)):
        # Close each page file as soon as it has been parsed.
        with open(fname) as jsonFile:
            data = json.load(jsonFile)
        for l in data:
            addToSchemas(l['source']['identifiers'])
            addToSchemas(l['target']['identifiers'])

print('Done!')

### Read JSON files

In [None]:
import glob, json, csv, pprint

# Node rows keyed by their dnetIdentifier value.
nodes = {}
# [source_id, target_id, relationship_name] rows.
relations = []


def addNode(node, label, fname):
    """Store one link endpoint in the global ``nodes`` dict and return its key.

    node:  mapping with an 'identifiers' list of {'schema', 'identifier'}
           entries and an optional 'title'.
    label: stored unchanged under the row's 'label' key.
    fname: stored unchanged under the row's 'fname' key.

    The identifier whose schema is 'dnetIdentifier' becomes the row's
    'local_id' and the key in ``nodes``; every other identifier is stored
    under its own schema name.

    Returns the node's 'local_id'.
    Raises KeyError if the node has no 'dnetIdentifier' identifier.
    """
    # A missing title becomes an empty string.
    row = {'title': node.get('title', ''), 'label': label, 'fname': fname}

    for i in node['identifiers']:
        if i['schema'] == 'dnetIdentifier':
            row['local_id'] = i['identifier']
        else:
            row[i['schema']] = i['identifier']

    local_id = row['local_id']
    # A node that is already stored is only replaced by a row that carries a DOI.
    if local_id not in nodes or 'doi' in row:
        nodes[local_id] = row

    return local_id

 


# Build the node table and the de-duplicated relation list from every
# downloaded page of the selected data sources.

# Relations already stored, as tuples, so the duplicate check is a set lookup
# instead of a scan over the whole relations list.
seenRelations = {tuple(relation) for relation in relations}

for i in SelectedDataSources:
    ds = dataSources[i]
    # Result is unused; os.listdir raises if this data source has no local folder.
    os.listdir(ds)

    for fname in glob.glob('{}/*.json'.format(ds)):
        print(fname)
        # Close each page file as soon as it has been parsed.
        with open(fname) as jsonFile:
            data = json.load(jsonFile)

        for l in data:
            # The label is the ';'-separated list of link provider names.
            label = ';'.join(p['name'] for p in l['linkProvider'])

            source_id = addNode(l['source'], label, fname)
            target_id = addNode(l['target'], label, fname)

            relationship_type = l['relationship']['name']
            relation_row = [source_id, target_id, relationship_type]
            relation_key = tuple(relation_row)
            if relation_key not in seenRelations:
                seenRelations.add(relation_key)
                relations.append(relation_row)

print('Done!')

### Create nodes.csv

In [None]:
# Write one fully quoted row per collected node to nodes.csv.

# Identifier schemas exported as columns, in header order.
schemaColumns = ['doi', 'uri', 'url', 'local', 'handle', 'pmid', 'icpsr', 'pdb',
                 'pubmedid', 'genbank', 'geo', 'embl', 'ensembl', 'issn', 'purl',
                 'isbn', 'orcid']

# newline='' leaves line endings to the csv writer; the with-block closes the
# file even if a row fails to write.
with open('nodes.csv', 'w', newline='') as nodeFile:
    nodeWriter = csv.writer(nodeFile, quoting=csv.QUOTE_ALL)
    nodeWriter.writerow(['key:ID', 'title'] + schemaColumns + ['json', ':LABEL'])

    for row in nodes.values():
        # A schema the node does not have is written as an empty string.
        identifierValues = [row.get(column, '') for column in schemaColumns]
        nodeWriter.writerow(
            [row['local_id'], row['title']]
            + identifierValues
            + [row['fname'], row['label']]
        )

print('Done!')

### Create relations.csv

In [None]:
# Write every collected relation as one fully quoted row of relations.csv.
# newline='' leaves line endings to the csv writer; the with-block closes the
# file even if a row fails to write.
with open('relations.csv', 'w', newline='') as nodeFile:
    nodeWriter = csv.writer(nodeFile, quoting=csv.QUOTE_ALL)
    nodeWriter.writerow([':START_ID', ':END_ID', ':TYPE'])
    nodeWriter.writerows(relations)

print('Done!')

# Process Bulk Data

## Process DOI links

In [14]:
import json, pprint
# Load one downloaded page; the with-block closes the file once it is parsed.
fname = 'Datasets%20in%20Datacite/1000004.json'
with open(fname) as jsonFile:
    data = json.load(jsonFile)

# Distinct DOI values, in the order they are first seen.
dois = []


def processNode(node):
    """Append every DOI of *node* that is not yet in the global ``dois`` list.

    node: mapping with an 'identifiers' list of {'schema', 'identifier'} entries;
          only entries whose schema is 'doi' are considered.
    """
    doiValues = (
        entry['identifier']
        for entry in node['identifiers']
        if entry['schema'] == 'doi'
    )
    for identifier in doiValues:
        if identifier in dois:
            continue
        dois.append(identifier)

# Collect the DOIs of both endpoints of every link, source first, then target.
for link in data:
    for endpoint in ('source', 'target'):
        processNode(link[endpoint])

# Show the distinct DOIs that were found.
pprint.pprint (dois)


['10.17182/hepdata.52790.v1/t323',
 '10.1594/pangaea.253924',
 '10.2973/odp.proc.ir.122.1990',
 '10.1594/pangaea.293975',
 '10.1594/pangaea.149291',
 '10.1594/pangaea.767698',
 '10.5517/ccp2n2g',
 '10.1016/j.ica.2007.03.022',
 '10.1594/pangaea.358496',
 '10.1029/93pa03301',
 '10.5061/dryad.26pm4/1',
 '10.5061/dryad.26pm4',
 '10.5517/cc7sf6m',
 '10.1002/anie.200454130',
 '10.5517/cc1jsbf2',
 '10.1021/jacs.5b09853',
 '10.17182/hepdata.42789.v1/t1',
 '10.17182/hepdata.42789.v1',
 '10.17182/hepdata.42789',
 '10.5063/aa/dpennington.307.2',
 '10.5517/cc5w6fn',
 '10.1107/s0108270101013506',
 '10.5517/cc9rj7s',
 '10.1107/s1600536807024944',
 '10.17182/hepdata.19211.v1/t3',
 '10.17182/hepdata.19211',
 '10.17182/hepdata.41866.v1/t1',
 '10.17182/hepdata.41866.v1',
 '10.17182/hepdata.35328.v1/t2',
 '10.5517/cc1jlz7b',
 '10.1039/c5dt03387j',
 '10.1594/pangaea.221073',
 '10.2973/dsdp.proc.96.1986',
 '10.5517/cc87kcd',
 '10.1039/b501437a',
 '10.1594/pangaea.847482',
 '10.1594/pangaea.847494',
 '10.52