In [2]:
import requests
import pandas as pd
import numpy as np

In [79]:
# Define the API endpoint URL
api_url = "https://api.openaire.eu/search/datasets?format=json"
# url = "https://api.openaire.eu/search/publications?format=json"

# list to gather all data through API requests
soilwise_data_json = []


# loop through pages to retreive next part of data
for i in range(0,10):
    print(f'loop num {i}')
    params = {
        'page':f'{i}',
        'size':100
    }
    headers = {"Content-Type": "application/json"}

    # Make the API request
    response = requests.get(api_url, params=params, headers=headers)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON response
        data = response.json()
        soilwise_data_json = soilwise_data_json + data['response']['results']['result']
        # print(f"id of first item extracted in this loop is {data['response']['results']['result'][0]['header']['dri:objIdentifier']}")
    else:
        print(f"Failed to retrieve data. Status code: {response.status_code}")

# soilwise_data_json

loop num 0
loop num 1
loop num 2
loop num 3
loop num 4
loop num 5
loop num 6
loop num 7
loop num 8
loop num 9


In [87]:
# Extract the relevant fields for each item
extracted_data = []

# Iterate through the JSON and extract the fields needed for each item
for item in soilwise_data_json:
    try:
        obj_identifier = item['header']['dri:objIdentifier']['$']
    except (KeyError, TypeError):
        obj_identifier = None

    try:
        collectedfrom_1 = item['metadata']['oaf:entity']['oaf:result']['collectedfrom']['@name']
    except (KeyError, TypeError):
        collectedfrom_1 = None

    try:
        original_id_list = item['metadata']['oaf:entity']['oaf:result']['originalId']
        original_id = next((id['$'] for id in original_id_list if not id['$'].startswith('50|')), None)
    except (KeyError, TypeError):
        original_id = None

    try:
        children_instance = item['metadata']['oaf:entity']['oaf:result']['children']['instance']
    except (KeyError, TypeError):
        children_instance = None

    if children_instance:
        try:
            collectedfrom_2 = children_instance['collectedfrom']['@name']
        except (KeyError, TypeError):
            collectedfrom_2 = None

        try:
            hostedby = children_instance['hostedby']['@name']
        except (KeyError, TypeError):
            hostedby = None

        try:
            webresource = children_instance['webresource']['url']['$']
        except (KeyError, TypeError):
            webresource = None
    else:
        collectedfrom_2 = None
        hostedby = None
        webresource = None

    extracted_item = {
        'objIdentifier': obj_identifier,
        'collectedfrom_1': collectedfrom_1,
        'originalId': original_id,
        'collectedfrom_2': collectedfrom_2,
        'hostedby': hostedby,
        'webresource': webresource,
    }
    
    extracted_data.append(extracted_item)

# Convert to DataFrame
soilwise_data_df = pd.DataFrame(extracted_data)

# checking for dupicates:
print(f"number of rows of dataframe is {soilwise_data_df.shape[0]}")
soilwise_data_df = soilwise_data_df.drop_duplicates(subset=['objIdentifier'])
print(f"number of rows of dataframe after dropping duplicates is {soilwise_data_df.shape[0]}")


# Set display options to show more rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)


soilwise_data_df

number of rows of dataframe is 1000
number of rows of dataframe after dropping duplicates is 900


Unnamed: 0,objIdentifier,collectedfrom_1,originalId,collectedfrom_2,hostedby,webresource
0,_____OmicsDI::47167d2e7a363dcb907e77d4a5c948d7,Omics Discovery Index (OmicsDI),GPM11210027561,Omics Discovery Index (OmicsDI),The Global Proteome Machine Database,https://www.omicsdi.org/dataset/gpmdb/GPM11210027561
1,_____OmicsDI::792363a89655f77bf646412bacd4a0b5,Omics Discovery Index (OmicsDI),PRJNA267992,Omics Discovery Index (OmicsDI),European Nucleotide Archive,https://www.omicsdi.org/dataset/omics_ena_project/PRJNA267992
2,_____OmicsDI::f150e00a42aa1c6f43504f276beedf5b,Omics Discovery Index (OmicsDI),GSE63974,Omics Discovery Index (OmicsDI),Gene Expression Omnibus,https://www.omicsdi.org/dataset/geo/GSE63974
3,_____OmicsDI::f4f55d8843432ec034161d0bcc79c6fb,Omics Discovery Index (OmicsDI),PRJNA457015,Omics Discovery Index (OmicsDI),European Nucleotide Archive,https://www.omicsdi.org/dataset/omics_ena_project/PRJNA457015
4,dedup_wf_002::baf0267137d04a1f850ce4a89cac7e05,CoCoON,oai:crdo.vjf.cnrs.fr:cocoon-1957a58b-fea8-4f71-97a5-8bfea8ff71d8,,,
5,doi_________::01919e79681b7405fad0c3edbab9008b,Datacite,10.15468/dl.2xp1sf,Datacite,Global Biodiversity Information Facility,https://doi.org/10.15468/dl.2xp1sf
6,doi_________::01fb9ec7b299f8bf012623bd57d4691c,Datacite,10.17188/1653258,Datacite,Unknown Repository,https://doi.org/10.17188/1653258
7,doi_________::0404974cb010f47bdcf6e973957f2691,Datacite,10.15156/bio/sh1235243.09fu,Datacite,Unknown Repository,https://doi.org/10.15156/bio/sh1235243.09fu
8,doi_________::0679ad4c93a057cdd6a8359036d14ee1,,10.17989/encsr534zym,,,https://doi.org/10.17989/encsr534zym
9,doi_________::0a624c66f32cb9de9d737fdf1c7df4ab,Datacite,10.13140/rg.2.2.18213.91362,Datacite,ResearchGate Data,https://doi.org/10.13140/rg.2.2.18213.91362


In [None]:
soilwise_data_df.

In [89]:
# Export the DataFrame to a CSV file
soilwise_data_dfsoilwise_data_df.to_csv('soilwise_openaire_doi.csv', sep='\t', index=False)