In [1]:
import requests
import json
import pandas as pd
from io import StringIO
from tqdm import tqdm

In [2]:
def fetch_metadata(*, url=None, timeout=60, session=None):
    """Download a metadata query result from the OSDR Biodata API as text.

    With no arguments this requests the ``/query/metadata/`` endpoint for the
    ``investigation.study publications.study pubmed id`` field.

    Args:
        url: Endpoint to request. ``None`` (default) uses the PubMed-ID
            metadata query defined below.
        timeout: Seconds to wait on the server. ``requests`` applies no
            timeout unless one is passed, so without it a stalled connection
            would block the cell indefinitely.
        session: Optional object exposing ``get(url, timeout=...)``, e.g. a
            ``requests.Session``. ``None`` (default) uses the ``requests``
            module itself.

    Returns:
        The response body as a string (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).

    Note:
        A later cell defines another ``fetch_metadata`` that takes an
        ``osd_id``; once that cell has run, this definition is replaced.
        The extra parameters here are keyword-only so the two cannot be
        confused by a positional call.
    """
    # Alternative endpoints explored earlier, kept for reference:
    #   https://visualization.osdr.nasa.gov/biodata/api/v2/dataset/{osd_id}/assay/*/sample/*/?format=json
    #   https://visualization.osdr.nasa.gov/biodata/api/v2/query/metadata/?investigation.study%20publications.study%20publication%20status=Published
    if url is None:
        url = "https://visualization.osdr.nasa.gov/biodata/api/v2/query/metadata/?investigation.study%20publications.study%20pubmed%20id"
    http = requests if session is None else session
    response = http.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

# Runs the HTTP request as soon as this cell executes; `data` holds the
# response text that the following cell feeds to `pd.read_csv`.
data = fetch_metadata()

In [3]:
# Parse the downloaded text as CSV; the PubMed ID column is forced to str
# so the IDs are not converted to numbers.
df = pd.read_csv(
    StringIO(data),
    dtype={"investigation.study publications.study pubmed id": str},
)

# One row per PubMed ID, carrying the distinct OSD accessions found for it.
grouped = (
    df.groupby('investigation.study publications.study pubmed id')['id.accession']
    .unique()
    .reset_index()
)

# Turn each numpy array of accessions into a plain Python list.
# NOTE(review): after reset_index() the keys of this dict are the positional
# row numbers (0, 1, 2, ...), not the PubMed IDs -- confirm that is intended.
dict_data = {
    row: list(accessions)
    for row, accessions in grouped['id.accession'].items()
}
print(dict_data)

{0: ['OSD-5'], 1: ['OSD-12'], 2: ['OSD-30'], 3: ['OSD-6'], 4: ['OSD-78'], 5: ['OSD-11'], 6: ['OSD-227'], 7: ['OSD-152'], 8: ['OSD-32'], 9: ['OSD-115'], 10: ['OSD-175'], 11: ['OSD-296'], 12: ['OSD-51'], 13: ['OSD-31', 'OSD-39'], 14: ['OSD-14'], 15: ['OSD-97'], 16: ['OSD-29'], 17: ['OSD-176'], 18: ['OSD-18'], 19: ['OSD-22'], 20: ['OSD-36'], 21: ['OSD-15'], 22: ['OSD-178'], 23: ['OSD-3'], 24: ['OSD-149'], 25: ['OSD-182'], 26: ['OSD-92'], 27: ['OSD-154'], 28: ['OSD-2', 'OSD-9'], 29: ['OSD-153'], 30: ['OSD-75'], 31: ['OSD-155'], 32: ['OSD-17'], 33: ['OSD-27'], 34: ['OSD-128', 'OSD-129'], 35: ['OSD-8'], 36: ['OSD-93'], 37: ['OSD-28'], 38: ['OSD-13'], 39: ['OSD-33'], 40: ['OSD-94'], 41: ['OSD-57'], 42: ['OSD-158'], 43: ['OSD-88'], 44: ['OSD-43'], 45: ['OSD-73'], 46: ['OSD-486'], 47: ['OSD-156'], 48: ['OSD-70'], 49: ['OSD-7'], 50: ['OSD-80'], 51: ['OSD-157'], 52: ['OSD-20'], 53: ['OSD-59'], 54: ['OSD-285'], 55: ['OSD-1'], 56: ['OSD-148'], 57: ['OSD-81'], 58: ['OSD-55', 'OSD-56'], 59: ['OSD-54'

In [4]:

def fetch_osd_overview(timeout: float = 60, session=None) -> dict:
    """Fetch the mission metadata for all datasets in a single request.

    Queries ``dataset/*/metadata/mission/`` with ``format=json``.
    ``fetch_all_osd`` iterates the keys of the result as dataset IDs.

    Args:
        timeout: Seconds to wait on the server. ``requests`` applies no
            timeout unless one is passed, so without it a stalled connection
            would block the cell indefinitely.
        session: Optional object exposing ``get(url, timeout=...)``, e.g. a
            ``requests.Session``. ``None`` (default) uses the ``requests``
            module itself.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    url = "https://visualization.osdr.nasa.gov/biodata/api/v2/dataset/*/metadata/mission/?format=json"
    http = requests if session is None else session
    response = http.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


def fetch_metadata(osd_id: str, timeout: float = 60, session=None):
    """Fetch the JSON record of a single dataset.

    Requests ``dataset/{osd_id}/`` with ``format=json`` and returns the entry
    stored under ``osd_id`` in the decoded response.

    Args:
        osd_id: Dataset identifier, inserted verbatim into the URL and used
            as the lookup key in the response (e.g. ``"OSD-48"``).
        timeout: Seconds to wait on the server. ``requests`` applies no
            timeout unless one is passed, so without it a stalled connection
            would block the cell indefinitely.
        session: Optional object exposing ``get(url, timeout=...)``, e.g. a
            ``requests.Session``. ``None`` (default) uses the ``requests``
            module itself.

    Returns:
        The value found under ``osd_id`` in the response JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        KeyError: If the response JSON has no ``osd_id`` entry.
    """
    url = f"https://visualization.osdr.nasa.gov/biodata/api/v2/dataset/{osd_id}/?format=json"
    http = requests if session is None else session
    response = http.get(url, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    return payload[osd_id]

def fetch_all_osd(all_osd_data=None):
    """Fetch the record of every dataset listed in the overview.

    Calls ``fetch_osd_overview()`` once, then ``fetch_metadata(osd_id)`` for
    each key of the overview, showing a tqdm progress bar.

    Args:
        all_osd_data: Optional dict to fill in place. IDs already present in
            it are skipped, so a run that was interrupted (Ctrl-C, network
            error) can be resumed by passing the same dict again instead of
            starting over. ``None`` (default) starts from an empty dict.

    Returns:
        The dict mapping each dataset ID to the value returned by
        ``fetch_metadata``. This is the same object as ``all_osd_data`` when
        one was passed.
    """
    if all_osd_data is None:
        all_osd_data = {}
    osd_overview = fetch_osd_overview()
    for osd_id in tqdm(osd_overview.keys()):
        if osd_id in all_osd_data:
            # Already fetched in a previous (partial) run.
            continue
        all_osd_data[osd_id] = fetch_metadata(osd_id)

    return all_osd_data


In [5]:
# Fetches every dataset listed in the overview, one request per dataset
# (the progress bar below reports 558 entries).
# NOTE(review): this rebinds `data`, which an earlier cell assigned from
# `fetch_metadata()`; `save()` pickles whatever `data` currently refers to.
data = fetch_all_osd()

  1%|          | 6/558 [00:04<07:27,  1.23it/s]


KeyboardInterrupt: 

In [None]:
import pickle 

def save(obj=None, path='data/cached_data/osd_data.pkl'):
    """Pickle an object to disk so the slow download can be skipped later.

    Args:
        obj: Object to pickle. ``None`` (default) pickles the notebook-level
            ``data`` variable, as the original zero-argument version did.
        path: Destination file. Missing parent directories are created.

    Returns:
        None.
    """
    # Function-scope import keeps this cell self-contained.
    from pathlib import Path

    if obj is None:
        obj = data
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    # Path.open is used instead of the builtin open(): this notebook defines
    # its own zero-argument `open()` function, which shadows the builtin and
    # makes `open(path, 'wb')` raise TypeError.
    with target.open('wb') as f:
        pickle.dump(obj, f)

def open(path='data/cached_data/osd_data.pkl'):
    """Load and return the object pickled at ``path``.

    Args:
        path: Pickle file to read. Defaults to the cache file written by
            ``save()``.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If ``path`` does not exist.

    NOTE(review): the name is kept for compatibility, but it shadows the
    builtin ``open`` for every cell that runs after this one. Renaming it
    (e.g. ``load_cache``) would be safer if no other code depends on it.
    """
    # Function-scope import keeps this cell self-contained.
    from pathlib import Path

    # Inside this function the name `open` refers to the function itself, so
    # the original `open(path, 'rb')` call recursed with two unexpected
    # arguments and raised TypeError. Path.open avoids the shadowed name.
    # SECURITY: unpickling executes code embedded in the file; only load
    # cache files you created yourself.
    with Path(path).open('rb') as f:
        return pickle.load(f)


In [115]:
# NOTE(review): `loaded_dict` is not assigned at notebook level in the cells
# shown here (it only appears as a local inside `open()`), so this cell
# depends on state left in the kernel by an earlier, unseen execution.
# Only the last expression of a cell is displayed; the first lookup below is
# evaluated and its value discarded.
loaded_dict["OSD-1"]["metadata"]["study description"]
loaded_dict["OSD-48"]["metadata"]["study publication title"] # either list of strings (e.g. OSD-48) or single string
# Available information demo per study: https://visualization.osdr.nasa.gov/biodata/api/v2/dataset/OSD-48/?format=browser

['A microRNA signature and TGF-β1 response were identified as the key master regulators for spaceflight response.',
 'Validation of a New Rodent Experimental System to Investigate Consequences of Long Duration Space Habitation.',
 'Reproducible changes in the gut microbiome suggest a shift in microbial and host metabolism during spaceflight.',
 'Validation of Methods to Assess the Immunoglobulin Gene Repertoire in Tissues Obtained from Mice on the International Space Station.']