In [1]:
import csv

import os
import os.path
import pathlib
import subprocess
import tempfile

In [3]:
import requests  # For downloading index page
from bs4 import BeautifulSoup   # For parsing html and extracting the links

In [2]:
# This will be a configuration parameter for the eventual plugin.
# Root directory for radargram data; download destinations further down are
# built from it as {radargram_dir}/{region}/{institution}/{campaign}.
radargram_dir = "/Volumes/QIceRadar1"

In [None]:
# Hack to be able to debug plotting without an external drive:
# fall back to a local copy only when the configured directory is not
# available, so that running every cell in order does not silently
# discard the real setting when the drive *is* mounted.
if not os.path.isdir(radargram_dir):
    radargram_dir = "/Users/lindzey/Documents/QIceRadar/data_radargrams"

## Data Download

BAS has an excellent online interface for viewing its available data: 
https://www.bas.ac.uk/project/nagdp/

Be sure to cite Fremand et al. 2022 (https://doi.org/10.5194/essd-14-3379-2022) when talking about releasing data.

This provides a human-usable interface for downloading the data. But, we want to be able to automate it! 
The portal shows quicklook images corresponding to chunks of lines, though the underlying data files appear to be one-per-flight.


In [3]:
# Destination layout under the data root: {region}/{institution}/{campaign}.
institution = "BAS"
campaign = "AGAP"
region = "ANTARCTIC"
dest_dir = f"{radargram_dir}/{region}/{institution}/{campaign}"

# A single AGAP flight file, used to exercise the download path.
url = "https://ramadda.data.bas.ac.uk:443/repository/entry/get/AGAP_A19.nc?entryid=synth%3Aa1abf071-85fc-4118-ad37-7f186b72c847%3AL25ldGNkZg9BR0FQX0ExOS5uYw%3D%3D".replace("L25ldGNkZg9BR0FQX0ExOS5uYw", "L25ldGNkZi9BR0FQX0ExOS5uYw")
dest_filename = "AGAP_A19.nc"
dest_filepath = "{}/{}".format(dest_dir, dest_filename)

# wget's --output-document does not create missing parent directories,
# so make sure the campaign directory exists before downloading into it.
os.makedirs(dest_dir, exist_ok=True)

# Write to dest_filepath so the file lands in the data tree under its real
# name rather than in a scratch location.
wget_cmd = ["wget", "--no-clobber", "--quiet", "--output-document", dest_filepath, url]



In [4]:
# Run the download. check_call returns the exit status (0 on success) and
# raises subprocess.CalledProcessError if wget exits with a non-zero status.
subprocess.check_call(wget_cmd)

0

In [14]:
# Trying to find the list of granules without manually downloading an index
# So far, no luck -- I can't figure out how to get BeautifulSoup
# to expand the "netcdf" drop down and see links below it.
agap_doi = "a1abf071-85fc-4118-ad37-7f186b72c847"
doi = f"https://doi.org/10.5285/{agap_doi}"
data_link = "https://ramadda.data.bas.ac.uk/repository/entry/show?entryid=synth:a1abf071-85fc-4118-ad37-7f186b72c847:L25ldGNkZg=="
# data_link = 'https://ramadda.data.bas.ac.uk/repository/entry/show?entryid=' + doi.split('/')[-1]

# Bound the wait on the server, and fail loudly on an HTTP error instead of
# quietly parsing an error page for links.
reqs = requests.get(data_link, timeout=30)
reqs.raise_for_status()
soup = BeautifulSoup(reqs.text, 'html.parser')

base_url = 'https://ramadda.data.bas.ac.uk'
prefix = '/repository/entry/show/'

# Every href on the page; anchors without an href yield None.
all_urls = [link.get('href') for link in soup.find_all('a')]

# Keep only the site-relative links under `prefix` and make them absolute.
download_urls = [base_url + href for href in all_urls if href is not None and href.startswith(prefix)]

# str.strip() removes a *set of characters* from both ends, not a prefix, so
# it would also eat trailing filename characters such as ".nc". Every entry in
# download_urls is known to start with base_url + prefix, so slice it off.
prefix_len = len(base_url + prefix)
filenames = [download_url[prefix_len:].split('?')[0] for download_url in download_urls]


In [15]:
# Inspect every href collected from the page (None where an anchor has no href).
all_urls

['/repository',
 None,
 '#',
 'https://eds.ukri.org/',
 'https://www.bas.ac.uk',
 'https://data.bas.ac.uk',
 '/repository/entry/show?entryid=61100714-1e32-44af-a237-0a517529bc49',
 '/repository/entry/show?entryid=451eb7be-b895-4535-aa48-b6411723b407',
 '/repository/entry/show?entryid=a1abf071-85fc-4118-ad37-7f186b72c847',
 None,
 '/repository/entry/show?entryid=synth%3Aa1abf071-85fc-4118-ad37-7f186b72c847%3AL25ldGNkZg%3D%3D',
 None,
 '/repository/search/type/group',
 '/repository/search/type/group?user_id=localuser&search.submit=true',
 'https://www.bas.ac.uk/pdcfeedback',
 'https://www.bas.ac.uk',
 'https://www.ukri.org',
 'http://www.nationalarchives.gov.uk/doc/open-government-licence',
 'http://www.nationalarchives.gov.uk/doc/open-government-licence',
 'https://www.bas.ac.uk/about-this-site/privacy-cookie-policy/',
 'https://www.bas.ac.uk/about-this-site/copyright-statement/',
 'https://www.bas.ac.uk/about-this-site/privacy-cookie-policy/']

In order to get the mapping from filename -> download link:
* Go to the survey's landing page, e.g.:
  * AGAP: https://ramadda.data.bas.ac.uk/repository/entry/show?entryid=a1abf071-85fc-4118-ad37-7f186b72c847
  * BBAS:
  * ICEGRAV: 
  * POLARGAP: https://ramadda.data.bas.ac.uk/repository/entry/show?entryid=e8a29fa7-a245-4a04-8b56-098defa134b9
* click the "Folder" dropdown next to "netcdf"
* Select "All Actions"
* Select "CSV"; download into data/BAS/netcdf_indices, renaming the file to `[survey].csv`
* Relevant columns are "name" and "url"

Their filename convention is `[survey]_[flight].nc`; all of my references to survey name should match that one.

In [None]:
root_dir = "/Users/lindzey/Documents/QIceRadar"
index_dir = "{}/radar_wrangler/data".format(root_dir)

institution = "BAS"

# Directory holding one manually downloaded index CSV per survey.
# NOTE(review): the instructions above say to download the CSVs into
# data/BAS/netcdf_indices, but this reads from data/BAS -- confirm which
# location is correct.
bas_index_dir = "{}/{}".format(index_dir, institution)
surveys = sorted(ff for ff in os.listdir(bas_index_dir) if ff.endswith(".csv"))
print(surveys)

for ff in surveys:
    # Drop only the trailing ".csv"; splitting on the first '.' would
    # truncate a survey name that itself contains a dot.
    survey = os.path.splitext(ff)[0]
    filepath = "{}/{}".format(bas_index_dir, ff)

    # The csv module expects files opened with newline="" so that newlines
    # embedded in quoted fields are interpreted correctly.
    with open(filepath, newline="") as csvfile:
        csv_reader = csv.DictReader(csvfile)
        # Each row describes one flight; "name" and "url" are the columns
        # that give the filename -> download link mapping.
        for flight in csv_reader:
            print("{} {}: {}".format(survey, flight['name'], flight['url']))
