There is a LOT of CReSIS data. 

I need to move towards creating an index from filename -> URL for the eventual QGIS layer, so might as well start with that now.

In [9]:
# Base URL of the CReSIS data listing; the crawl in this notebook reads the
# campaign links from this page and builds every other URL relative to it.
cresis_url = "https://data.cresis.ku.edu/data/rds"

In [48]:
from bs4 import BeautifulSoup   # For parsing html and extracting the links
import pathlib
import re
import requests  # For downloading index page
import subprocess  

In [43]:
# Build an index of every CReSIS data file we want:
#     region -> campaign -> product -> segment -> {filename: url}
# The site is crawled one directory listing at a time, so this cell is slow.

# Valid campaign directory names start with YYYY.
CAMPAIGN_REGEX = r"[0-9]{4}"
# Segment directories are named <date>_<segment number>, e.g. 20021126_01.
SEGMENT_REGEX = r"[0-9]+_[0-9]+"
# Frame files, e.g. Data_20021126_01_001.mat. The dot is escaped and the
# pattern is applied with fullmatch so that look-alikes such as
# "Data_..._001.mat.bak" are not indexed.
FRAME_REGEX = r"Data_[0-9]{8}_[0-9]{2}_[0-9]{3}\.mat"

# From their README: the standard L1B files are, in order of increasing quality,
# CSARP_qlook, CSARP_csarp-combined, CSARP_standard, and CSARP_mvdr.
# (The directory on the server is spelled "CSARP_csarp-combined".)
# Listed here best-first, so the first match is the one we index.
PRODUCT_PRIORITIES = ["CSARP_mvdr", "CSARP_standard", "CSARP_csarp-combined", "CSARP_qlook"]


def list_hrefs(url):
    """Return the raw href of every link on the HTML page at ``url``.

    Anchors without an href attribute are skipped. If the server answers
    with an error status, a warning is printed and an empty list is
    returned, so an error page is never mistaken for a directory listing.
    Connection-level errors from ``requests`` still propagate.
    """
    response = requests.get(url)
    if not response.ok:
        print("WARNING: GET {} returned status {}".format(url, response.status_code))
        return []
    page = BeautifulSoup(response.text, 'html.parser')
    return [anchor.get('href') for anchor in page.find_all('a') if anchor.get('href')]


def list_matching(url, pattern):
    """Return the sorted, de-duplicated link names at ``url`` that fully match ``pattern``.

    Leading/trailing slashes are stripped from each href before matching.
    """
    names = (href.strip('/') for href in list_hrefs(url))
    return sorted({name for name in names if re.fullmatch(pattern, name)})


all_urls = list_hrefs(cresis_url)

# Each campaign seems to have two links, but we only want one, hence the set.
# The year test is applied to the raw href, so only links that start with
# four digits are kept.
campaigns = sorted({href.strip('/') for href in all_urls if re.match(CAMPAIGN_REGEX, href)})

# TODO: Fix this terrible nested dictionary
cresis_datafiles = {"ANTARCTIC": {}, "ARCTIC": {}}

for campaign in campaigns:
    print()
    print(campaign)
    # Anything that is not explicitly Antarctic is filed under ARCTIC.
    region = "ANTARCTIC" if "Antarctica" in campaign else "ARCTIC"
    cresis_datafiles[region][campaign] = {}

    campaign_url = "{}/{}".format(cresis_url, campaign)
    dirs = [href.strip('/') for href in list_hrefs(campaign_url)]

    # Pick the highest-quality product this campaign offers.
    product = next((pp for pp in PRODUCT_PRIORITIES if pp in dirs), None)
    print("Product: {}".format(product))
    if product is None:
        # No known product directory: leave the campaign entry empty rather
        # than storing a None key and crawling ".../None".
        continue
    cresis_datafiles[region][campaign][product] = {}

    product_url = "{}/{}".format(campaign_url, product)
    for segment in list_matching(product_url, SEGMENT_REGEX):
        segment_url = "{}/{}".format(product_url, segment)
        files = list_matching(segment_url, FRAME_REGEX)
        cresis_datafiles[region][campaign][product][segment] = {
            file: "{}/{}".format(segment_url, file) for file in files
        }



1993_Greenland_P3
Product: CSARP_standard

1995_Greenland_P3
Product: CSARP_standard

1996_Greenland_P3
Product: CSARP_standard

1997_Greenland_P3
Product: CSARP_standard

1998_Greenland_P3
Product: CSARP_standard

1999_Greenland_P3
Product: CSARP_standard

2001_Greenland_P3
Product: CSARP_standard

2002_Antarctica_P3chile
Product: CSARP_standard

2002_Greenland_P3
Product: CSARP_standard

2003_Greenland_P3
Product: CSARP_standard

2004_Antarctica_P3chile
Product: CSARP_standard

2005_Antarctica_GPRWAIS
Product: CSARP_standard

2005_Greenland_TO
Product: CSARP_standard

2006_Greenland_TO
Product: CSARP_mvdr

2007_Greenland_P3
Product: CSARP_standard

2008_Greenland_Ground
Product: CSARP_mvdr

2008_Greenland_TO
Product: CSARP_mvdr

2009_Antarctica_DC8
Product: CSARP_mvdr

2009_Antarctica_TO
Product: CSARP_standard

2009_Antarctica_TO_Gambit
Product: CSARP_standard

2009_Greenland_TO
Product: CSARP_mvdr

2010_Antarctica_DC8
Product: CSARP_mvdr

2010_Greenland_DC8
Product: CSARP_mvdr

20

In [None]:
# TODO: Want to save the index, because simply creating it takes quite a while.

In [44]:
import pickle

# Persist the crawled index to disk in the current working directory so it
# does not have to be rebuilt by re-crawling the site.
index_path = "cresis_datafiles.pkl"
with open(index_path, 'wb') as index_file:
    pickle.dump(cresis_datafiles, index_file)

In [None]:
# NB: looks like combined is CSARP_csarp-combined and not CSARP_csarpcombined. So far, only relevant for 2011_Greenland_{P3, TO}

In [51]:
# Download indexed frames with wget into
#     /Volumes/RadarData/<region>/CRESIS/<campaign>/<product>/<segment>/
# `wget -c` resumes partial files, so re-running this cell is safe.
#
# The outer loop variable is deliberately NOT called `campaigns`, so the
# campaign list built by the crawl cell is not overwritten.
for region, region_campaigns in cresis_datafiles.items():
    print(region)
    for campaign, products in region_campaigns.items():
        print(campaign)
        for product, segments in products.items():
            # Printed inside the product loop so it shows the product that is
            # actually being processed, not a stale value from another cell.
            print(".. {}".format(product))
            for segment, frames in segments.items():
                print(".... {}".format(segment))
                dest_dir = "/Volumes/RadarData/{}/CRESIS/{}/{}/{}".format(region, campaign, product, segment)
                try:
                    pathlib.Path(dest_dir).mkdir(parents=True, exist_ok=True)
                except FileExistsError as ex:
                    # Even with exist_ok=True this is raised when the path
                    # exists but is not a directory.
                    raise Exception("Could not create {}.".format(dest_dir)) from ex

                for frame, data_url in frames.items():
                    # The string form is only for display; the command itself
                    # is run from an argument list (no shell), so unusual
                    # characters in a crawled URL cannot be interpreted by a shell.
                    wget_cmd = 'wget -c --directory-prefix="{}" "{}"'.format(dest_dir, data_url)
                    print(wget_cmd)
                    result = subprocess.run(
                        ["wget", "-c", "--directory-prefix={}".format(dest_dir), data_url],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.STDOUT,
                        universal_newlines=True,
                    )
                    output = result.stdout
                    if result.returncode != 0:
                        # Keep going, but make the failure visible.
                        print("WARNING: wget exited with status {} for {}".format(result.returncode, data_url))
                    #break  # we're just testing right now ... don't grab everything!
                # NOTE: this break stops after the first segment of each
                # product. Remove it to download every segment.
                break


ANTARCTIC
2002_Antarctica_P3chile
.. CSARP_standard
.... 20021126_01
wget -c --directory-prefix="/Volumes/RadarData/ANTARCTIC/CRESIS/2002_Antarctica_P3chile/CSARP_standard/20021126_01" "https://data.cresis.ku.edu/data/rds/2002_Antarctica_P3chile/CSARP_standard/20021126_01/Data_20021126_01_001.mat"
wget -c --directory-prefix="/Volumes/RadarData/ANTARCTIC/CRESIS/2002_Antarctica_P3chile/CSARP_standard/20021126_01" "https://data.cresis.ku.edu/data/rds/2002_Antarctica_P3chile/CSARP_standard/20021126_01/Data_20021126_01_002.mat"
wget -c --directory-prefix="/Volumes/RadarData/ANTARCTIC/CRESIS/2002_Antarctica_P3chile/CSARP_standard/20021126_01" "https://data.cresis.ku.edu/data/rds/2002_Antarctica_P3chile/CSARP_standard/20021126_01/Data_20021126_01_003.mat"
wget -c --directory-prefix="/Volumes/RadarData/ANTARCTIC/CRESIS/2002_Antarctica_P3chile/CSARP_standard/20021126_01" "https://data.cresis.ku.edu/data/rds/2002_Antarctica_P3chile/CSARP_standard/20021126_01/Data_20021126_01_004.mat"
wget -c --d

In [6]:
https://data.cresis.ku.edu/data/rds/1993_Greenland_P3/

['__and__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__iand__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__isub__',
 '__iter__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__or__',
 '__rand__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__ror__',
 '__rsub__',
 '__rxor__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__xor__',
 'add',
 'clear',
 'copy',
 'difference',
 'difference_update',
 'discard',
 'intersection',
 'intersection_update',
 'isdisjoint',
 'issubset',
 'issuperset',
 'pop',
 'remove',
 'symmetric_difference',
 'symmetric_difference_update',
 'union',
 'update']