In [None]:
from netrc import netrc
from subprocess import Popen
from getpass import getpass
import platform
import os
import shutil

## Set up credential files (only run the first time)

In [None]:
# This code only needs to be run once, then can be skipped in future instances:
# This code comes from: https://disc.gsfc.nasa.gov/data-access#python-requests
#
# It writes three files into the user's home directory:
#   .netrc        - Earthdata Login credentials
#   .urs_cookies  - empty cookie jar
#   .dodsrc       - points at the two files above
# WARNING: an existing ~/.netrc is overwritten, so entries for other machines are lost.

# Set to False after the first successful run to skip re-creating the files.
WRITE_CREDENTIAL_FILES = True

if WRITE_CREDENTIAL_FILES:
    urs = 'urs.earthdata.nasa.gov'    # Earthdata URL to call for authentication
    prompts = ['Enter NASA Earthdata Login Username \n(or create an account at urs.earthdata.nasa.gov): ',
               'Enter NASA Earthdata Login Password: ']

    homeDir = os.path.expanduser("~") + os.sep
    netrc_path = homeDir + '.netrc'

    # Ask for both values BEFORE opening any file: opening with truncation first
    # would leave an empty .netrc behind if the user aborts at a prompt.
    username = getpass(prompt=prompts[0])
    password = getpass(prompt=prompts[1])

    # Create the file readable/writable by the owner only from the very start,
    # so the password is never on disk with wider permissions.
    netrc_fd = os.open(netrc_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(netrc_fd, 'w') as file:
        file.write('machine {} login {} password {}'.format(urs, username, password))
    with open(homeDir + '.urs_cookies', 'w') as file:
        file.write('')
    with open(homeDir + '.dodsrc', 'w') as file:
        file.write('HTTP.COOKIEJAR={}.urs_cookies\n'.format(homeDir))
        file.write('HTTP.NETRC={}.netrc'.format(homeDir))

    print('Saved .netrc, .urs_cookies, and .dodsrc to:', homeDir)

    # Set appropriate permissions for Linux/macOS
    if platform.system() != "Windows":
        # The mode passed to os.open only applies to newly created files; a
        # pre-existing .netrc keeps its old mode, so tighten it explicitly.
        os.chmod(netrc_path, 0o600)
    else:
        # Copy dodsrc to working directory in Windows
        shutil.copy2(homeDir + '.dodsrc', os.getcwd())
        print('Copied .dodsrc to:', os.getcwd())

In [None]:
import numpy as np
import requests
from tqdm.auto import tqdm
from calendar import monthrange
from os import listdir
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import glob

## Perform Downloads

In [None]:
# --- Download configuration -------------------------------------------------
# Year whose granules are fetched.
year = 2021
# Local destination root. NOTE: Windows-style absolute path with a trailing
# separator; adjust for the machine this notebook runs on.
root_folder = 'C:\\data\\AIRS\\L1B\\'

# Remote archive root and the per-year listing directory beneath it.
sourceurl = "https://airsl1.gesdisc.eosdis.nasa.gov/data/Aqua_AIRS_Level1/AIRIBRAD.005/"
yearurl = f"{sourceurl}{year}/"

In [None]:
def bookends(year):
    """Return the day-of-year span of every month in ``year``.

    The result is a 12-item list of ``(start, stop)`` tuples, one per month in
    calendar order. ``start`` is the 1-based day of the year on which the month
    begins and ``stop`` is ``start`` plus the month's length, i.e. the first
    day of the following month (an exclusive upper bound).
    """
    spans = []
    first_day = 1
    for month in range(1, 13):
        days_in_month = monthrange(year, month)[1]
        spans.append((first_day, first_day + days_in_month))
        # The next month starts right after this one ends.
        first_day += days_in_month
    return spans

def get_dl_pages(start_day, stop_day, yearurl):
    """Build the per-day listing URLs for a range of day-of-year numbers.

    Days run from ``start_day`` (inclusive) to ``stop_day`` (exclusive). Each
    day number is left-padded with zeros to three digits and appended to
    ``yearurl`` followed by a trailing slash.
    """
    def _page_url(day):
        # Choose the zero prefix that brings the day number up to three digits.
        if day < 10:
            prefix = "00"
        elif day < 100:
            prefix = "0"
        else:
            prefix = ""
        return yearurl + prefix + str(day) + "/"

    return [_page_url(day) for day in np.arange(start_day, stop_day, 1)]

def get_links(baseurl):
    """Return the sorted, de-duplicated L1B file URLs listed on ``baseurl``.

    The page at ``baseurl`` is fetched and every ``<a>`` tag is inspected. A
    tag is kept when its HTML text contains ``'L1B'`` and none of ``'xml'``,
    ``'jpg'`` or ``'map'``; its ``href`` is appended to ``baseurl`` to form
    the result URL. Anchors without an ``href`` attribute are skipped.

    Errors raised by ``urlopen`` (e.g. an unreachable or missing page)
    propagate to the caller.
    """
    req = Request(baseurl)
    # Close the HTTP response as soon as it has been parsed.
    with urlopen(req) as html_page:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed (and no warning is emitted).
        soup = BeautifulSoup(html_page, "html.parser")

    excluded_tokens = ('xml', 'jpg', 'map')
    links = set()
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            # An anchor without a target cannot be turned into a URL.
            continue
        tag_html = str(link)
        if 'L1B' in tag_html and not any(token in tag_html for token in excluded_tokens):
            links.add(baseurl + href)
    return sorted(links)

def perform_download(URL, FOLDER, timeout=120):
    """Download ``URL`` into ``FOLDER`` under the URL's final path component.

    Parameters
    ----------
    URL : str
        Address of the file to fetch.
    FOLDER : str
        Destination directory. The file name is appended by plain string
        concatenation, so ``FOLDER`` must end with a path separator.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot hang the
        notebook indefinitely.

    Returns
    -------
    bool
        ``True`` when the file was written; ``False`` when the server
        answered with an HTTP error status or the file could not be written
        (a message is printed in both cases).

    Raises
    ------
    requests.RequestException
        Connection-level failures (including timeouts) raised by
        ``requests.get`` are not caught here.
    """
    result = requests.get(URL, timeout=timeout)
    FILENAME = FOLDER + URL.rsplit('/')[-1]
    try:
        result.raise_for_status()
    except requests.HTTPError:
        print('requests.get() returned an error code '+str(result.status_code))
        return False

    # Write to a temporary name first and rename afterwards, so an interrupted
    # write never leaves a truncated file under the final name.
    partial = FILENAME + '.part'
    try:
        with open(partial, 'wb') as f:
            f.write(result.content)
        os.replace(partial, FILENAME)
    except OSError as err:
        print('could not write '+FILENAME+': '+str(err))
        try:
            os.remove(partial)
        except OSError:
            pass  # nothing to clean up, or cleanup is impossible
        return False
    #print('contents of URL written to '+FILENAME)
    return True
In [None]:
# Walk through every month of `year`, fetch each day's listing page and
# download every listed file that is not already present locally.
mon_ss = bookends(year)  # month_ss = 'Month Start & Stop'
bad_attempts = []  # (link, save_path) pairs whose download failed
for month in tqdm(np.arange(1, 13, 1), "Month", ncols=400, position=0):
    # Two-digit month folder; the trailing separator is required because
    # perform_download builds the file path by string concatenation.
    save_path = root_folder + str(year) + '\\' + str(month).zfill(2) + '\\'
    os.makedirs(save_path, exist_ok=True)

    # Make dailypages list for the current month
    start_day, stop_day = mon_ss[month-1]
    dailypages = get_dl_pages(start_day, stop_day, yearurl)

    for page in tqdm(dailypages, "Day", ncols=400, position=1, leave=False):  # One website per day
        try:
            links = get_links(page)  # Grab all links from the one website (expect <=240 links)
        except Exception:
            # `Exception` (not a bare except) so that interrupting the kernel
            # still stops the run instead of being reported as a missing page.
            print("Can't access page", page, "please check if it exists.")
            break

        # Names of files already on disk, as a set for fast membership tests.
        existing_files = {file.rsplit('\\')[-1] for file in glob.glob(save_path + '\\*.hdf')}

        for link in links:
            if link.rsplit('/')[-1] in existing_files:
                continue
            try:
                # A failure is normally signalled by an exception; an explicit
                # False return value, where provided, is treated the same way.
                succeeded = perform_download(link, save_path) is not False
            except Exception:
                succeeded = False
            if not succeeded:
                print("Bad download attempt logged.")
                bad_attempts.append((link, save_path))

print(year, 'complete. There are', len(bad_attempts), "files unsuccessfully attempted. Retrying...")
still_failed = []  # downloads that failed again on the retry pass
for link, save_path in bad_attempts:
    # Guard each retry so one repeated failure does not abort the others.
    try:
        if perform_download(link, save_path) is False:
            still_failed.append((link, save_path))
    except Exception as err:
        print("Retry failed for", link, "-", err)
        still_failed.append((link, save_path))

if still_failed:
    print(len(still_failed), "files could not be downloaded; see `still_failed`.")
print('Retries complete. Done.')