In [None]:
import os
from multiprocessing import Pool

import numpy as np
import pandas as pd
import pyhdf
import requests
from pyhdf.SD import SD
from pyhdf.error import HDF4Error
from tqdm.auto import tqdm
from calendar import monthrange
from os import listdir
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

In [None]:
# Year whose AIRS Level-1B granules are fetched.
year = 2003
# Local directory under which the per-month folders ('01\\' ... '12\\') are looked up.
root_folder = 'C:\\data\\AIRS\\L1B\\'

# Root of the AIRIBRAD.005 archive, and the listing page of the selected year.
sourceurl = "https://airsl1.gesdisc.eosdis.nasa.gov/data/Aqua_AIRS_Level1/AIRIBRAD.005/"
yearurl = "{}{}/".format(sourceurl, year)

In [None]:
class SessionWithHeaderRedirection(requests.Session):
    """A ``requests.Session`` whose ``rebuild_auth`` keeps the Authorization
    header on redirects that involve ``AUTH_HOST``.

    The header is dropped only when a redirect changes host and neither the
    originating host nor the redirect target is ``AUTH_HOST``.
    """

    # Host name that redirects may go to or come from without losing the header.
    AUTH_HOST = 'urs.earthdata.nasa.gov'

    def __init__(self, username, password):
        """Create the session and store ``(username, password)`` as its auth."""
        super().__init__()
        self.auth = (username, password)

    def rebuild_auth(self, prepared_request, response):
        """Decide whether the redirected request may keep its Authorization header.

        ``prepared_request`` is the request about to be sent to the redirect
        target; ``response`` is the redirect response that triggered it.
        """
        headers = prepared_request.headers
        if 'Authorization' not in headers:
            # Nothing to strip.
            return

        source_host = requests.utils.urlparse(response.request.url).hostname
        target_host = requests.utils.urlparse(prepared_request.url).hostname

        changes_host = source_host != target_host
        touches_auth_host = self.AUTH_HOST in (source_host, target_host)
        if changes_host and not touches_auth_host:
            del headers['Authorization']

In [None]:
def bookends(year):
    """Return the day-of-year span of each month of ``year``.

    The result is a 12-item list of ``(first_day, end_day)`` tuples, one per
    month in calendar order. ``first_day`` is the 1-based day-of-year on which
    the month starts and ``end_day`` is ``first_day`` plus the number of days
    in the month, i.e. an exclusive upper bound.
    """
    spans = []
    first_day = 1
    for month in range(1, 13):
        days_in_month = monthrange(year, month)[1]
        spans.append((first_day, first_day + days_in_month))
        # The next month starts right after this one ends.
        first_day += days_in_month
    return spans

def get_dl_pages(start_day, stop_day, yearurl):
    """Build one listing URL per day in ``[start_day, stop_day)``.

    Each URL is ``yearurl`` followed by the day number and a trailing slash.
    Day numbers below 10 are prefixed with "00" and those below 100 with "0".
    """
    return [
        yearurl + ("00" if day < 10 else ("0" if day < 100 else "")) + str(day) + "/"
        for day in np.arange(start_day, stop_day, 1)
    ]

def get_files(path):
    """Return the names of the regular files located directly inside ``path``.

    Sub-directories are skipped and the listing is not recursive. Names are
    returned without the directory part, in the order ``os.listdir`` yields
    them.

    Raises whatever ``os.listdir`` raises when ``path`` cannot be listed
    (for example ``FileNotFoundError`` if it does not exist).
    """
    # `isfile` and `join` were never imported by name in this notebook, so they
    # are reached through the already-imported `os` module.
    return [name for name in os.listdir(path) if os.path.isfile(os.path.join(path, name))]

def get_links(baseurl):
    """Collect the L1B data-file URLs listed on the page at ``baseurl``.

    The page is fetched and every ``<a>`` tag is inspected. A tag is kept when
    its HTML contains 'L1B' and none of 'xml', 'jpg' or 'map'. Anchors without
    an ``href`` attribute are ignored.

    Parameters
    ----------
    baseurl : str
        URL of one listing page, e.g.
        "https://airsl1.gesdisc.eosdis.nasa.gov/data/Aqua_AIRS_Level1/AIRIBRAD.005/2003/002/".
        Each ``href`` is appended to it verbatim.

    Returns
    -------
    list of str
        Sorted, de-duplicated list of ``baseurl + href``.

    Raises
    ------
    Whatever ``urlopen`` raises when the page cannot be fetched.
    """
    # Release the HTTP connection as soon as the page has been parsed.
    with urlopen(Request(baseurl)) as html_page:
        # Name the parser explicitly: without it BeautifulSoup guesses one,
        # warns about it, and may parse differently from machine to machine.
        soup = BeautifulSoup(html_page, "html.parser")

    excluded_markers = ('xml', 'jpg', 'map')
    links = set()
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            # An anchor without a target cannot be turned into a URL.
            continue
        tag_html = str(link)
        if 'L1B' in tag_html and not any(marker in tag_html for marker in excluded_markers):
            links.add(baseurl + href)
    return sorted(links)

def perform_download(url, output_dir, username, password):
    """Download ``url`` into ``output_dir`` using an authenticated session.

    The file is named after the last path segment of ``url``. Data is streamed
    into a temporary ``<name>.part`` file which is renamed to the final name
    only after the whole body has been written, so a failed or interrupted
    download never leaves a truncated file under the final name.

    Parameters
    ----------
    url : str
        Address of the file to download.
    output_dir : str
        Existing directory that receives the file.
    username, password : str
        Credentials handed to ``SessionWithHeaderRedirection``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with an HTTP error status.
    Exception
        For any other failure; the original error is attached as the cause.
    """
    filename = os.path.join(output_dir, url.split('/')[-1])
    partial_filename = filename + '.part'

    with SessionWithHeaderRedirection(username, password) as http_client:
        try:
            # The response is used as a context manager so the streamed
            # connection is released even when writing fails half-way.
            with http_client.get(url, stream=True, timeout=10) as response:
                response.raise_for_status()  # raise an exception in case of http errors

                with open(partial_filename, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:  # skip empty keep-alive chunks
                            f.write(chunk)

            # Publish the file under its final name only once it is complete.
            os.replace(partial_filename, filename)

        except requests.exceptions.HTTPError:
            raise
        except Exception as e:
            # Same exception type and message as before, but keep the cause.
            raise Exception('Unhandled exception: %s' % e) from e
        finally:
            # Best-effort cleanup of an incomplete download; after a successful
            # rename the temporary file no longer exists.
            if os.path.exists(partial_filename):
                try:
                    os.remove(partial_filename)
                except OSError:
                    pass

def download_files(urls, count_callback=None, output_dir='C:\\data\\AIRS\\L1B\\',
                   username=None, password=None):
    """Download every URL in ``urls`` using a pool of five worker processes.

    Each URL is submitted as its own task, so one failing download does not
    stop the remaining URLs from being attempted. Failures are printed and do
    not raise; the function always returns ``None``.

    Callbacks handed to the pool cannot be bound or unbound methods in
    Windows, so plain functions are used instead.

    Parameters
    ----------
    urls : iterable of str
        Addresses of the files to download.
    count_callback : callable, optional
        Called once with the number of URLs before any download starts.
    output_dir : str, optional
        Directory passed to ``perform_download``; defaults to the location
        that was previously hard-coded.
    username, password : str, optional
        Credentials passed to ``perform_download``. When omitted they are read
        from the ``EARTHDATA_USERNAME`` / ``EARTHDATA_PASSWORD`` environment
        variables, falling back to the placeholders 'USERNAME' / 'PASSWORD'.
    """
    urls = list(urls)

    if count_callback is not None:
        count_callback(len(urls))

    if not urls:
        # Nothing to do: avoid starting worker processes for an empty batch.
        return

    # Prefer credentials from the environment over literals in the notebook.
    if username is None:
        username = os.environ.get('EARTHDATA_USERNAME', 'USERNAME')
    if password is None:
        password = os.environ.get('EARTHDATA_PASSWORD', 'PASSWORD')

    def _failure_reporter(url):
        # Bind `url` now so every task reports its own address.
        return lambda exc: print('Download failed for %s: %r' % (url, exc))

    with Pool(processes=5) as pool:
        for url in urls:
            # One task per URL: a batched map would abandon the rest of a
            # chunk as soon as one download in it raised.
            pool.apply_async(
                perform_download,
                (url, output_dir, username, password),
                error_callback=_failure_reporter(url),
            )
        pool.close()
        pool.join()

In [None]:
# Driver: walk through the twelve months of `year`, and for every day of the
# month download the L1B files listed on that day's page.
mon_ss = bookends(year)  # mon_ss = 'Month Start & Stop' (day-of-year bounds per month)

for month in tqdm(np.arange(1, 13, 1), "Month", ncols=400, position=0):
    # Month folders are named '01' ... '12' below root_folder.
    save_path = root_folder + '%02d' % month + '\\'

    # Make sure the month's folder exists so that listing it cannot fail,
    # then remember which files are already stored in it.
    os.makedirs(save_path, exist_ok=True)
    existing_files = set(get_files(save_path))

    # Make dailypages list for the current month
    dailypages = get_dl_pages(mon_ss[month-1][0], mon_ss[month-1][1], yearurl)

    for page in tqdm(dailypages, "Day", ncols=400, position=1):  # One website per day
        try:
            links = get_links(page)  # Grab all links from the one website (expect <=240 links)
        except Exception as exc:
            # `Exception` rather than a bare `except`, so interrupting the
            # kernel (KeyboardInterrupt) still stops the loop. The reason is
            # printed to tell a missing page from e.g. a network failure.
            print("Can't access page", page, "please check if it exists.", "Reason:", repr(exc))
            break  # give up on the remaining days of this month

        # Skip files whose name is already present in this month's folder.
        links = [link for link in links if link.split('/')[-1] not in existing_files]
        download_files(links)