# Test download scripts


In [1]:
%reset -f

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
import os
import sys
import random
import re
import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from typing import Optional
import requests
from bs4 import BeautifulSoup
from time import sleep
from parfive import Downloader
from parfive import SessionConfig


In [4]:
# Set filepaths
PROJ = Path(os.path.realpath("."))
if str(PROJ) == "/n/home10/shreyasgm":
    PROJ = Path(
        "/n/holystore01/LABS/hausmann_lab/lab/glocal_aggregations/shreyas/proj/2023-02-05 - Pipeline"
    )
ROOT = PROJ.parents[1]
DATA = ROOT / "data/"


In [5]:
# Import custom modules
sys.path.append(str(PROJ))
sys.path.append(str(PROJ / "download_raster/viirs"))
sys.path.append(str(ROOT / "src/"))
from general_utils import *
# from download_viirs import *


In [6]:
# List available rasters
rasters_root_url = "https://eogdata.mines.edu/wwwdata/dmsp/extension_series/"
endswith_filter = ".global.stable_lights.avg_vis.tif"
# Get list of folders listed in the table
soup = BeautifulSoup(requests.get(rasters_root_url).text, "html.parser")
table = soup.find("table")
rows = table.find_all("tr")
rows = rows[1:]
rows = [row.find_all("td") for row in rows]
rows = [[cell.text for cell in row] for row in rows]
folderlist = []
for row in rows:
    for cell in row:
        if cell.endswith("/"):
            folderlist.append(cell)
# For each year, keep only the latest satellite
folder_df = pd.DataFrame({"folder": folderlist})
folder_df[["satellite", "year"]] = folder_df["folder"].str.replace("/", "").str.split("_", expand=True)
folder_df["year"] = folder_df["year"].astype(int)
folder_df = folder_df.sort_values(["year", "satellite"], ascending=[True, False])
# For each year, get the latest satellite
folder_df = folder_df.groupby("year").head(1)
folderlist = folder_df["folder"].tolist()
# Loop through folders and get list of files that matches the filter
download_links = []
for folder in folderlist:
    # Get the table inside each folder
    folder_url = f"{rasters_root_url}{folder}/annual/"
    soup = BeautifulSoup(requests.get(folder_url).text, "html.parser")
    table = soup.find("table")
    rows = table.find_all("tr")
    rows = rows[1:]
    for row in rows:
        rowlink = row.find("a").get("href")
        endswith_met = endswith_filter is None or rowlink.endswith(endswith_filter)
        if rowlink.endswith(".tif") and endswith_met:
            row_download_link = f"{folder_url}{rowlink}"
            download_links.append(row_download_link)

In [7]:
download_links

['https://eogdata.mines.edu/wwwdata/dmsp/extension_series/F15_2013//annual/F15_20130101_20131231.global.stable_lights.avg_vis.tif',
 'https://eogdata.mines.edu/wwwdata/dmsp/extension_series/F15_2014//annual/F15_20140101_20141231.global.stable_lights.avg_vis.tif',
 'https://eogdata.mines.edu/wwwdata/dmsp/extension_series/F15_2015//annual/F15_20150101_20151231.global.stable_lights.avg_vis.tif',
 'https://eogdata.mines.edu/wwwdata/dmsp/extension_series/F16_2016//annual/F16_20160101_20161231.global.stable_lights.avg_vis.tif',
 'https://eogdata.mines.edu/wwwdata/dmsp/extension_series/F16_2017//annual/F16_20170101_20171231.global.stable_lights.avg_vis.tif',
 'https://eogdata.mines.edu/wwwdata/dmsp/extension_series/F16_2018//annual/F16_20180101_20181231.global.stable_lights.avg_vis.tif',
 'https://eogdata.mines.edu/wwwdata/dmsp/extension_series/F16_2019//annual/F16_20190101_20191231.global.stable_lights.avg_vis.tif',
 'https://eogdata.mines.edu/wwwdata/dmsp/extension_series/F16_2020//annual/F