# Test download scripts


In [1]:
%reset -f

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
import os
import sys
import random
import re
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from typing import Optional
import requests
from bs4 import BeautifulSoup
from time import sleep
from parfive import Downloader
from parfive import SessionConfig


In [4]:
# Set filepaths
# PROJ is the directory the notebook is running from, unless that directory is
# the specific home folder below, in which case the hard-coded project path is
# used instead.
PROJ = Path(os.path.realpath("."))
PROJ = (
    Path(
        "/n/holystore01/LABS/hausmann_lab/lab/glocal_aggregations/shreyas/proj/2023-02-05 - Pipeline/dmsp"
    )
    if str(PROJ) == "/n/home10/shreyasgm"
    else PROJ
)
# ROOT is three levels above PROJ; DATA is the "data" folder inside ROOT
ROOT = PROJ.parents[2]
DATA = ROOT.joinpath("data/")


In [5]:
# Import custom modules: append the project folder, its
# "download_raster/viirs" subfolder and ROOT's "src/" folder to the module
# search path, then pull in every public name from general_utils.
sys.path.append(str(PROJ))
sys.path.append(str(PROJ / "download_raster/viirs"))
sys.path.append(str(ROOT / "src/"))
from general_utils import *

# NOTE(review): later cells call list_available_rasters, get_files_to_download,
# get_headers, unpack_file and tqdm, none of which are defined or imported by
# name in this notebook; presumably they come from the star import above or
# from the disabled import below — confirm.
# from download_viirs import *


In [6]:
def get_table_rows(url):
    """Download a directory-listing page and return the rows of its index table.

    Args:
        url: Address of the HTML page to fetch.

    Returns:
        The list-like collection of every ``<tr>`` element found inside the
        table whose id is "indexlist".

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no table with id "indexlist".
    """
    # A timeout keeps a stalled connection from hanging the notebook forever
    r = requests.get(url, timeout=60)
    # Fail on HTTP errors instead of trying to parse an error page
    r.raise_for_status()
    # Parse html, get list of folders from table with id "indexlist"
    soup = BeautifulSoup(r.text, "html.parser")
    table = soup.find("table", {"id": "indexlist"})
    if table is None:
        # find() returns None when nothing matches; report it explicitly
        raise ValueError(f'No table with id "indexlist" found at {url}')
    # Get all rows of the table
    return table.find_all("tr")


In [7]:
# List available rasters: collect every link in the root listing that points
# to a sub-folder (href ending in "/")
rasters_root_url = "https://eogdata.mines.edu/wwwdata/dmsp/v4composites_rearrange/"
rows = get_table_rows(rasters_root_url)
dmsp_folders_list = []
for row in rows:
    anchor = row.find("a")
    # Rows without a link (find() returns None) cannot name a folder; skip
    # them instead of crashing on None
    if anchor is None:
        continue
    rowlink = anchor.get("href")
    # get() returns None for an anchor without an href attribute
    if rowlink and rowlink.endswith("/"):
        dmsp_folders_list.append(rowlink)


In [8]:
# For each folder, open its listing and record the filenames of the stable
# lights raster and the cloud-free coverage raster.
# dmsp_folder_filenames maps folder href -> (stable_lights_filename, cf_cvg_filename)
dmsp_folder_filenames = {}
for dmsp_folder in dmsp_folders_list:
    rows = get_table_rows(rasters_root_url + dmsp_folder)
    # Reset per folder, using the same names the loop assigns, so that a folder
    # lacking a file is recorded as None instead of silently inheriting the
    # previous folder's filename (or raising NameError on the first folder)
    folder_lights_filename = None
    folder_cf_cvg_filename = None
    for row in rows:
        anchor = row.find("a")
        # Rows without a link cannot name a file; skip them
        if anchor is None:
            continue
        rowlink = anchor.get("href")
        if not rowlink:
            continue
        if rowlink.endswith(".global.intercal.stable_lights.avg_vis.tif"):
            folder_lights_filename = rowlink
        elif rowlink.endswith(".global.cf_cvg.tif"):
            folder_cf_cvg_filename = rowlink
    dmsp_folder_filenames[dmsp_folder] = (
        folder_lights_filename,
        folder_cf_cvg_filename,
    )


In [12]:
# Inspect the listing of the first folder to see how a file size is reported
dmsp_folder = dmsp_folders_list[0]

rows = get_table_rows(rasters_root_url + dmsp_folder)
# Take the second row of the table (index 1)
# NOTE(review): presumably the first entry after a header row — confirm
row = rows[1]
# Get the element of row with class indexcolsize
rowsize = row.find("td", {"class": "indexcolsize"}).text
# Display the size text
rowsize

'116M'

In [24]:
# Tabulate the filenames gathered per folder: one row per folder, with the
# folder name moved out of the index into a "folder" column
dmsp_folders = pd.DataFrame.from_dict(
    dmsp_folder_filenames,
    orient="index",
    columns=["stable_lights_filename", "cf_cvg_filename"],
)
dmsp_folders = dmsp_folders.reset_index().rename(columns={"index": "folder"})
# Drop the last character of the folder name (the trailing "/")
dmsp_folders["folder"] = dmsp_folders["folder"].str[:-1]
# The folder name is "<satellite>_<year>"; split it on the underscore
dmsp_folders[["satellite", "year"]] = dmsp_folders["folder"].str.split("_", expand=True)
# Strip the leading "F" from the satellite name and store the number as int
dmsp_folders["satellite"] = dmsp_folders["satellite"].str[1:].astype(int)
# Keep a single row per year: the one with the highest satellite number
dmsp_folders = dmsp_folders.sort_values(
    ["year", "satellite"], ascending=[True, False]
).drop_duplicates(subset=["year"], keep="first")
# Full download url of the stable lights raster
dmsp_folders["stable_lights_url"] = (
    rasters_root_url + dmsp_folders["folder"] + "/" + dmsp_folders["stable_lights_filename"]
)
# Full download url of the cloud free coverage raster
dmsp_folders["cf_cvg_url"] = (
    rasters_root_url + dmsp_folders["folder"] + "/" + dmsp_folders["cf_cvg_filename"]
)
dmsp_folders.head()


Unnamed: 0,folder,stable_lights_filename,cf_cvg_filename,satellite,year,stable_lights_url,cf_cvg_url
0,F10_1992,F101992.v4b.global.intercal.stable_lights.avg_...,F101992.v4b.global.cf_cvg.tif,10,1992,https://eogdata.mines.edu/wwwdata/dmsp/v4compo...,https://eogdata.mines.edu/wwwdata/dmsp/v4compo...
1,F10_1993,F101993.v4b.global.intercal.stable_lights.avg_...,F101993.v4b.global.cf_cvg.tif,10,1993,https://eogdata.mines.edu/wwwdata/dmsp/v4compo...,https://eogdata.mines.edu/wwwdata/dmsp/v4compo...
3,F12_1994,F121994.v4b.global.intercal.stable_lights.avg_...,F121994.v4b.global.cf_cvg.tif,12,1994,https://eogdata.mines.edu/wwwdata/dmsp/v4compo...,https://eogdata.mines.edu/wwwdata/dmsp/v4compo...
4,F12_1995,F121995.v4b.global.intercal.stable_lights.avg_...,F121995.v4b.global.cf_cvg.tif,12,1995,https://eogdata.mines.edu/wwwdata/dmsp/v4compo...,https://eogdata.mines.edu/wwwdata/dmsp/v4compo...
5,F12_1996,F121996.v4b.global.intercal.stable_lights.avg_...,F121996.v4b.global.cf_cvg.tif,12,1996,https://eogdata.mines.edu/wwwdata/dmsp/v4compo...,https://eogdata.mines.edu/wwwdata/dmsp/v4compo...


In [25]:
# Show the untruncated stable lights url of the first row as a sanity check
dmsp_folders["stable_lights_url"].iloc[0]

'https://eogdata.mines.edu/wwwdata/dmsp/v4composites_rearrange/F10_1992/F101992.v4b.global.intercal.stable_lights.avg_vis.tif'

In [14]:
# Set an environment variable to store your password
# NOTE(review): "your_password" is a placeholder to be replaced locally; avoid
# committing a real password in this cell. Presumably get_headers() reads
# EOG_PASSWORD — confirm.
os.environ["EOG_PASSWORD"] = "your_password"


In [18]:
# Get download links
# NOTE(review): list_available_rasters is not defined in this notebook;
# presumably endswith_filter keeps only links ending with this suffix — confirm
download_links = list_available_rasters(endswith_filter=".median_masked.dat.tif.gz")


In [38]:
# Get files to download
# Folder the downloaded rasters are written to
download_dir = DATA / "raw/rasters/viirs/vnl_v2.1"
# NOTE(review): presumably drops links whose file already exists in
# download_dir — confirm against get_files_to_download
files_to_download = get_files_to_download(download_links, download_dir)
# Get headers (sent with every download request queued in the parfive cell)
headers = get_headers()


In [39]:
# Number of entries in files_to_download
len(files_to_download)


1

In [40]:
# Preview at most the first two entries
files_to_download[0:2]


['https://eogdata.mines.edu/nighttime_light/annual/v21/2017/VNL_v21_npp_2017_global_vcmslcfg_c202205302300.median_masked.dat.tif.gz']

In [41]:
# Use parfive to download files
# One file at a time, each file fetched in up to 3 parts, with progress bars;
# overwrite=False leaves files that already exist untouched
dl = Downloader(max_conn=1, max_splits=3, progress=True, overwrite=False)
# Queue every url, saving into download_dir and sending the prepared headers
for file_to_queue in files_to_download:
    dl.enqueue_file(file_to_queue, path=download_dir, headers=headers)
# Download
res = dl.download()


Files Downloaded:   0%|          | 0/1 [00:00<?, ?file/s]

VNL_v21_npp_2017_global_vcmslcfg_c202205302300.median_masked.dat.tif.gz:   0%|          | 0.00/287M [00:00<?, …

In [42]:
# Inspect the downloads that failed
res.errors


[]

In [43]:
# if any files error out, retry
if res.errors:
    # Wait 5 seconds, then retry the failed downloads once
    sleep(5)
    dl.retry(res)


In [46]:
# Unpack each downloaded .gz archive, with a progress bar over all archives
for gz_path in tqdm(list(download_dir.glob("*.gz"))):
    # The unpacked file has the same name without the final ".gz" suffix;
    # skip archives that were already unpacked
    if gz_path.with_suffix("").exists():
        continue
    unpack_file(gz_path)


100%|██████████| 11/11 [05:50<00:00, 31.87s/it]
