Skip to content

Commit

Permalink
Merge pull request #205 from astrophpeter/data-init
Browse files Browse the repository at this point in the history
Passes with latest commit e427132.
  • Loading branch information
manning-ncsa committed Apr 24, 2024
2 parents 8ecdf6f + e427132 commit d9a989d
Show file tree
Hide file tree
Showing 20 changed files with 642 additions and 245 deletions.
7 changes: 7 additions & 0 deletions app/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
FROM python:3.11 as mc

RUN curl https://dl.min.io/client/mc/release/linux-amd64/mc -o /usr/local/bin/mc && \
chmod +x /usr/local/bin/mc

FROM python:3.11
ENV PYTHONUNBUFFERED 1

Expand All @@ -22,3 +27,5 @@ RUN mkdir /app

COPY . /app
WORKDIR /app

COPY --from=mc /usr/local/bin/mc /usr/local/bin/mc
38 changes: 21 additions & 17 deletions app/app/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,23 +150,27 @@


MEDIA_URL = "/cutouts/"
# os.path.join(os.path.dirname(BASE_DIR), '../cutout_cdn')

MEDIA_ROOT = os.path.join(os.path.dirname(BASE_DIR), "../data")
CUTOUT_ROOT = os.path.join(os.path.dirname(BASE_DIR), "../data/cutout_cdn")
SED_OUTPUT_ROOT = os.path.join(os.path.dirname(BASE_DIR), "../data/sed_output")
SBI_TRAINING_ROOT = os.path.join(os.path.dirname(BASE_DIR), "../data/sbi_training_sets")
GHOST_OUTPUT_ROOT = os.path.join(os.path.dirname(BASE_DIR), "../ghost_output")
GHOST_DUST_PATH = os.path.join(
os.path.dirname(BASE_DIR), "../data/ghost_data/dust_model"
)
GHOST_PHOTOZ_PATH = os.path.join(
os.path.dirname(BASE_DIR), "../data/ghost_data/photoz_model/MLP_lupton.hdf5"
)
TNS_STAGING_ROOT = os.path.join(os.path.dirname(BASE_DIR), "../tns_staging")
TRANSMISSION_CURVES_ROOT = os.path.join(os.path.dirname(BASE_DIR), "../transmission")
SBIPP_ROOT = os.path.join(os.path.dirname(BASE_DIR), "../sbipp")
SBIPP_PHOT_ROOT = os.path.join(os.path.dirname(BASE_DIR), "../sbipp_phot")

# Filesystem locations used by the application. Each setting reads the
# environment variable of the same name and falls back to a default.
DUSTMAPS_DATA_ROOT = os.getenv("DUSTMAPS_DATA_ROOT", "/data/dustmaps")
CUTOUT_ROOT = os.getenv("CUTOUT_ROOT", "/data/cutout_cdn")
SED_OUTPUT_ROOT = os.getenv("SED_OUTPUT_ROOT", "/data/sed_output")
SBI_TRAINING_ROOT = os.getenv("SBI_TRAINING_ROOT", "/data/sbi_training_sets")
GHOST_OUTPUT_ROOT = os.getenv("GHOST_OUTPUT_ROOT", "/data/ghost_output")

# The GHOST model paths default to locations inside GHOST_DATA_ROOT, so the
# root has to be resolved before them.
GHOST_DATA_ROOT = os.getenv("GHOST_DATA_ROOT", "/data/ghost_data")
GHOST_DUST_PATH = os.getenv(
    "GHOST_DUST_PATH",
    os.path.join(GHOST_DATA_ROOT, "dust_model"),
)
GHOST_PHOTOZ_PATH = os.getenv(
    "GHOST_PHOTOZ_PATH",
    os.path.join(GHOST_DATA_ROOT, "photoz_model/MLP_lupton.hdf5"),
)

TNS_STAGING_ROOT = os.getenv("TNS_STAGING_ROOT", "/data/tns_staging")
SBIPP_ROOT = os.getenv("SBIPP_ROOT", "/data/sbipp")
SBIPP_PHOT_ROOT = os.getenv("SBIPP_PHOT_ROOT", "/data/sbipp_phot")
TRANSMISSION_CURVES_ROOT = os.getenv(
    "TRANSMISSION_CURVES_ROOT",
    "/data/transmission",
)

CUTOUT_OVERWRITE = os.environ.get("CUTOUT_OVERWRITE", "False")

Expand Down
4 changes: 0 additions & 4 deletions app/app/settings_slim.py

This file was deleted.

367 changes: 367 additions & 0 deletions app/entrypoints/blast-data.md5sums

Large diffs are not rendered by default.

10 changes: 8 additions & 2 deletions app/entrypoints/docker-entrypoint.app.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,17 @@ if [[ ! -f "/root/.dustmapsrc" && -f "/tmp/.dustmapsrc" ]]; then
cp /tmp/.dustmapsrc /root/.dustmapsrc
fi

INIT_STARTED_DB=/mnt/data/.initializing_db
INIT_STARTED_DATA=/mnt/data/.initializing_data
INIT_STARTED_DATA="${DATA_ROOT_DIR}/.initializing_data"
INIT_STARTED_DB="${DATA_ROOT_DIR}/.initializing_db"

if [[ "${FORCE_INITIALIZATION}" == "true" ]]; then
rm -f "${INIT_STARTED_DATA}"
rm -f "${INIT_STARTED_DB}"
fi

## Initialize astro data
##

if [[ -f "${INIT_STARTED_DATA}" ]]
then
echo "Astro data is currently being initialized (\"${INIT_STARTED_DATA}\" exists)."
Expand Down
160 changes: 59 additions & 101 deletions app/entrypoints/initialize_all_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,119 +9,77 @@ cd "${SCRIPT_DIR}"

bash initialize_data_dirs.sh

# TODO: As the data archive file gets larger, this may exceed
# local disk space. If we assume bulk storage is mounted at
# /mnt/data, it may be better to use /mnt/data/tmp.
cd /tmp

if [[ "${USE_DATA_ARCHIVE}" == "true" ]]; then
##
## Install data files from compiled archive
##

# TODO: These are not comprehensive data integrity checks;
# we are only spot-checking the data directories.
if [[ "${FORCE_DATA_DOWNLOAD}" != "true" && \
-f "/fsps/README.md" && \
-f "/sbipp_phot/sbi_phot_local.h5" && \
-f "/dustmaps/sfd/SFD_dust_4096_ngp.fits" && \
-f "/transmission/2MASS_H.txt" ]]
then
echo "Required data files already downloaded."
else
if [[ "${USE_LOCAL_ARCHIVE_FILE}" == "true" ]]; then
echo "Installing data from archive file \"${DATA_ARCHIVE_FILE}\"..."
if [[ ! -f "${DATA_ARCHIVE_FILE}" ]]; then
echo "Data archive file \"${DATA_ARCHIVE_FILE}\" not found. Aborting."
exit 1
fi
else
if [[ -f "${DATA_ARCHIVE_FILE}" ]]; then
echo "Data archive file already downloaded."
else
echo "Downloading data archive file from \"${DATA_ARCHIVE_FILE_URL}\"..."
curl -LJO "${DATA_ARCHIVE_FILE_URL}"
echo "Download complete."
fi
fi

# Extract and install the data files
echo "Extracting data archive..."
tar -xzf "${DATA_ARCHIVE_FILE}"
echo "Data extracted. Installing data files..."
rsync -va data/cutout_cdn/2010ag/ /data/cutout_cdn/2010ag/
rsync -va data/cutout_cdn/2010ai/ /data/cutout_cdn/2010ai/
rsync -va data/cutout_cdn/2010H/ /data/cutout_cdn/2010H/
rsync -va data/sed_output/2010H/ /data/sed_output/2010H/
rsync -va data/sbipp/ /sbipp/
rsync -va data/transmission/ /transmission/
rsync -va data/fsps/ /fsps/
rsync -va data/sbipp_phot/ /sbipp_phot/
rsync -va data/dustmaps/ /dustmaps/
rsync -va data/ghost_data/ /data/ghost_data/
rsync -va data/sbi_training_sets/ /data/sbi_training_sets/
echo "Data installed."

# Clean up temporary files
if [[ "${USE_LOCAL_ARCHIVE_FILE}" != "true" ]]; then
# Ignore error upon deletion to support cases where the file is mounted read-only
set +e
rm -f "${DATA_ARCHIVE_FILE}"
set -e
fi
rm -rf data
# Extract a gzip-compressed tar archive into a target directory.
# Arguments:
#   $1 - path to the archive file (checked for existence before extraction)
#   $2 - directory the archive contents are extracted into
# Returns:
#   1 if the archive file given as $1 does not exist.
extract_data_archive_file() {
local file_path=$1
local extract_dir=$2
# Remember the caller's working directory so it can be restored after extraction.
local original_dir=$(pwd)
echo "INFO: Installing data from archive file \"${file_path}\"..."
if [[ ! -f "${file_path}" ]]; then
echo "ERROR: Data archive file \"${file_path}\" not found. Aborting."
return 1
fi

else

##
## Install data files from original sources
##

if [[ -f "/fsps/README.md" ]]
echo "Extracting data archive..."
# Data archive file has top-level directory "data"
# (--strip-components=1 drops that leading path component, so the contents
# land directly inside the extraction directory.)
cd "${extract_dir}"
# NOTE(review): tar reads the global ${DATA_ARCHIVE_FILE} rather than the
# "${file_path}" argument validated above; the two only agree when the caller
# passes ${DATA_ARCHIVE_FILE} as $1 -- confirm, or switch to "${file_path}".
# NOTE(review): because of the preceding cd, a relative archive path would be
# resolved against the extraction directory -- presumably an absolute path is
# expected; verify against the caller.
tar --strip-components=1 -xzf "${DATA_ARCHIVE_FILE}"
cd "${original_dir}"
}

# Check the data files under a directory against the checksum list
# "${SCRIPT_DIR}/blast-data.md5sums".
# Arguments:
#   $1 - data root directory; md5sum is run from inside it
# Returns:
#   0 if md5sum reports success, 1 otherwise.
# Side effects:
#   Sets the global DATA_INTEGRITY_VALID (it is not declared local) to the
#   md5sum exit status, and leaves errexit (set -e) enabled on return.
verify_data_integrity() {
# Verify data file integrity.
local data_root_dir=$1
# Remember the caller's working directory so it can be restored afterwards.
local original_dir=$(pwd)
cd "${data_root_dir}"
# Disable errexit so a failed check can be captured in a variable instead of
# aborting the script.
set +e
# --status suppresses md5sum's output; only the exit status is used.
md5sum --check --status "${SCRIPT_DIR}/blast-data.md5sums"
DATA_INTEGRITY_VALID=$?
# NOTE(review): errexit is re-enabled unconditionally, whatever its state was
# before this function ran.
set -e
cd "${original_dir}"
if [[ "${DATA_INTEGRITY_VALID}" == "0" ]]
then
echo "fsps files already downloaded"
echo "INFO: Required data files pass integrity check."
return 0
else
echo "downloading fsps files"
set -e
git clone https://github.com/cconroy20/fsps.git /fsps
set +e
rm -rf /fsps/.git
echo "ERROR: Required data files failed integrity check."
return 1
fi

if [[ -f "/sbipp_phot/sbi_phot_local.h5" ]]
}

# Mirror the published data set from the remote object store into a local
# directory using the MinIO client ("mc").
# Arguments:
#   $1 - local data root directory to mirror into
# Returns:
#   1 if the argument is empty or the directory path cannot be resolved;
#   otherwise the exit status of the final mc command.
download_data_archive() {
    local data_root_dir=$1
    local target_dir

    echo "INFO: Downloading data from archive..."

    # Validate and resolve the destination before touching anything. With the
    # path resolution inlined in the mirror command, an empty or unresolvable
    # argument would collapse the target to "/" and "mc mirror --overwrite"
    # would write into the filesystem root.
    if [[ -z "${data_root_dir}" ]]; then
        echo "ERROR: No data root directory specified for download. Aborting."
        return 1
    fi
    if ! target_dir="$(readlink -f "${data_root_dir}")" || [[ -z "${target_dir}" ]]; then
        echo "ERROR: Unable to resolve data root directory \"${data_root_dir}\". Aborting."
        return 1
    fi

    mc alias set blast https://js2.jetstream-cloud.org:8001 anonymous
    # The trailing slashes are important!
    mc mirror --overwrite --json blast/blast-astro-data/v1/data/ "${target_dir}/"
}

# Verify data file integrity and attempt to (re)install required files if necessary
if ! verify_data_integrity "${DATA_ROOT_DIR}"
then
# Download and install data from archive
if [[ "${USE_LOCAL_ARCHIVE_FILE}" == "true" ]]
then
echo "SBI/files already downloaded"
# Extract data from local archive file
extract_data_archive_file "${DATA_ARCHIVE_FILE}" "${DATA_ROOT_DIR}"
else
echo "downloading SBI files"
set -e
curl -LJO https://zenodo.org/records/10703208/files/sbi_phot_global.h5
curl -LJO https://zenodo.org/records/10703208/files/sbi_phot_local.h5
mv sbi_phot_global.h5 /sbipp_phot/
mv sbi_phot_local.h5 /sbipp_phot/
set +e
# Download data from remote archive
download_data_archive "${DATA_ROOT_DIR}"
fi

if [[ -f "data/transmission/2MASS_H.txt" ]]
# Verify data file integrity
if ! verify_data_integrity "${DATA_ROOT_DIR}"
then
echo "Remaining data already downloaded"
else
set -e
git clone https://github.com/astrophpeter/blast.git /tmp/blast
cd /tmp/blast
rsync -va data/cutout_cdn/2010ag/ /data/cutout_cdn/2010ag/
rsync -va data/cutout_cdn/2010ai/ /data/cutout_cdn/2010ai/
rsync -va data/cutout_cdn/2010H/ /data/cutout_cdn/2010H/
rsync -va data/sed_output/2010H/ /data/sed_output/2010H/
rsync -va data/sbipp/ /sbipp/
rsync -va data/transmission/ /transmission/
set +e
rm -rf /tmp/blast
echo "ERROR: Downloaded/extracted data files failed integrity check. Aborting."
exit 1
fi

echo "Data installed."
fi

cd "${SCRIPT_DIR}"/..
python init_data.py
# Skip the redundant installation of dustmap data and its config file:
# "init_data.py" executes "app/entrypoints/initialize_dustmaps.py", which
# downloads the SFD files if they are missing and initializes a ".dustmapsrc" file.
# cd "${SCRIPT_DIR}"/..
# python init_data.py

echo "Data initialization complete."
86 changes: 50 additions & 36 deletions app/entrypoints/initialize_data_dirs.sh
Original file line number Diff line number Diff line change
@@ -1,54 +1,68 @@
#!/bin/bash
set -e

cd /mnt/data

mkdir -p cutout_cdn
mkdir -p sed_output
mkdir -p ghost_data
mkdir -p ghost_output
mkdir -p tns_staging
mkdir -p transmission
mkdir -p dustmaps
mkdir -p fsps
mkdir -p sbipp
mkdir -p sbipp_phot
mkdir -p sbi_training_sets
# Ensure data root directory is configured
if [[ "${DATA_ROOT_DIR}x" == "x" ]]; then
echo "ERROR: DATA_ROOT_DIR environment variable must not be empty. Aborting."
exit 1
fi

mkdir -p /data/
cd "${DATA_ROOT_DIR}"

# The creation of symlinks should error if there is a non-symlink
# file or folder where the symlink should be.
if [[ ! -L "/data/cutout_cdn" ]]; then
ln -s /mnt/data/cutout_cdn /data/cutout_cdn
if [[ ! -L "${CUTOUT_ROOT}" ]]; then
mkdir -p "${DATA_ROOT_DIR}"/cutout_cdn
mkdir -p "$(dirname "${CUTOUT_ROOT}")"
ln -s "${DATA_ROOT_DIR}/cutout_cdn" "${CUTOUT_ROOT}"
fi
if [[ ! -L "/data/sed_output" ]]; then
ln -s /mnt/data/sed_output /data/sed_output
if [[ ! -L "${SED_OUTPUT_ROOT}" ]]; then
mkdir -p "${DATA_ROOT_DIR}"/sed_output
mkdir -p "$(dirname "${SED_OUTPUT_ROOT}")"
ln -s "${DATA_ROOT_DIR}/sed_output" "${SED_OUTPUT_ROOT}"
fi
if [[ ! -L "/data/ghost_data" ]]; then
ln -s /mnt/data/ghost_data /data/ghost_data
if [[ ! -L "${GHOST_DATA_ROOT}" ]]; then
mkdir -p "${DATA_ROOT_DIR}"/ghost_data
mkdir -p "$(dirname "${GHOST_DATA_ROOT}")"
ln -s "${DATA_ROOT_DIR}/ghost_data" "${GHOST_DATA_ROOT}"
fi
if [[ ! -L "/ghost_output" ]]; then
ln -s /mnt/data/ghost_output /ghost_output
if [[ ! -L "${GHOST_OUTPUT_ROOT}" ]]; then
mkdir -p "${DATA_ROOT_DIR}"/ghost_output
mkdir -p "$(dirname "${GHOST_OUTPUT_ROOT}")"
ln -s "${DATA_ROOT_DIR}/ghost_output" "${GHOST_OUTPUT_ROOT}"
fi
if [[ ! -L "/tns_staging" ]]; then
ln -s /mnt/data/tns_staging /tns_staging
if [[ ! -L "${TNS_STAGING_ROOT}" ]]; then
mkdir -p "${DATA_ROOT_DIR}"/tns_staging
mkdir -p "$(dirname "${TNS_STAGING_ROOT}")"
ln -s "${DATA_ROOT_DIR}/tns_staging" "${TNS_STAGING_ROOT}"
fi
if [[ ! -L "/transmission" ]]; then
ln -s /mnt/data/transmission /transmission
if [[ ! -L "${TRANSMISSION_CURVES_ROOT}" ]]; then
mkdir -p "${DATA_ROOT_DIR}"/transmission
mkdir -p "$(dirname "${TRANSMISSION_CURVES_ROOT}")"
ln -s "${DATA_ROOT_DIR}/transmission" "${TRANSMISSION_CURVES_ROOT}"
fi
if [[ ! -L "/dustmaps" ]]; then
ln -s /mnt/data/dustmaps /dustmaps
if [[ ! -L "${DUSTMAPS_DATA_ROOT}" ]]; then
mkdir -p "${DATA_ROOT_DIR}"/dustmaps
mkdir -p "$(dirname "${DUSTMAPS_DATA_ROOT}")"
ln -s "${DATA_ROOT_DIR}/dustmaps" "${DUSTMAPS_DATA_ROOT}"
fi
if [[ ! -L "/fsps" ]]; then
ln -s /mnt/data/fsps /fsps
if [[ ! -L "${SPS_HOME}" ]]; then
mkdir -p "${DATA_ROOT_DIR}"/fsps
mkdir -p "$(dirname "${SPS_HOME}")"
ln -s "${DATA_ROOT_DIR}/fsps" "${SPS_HOME}"
fi
if [[ ! -L "/sbipp" ]]; then
ln -s /mnt/data/sbipp /sbipp
if [[ ! -L "${SBIPP_ROOT}" ]]; then
mkdir -p "${DATA_ROOT_DIR}"/sbipp
mkdir -p "$(dirname "${SBIPP_ROOT}")"
ln -s "${DATA_ROOT_DIR}/sbipp" "${SBIPP_ROOT}"
fi
if [[ ! -L "/sbipp_phot" ]]; then
ln -s /mnt/data/sbipp_phot /sbipp_phot
if [[ ! -L "${SBIPP_PHOT_ROOT}" ]]; then
mkdir -p "${DATA_ROOT_DIR}"/sbipp_phot
mkdir -p "$(dirname "${SBIPP_PHOT_ROOT}")"
ln -s "${DATA_ROOT_DIR}/sbipp_phot" "${SBIPP_PHOT_ROOT}"
fi
if [[ ! -L "/data/sbi_training_sets" ]]; then
ln -s /mnt/data/sbi_training_sets /data/sbi_training_sets
if [[ ! -L "${SBI_TRAINING_ROOT}" ]]; then
mkdir -p "${DATA_ROOT_DIR}"/sbi_training_sets
mkdir -p "$(dirname "${SBI_TRAINING_ROOT}")"
ln -s "${DATA_ROOT_DIR}/sbi_training_sets" "${SBI_TRAINING_ROOT}"
fi
3 changes: 1 addition & 2 deletions app/entrypoints/initialize_dustmaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
from django.conf import settings
import os

media_root = settings.MEDIA_ROOT
config.reset()

config["data_dir"] = f"{media_root}/../dustmaps"
config["data_dir"] = settings.DUSTMAPS_DATA_ROOT

# Download data if it is missing
for data_file in [
Expand Down
Loading

0 comments on commit d9a989d

Please sign in to comment.