In [1]:
from tqdm.notebook import tqdm

In [2]:
import hashlib
import os
import pathlib

import numpy as np
import pandas as pd
import requests

import vasca.utils as vutils

# Read file manifest

In [3]:
# Read in URLs form the file manifest as DataFrame. Each column corresponds to a
# directory level but URLs have different depth. To handle this, the maximum number of
# levels are inferred by a look-ahead counting the number of levels line by line.

# Dynamically generate column names

# Input
try:
    manifest_path = pathlib.Path(__file__).parent / "pipeDirectoryListing.txt"
except NameError:
    # Handle jupyter notebook problem with __file__
    manifest_path = rf"{os.path.abspath('')}/pipeDirectoryListing.txt"
    



# Delimiter
data_file_delimiter = "/"

# The max column count a line in the file could have
largest_column_count = 0

# Count number of lines
lcounter = 0

# Loop the data lines
with open(manifest_path, "r") as temp_f:
    # Read the lines
    lines = temp_f.readlines()

    for line in lines:
        # Count the column count for the current line
        column_count = len(line.split(data_file_delimiter)) + 1

        # Set the new most column count
        largest_column_count = (
            column_count
            if largest_column_count < column_count
            else largest_column_count
        )

        lcounter += 1

# Generate column names (will be L0, L1, L2, ..., L<largest_column_count - 1>)
column_names = [f"L{i}" for i in range(0, largest_column_count)]

print("URL manifest info:")
print(f"    - Number of lines: {lcounter}")
print(f"    - Maximum number of directory levels: {largest_column_count}")

URL manifest info:
    - Number of lines: 256225
    - Maximum number of directory levels: 17


In [4]:
# Combine indidual level columns with full path
df_manifest = pd.concat(
    [
        pd.read_csv(manifest_path, header=None, delimiter=data_file_delimiter, names=column_names),
        pd.read_csv(manifest_path,delimiter=",", header=None, names= ["path"])
    ],
    axis=1
)

In [5]:
df_manifest
# L8: Drift scan (15 continous scans)
# L12: Visit (i.e, scan repetition, 20 visits on average)
# L13: image/catalog data ("svXX" corresponds to field number,
#      short scans (1−3, 13−15) consist of 9 fields,
#      long scans (4-12) coinsist of 14 fields each).


Unnamed: 0,L0,L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,L11,L12,L13,L14,L15,L16,path
0,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0020-img,02-try,KEPLER_SCAN_001_0020_sv06-nd-flag_varpix.fits.gz,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
1,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0020-img,02-try,KEPLER_SCAN_001_0020_sv03-nd-sexparams_orig.txt,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
2,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0020-img,02-try,KEPLER_SCAN_001_0020_sv08-nd-skybg.fits.gz,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
3,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0020-img,02-try,KEPLER_SCAN_001_0020_sv09-nd-cat_mch_rtastar.fits,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
4,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0020-img,02-try,KEPLER_SCAN_001_0020_sv02-nd-objmask_out.txt,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256220,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29206-KEPLER_SCAN_007,d,00-visits,0018-img,02-try,KEPLER_SCAN_007_0018_sv09-nd-varpix.fits.gz,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
256221,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29206-KEPLER_SCAN_007,d,00-visits,0018-img,02-try,KEPLER_SCAN_007_0018_sv12-nd-intbgsub.fits.gz,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
256222,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29206-KEPLER_SCAN_007,d,00-visits,0018-img,02-try,KEPLER_SCAN_007_0018_sv08-nd-movie.fits.gz,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
256223,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29206-KEPLER_SCAN_007,d,00-visits,0018-img,02-try,KEPLER_SCAN_007_0018_sv09-rtastar.fits,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...


In [6]:
np.unique(df_manifest["L8"])

array(['29200-KEPLER_SCAN_001', '29201-KEPLER_SCAN_002',
       '29202-KEPLER_SCAN_003', '29203-KEPLER_SCAN_004',
       '29204-KEPLER_SCAN_005', '29205-KEPLER_SCAN_006',
       '29206-KEPLER_SCAN_007', '29207-KEPLER_SCAN_008',
       '29208-KEPLER_SCAN_009', '29209-KEPLER_SCAN_010',
       '29210-KEPLER_SCAN_011', '29211-KEPLER_SCAN_012',
       '29212-KEPLER_SCAN_013', '29213-KEPLER_SCAN_014',
       '29214-KEPLER_SCAN_015'], dtype=object)

In [7]:
np.unique(df_manifest["L11"])

array(['0001-img', '0002-img', '0003-img', '0004-img', '0005-img',
       '0006-img', '0007-img', '0008-img', '0009-img', '0010-img',
       '0011-img', '0012-img', '0013-img', '0014-img', '0015-img',
       '0016-img', '0017-img', '0018-img', '0019-img', '0020-img',
       '0021-img', '0022-img', '0023-img', '0024-img', '0025-img',
       '0026-img', '0027-img', '0028-img', '0029-img'], dtype=object)

In [8]:
df_manifest.query("L8 in ['29200-KEPLER_SCAN_001'] and L11 in ['0012-img']").query(
            "L13.str.contains('nd-count.fits|nd-rrhr.fits|nd-int.fits|nd-flags.fits|xd-mcat.fits')"
        ).sort_values("L13")

Unnamed: 0,L0,L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,L11,L12,L13,L14,L15,L16,path
3116,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0012-img,02-try,KEPLER_SCAN_001_0012_sv03-nd-flags.fits,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
3084,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0012-img,02-try,KEPLER_SCAN_001_0012_sv03-nd-int.fits.gz,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
3058,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0012-img,02-try,KEPLER_SCAN_001_0012_sv03-nd-rrhr.fits.gz,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
3282,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0012-img,02-try,KEPLER_SCAN_001_0012_sv03-xd-mcat.fits,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
3306,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0012-img,02-try,KEPLER_SCAN_001_0012_sv04-nd-flags.fits,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
3198,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0012-img,02-try,KEPLER_SCAN_001_0012_sv04-nd-int.fits.gz,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
3264,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0012-img,02-try,KEPLER_SCAN_001_0012_sv04-nd-rrhr.fits.gz,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
3197,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0012-img,02-try,KEPLER_SCAN_001_0012_sv04-xd-mcat.fits,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
3235,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0012-img,02-try,KEPLER_SCAN_001_0012_sv05-nd-flags.fits,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...
3128,http:,,archive.stsci.edu,pub,galex,KS,pipe,01-vsn,29200-KEPLER_SCAN_001,d,00-visits,0012-img,02-try,KEPLER_SCAN_001_0012_sv05-nd-int.fits.gz,,,,http://archive.stsci.edu/pub/galex/KS/pipe/01-...


# Dowload data

In [9]:
def get_n_fields(id_scan):
    """
    Returns the number of sub-fields per drift scan:
    9 and 14 fields for the short scans (1−3 and 13−15) and long scans (4−12)
    """
    scan_num = int(id_scan.split("_")[-1])
    if scan_num <= 3 or scan_num >= 13:
        # short scans
        n_fields = 9
    elif scan_num >= 4 and scan_num <= 12:
        # long scans
        n_fields = 14
    else:
        raise ValueError(f"Unexpected scan ID '{id_scan}'.")
    
    return n_fields

In [10]:
# Settings

# Path to root data directory
root_data_dir = "/Users/julianschliwinski/GALEX_DS/galex_kepler"

# Refresh download (possibly overwrites existing files)
refresh = False

# Supress progress bar
hide_progress=True

# Dry-run, no download
dry_run = True

In [11]:
# Group by scan and visit
df_grpd = df_manifest.groupby(["L8", "L11"])

In [12]:
# Counters
n_files = 0
n_vis = 0

# Loops over drift scans and visits
for i, (id_scan, id_visit) in tqdm(
    enumerate(df_grpd.groups.keys()), disable=hide_progress
):
    # Debugging
    # if i > 100: break
    # if id_scan != "29200-KEPLER_SCAN_001":
    #     break
    # if id_visit != "0012_img":
    #     continue

    # Loops over fields (sv = "sub-visit")
    for k, id_sv in tqdm(
        enumerate([f"sv{num+1:02}" for num in range(get_n_fields(id_scan))]),
        total=get_n_fields(id_scan),
        desc=f"Fields ({id_scan}, {id_visit})",
        disable=hide_progress,
    ):
        # Debugging
        # if id_sv != "sv01":
        #     break

        # List of file name endings
        files_select = [
            "nd-cnt.fits",
            "nd-rrhr.fits",
            "nd-int.fits",
            "nd-flags.fits",
            "xd-mcat.fits",
        ]
        n_files_select = len(files_select)

        # Selects files corresponding to field ID
        files_select_str = "|".join(files_select)
        query_str = (
            f"L13.str.contains('{files_select_str}') and "
            f"L13.str.contains('{id_sv}')"
        )
        df_select = df_grpd.get_group((id_scan, id_visit)).query(query_str)
        df_select.sort_values("L13", inplace=True)

        # Download data
        if len(df_select) > 0:
            if len(df_select) != n_files_select:
                print(
                    f"Warning: unexpected number of files ({n_files_select}), "
                    f"got {len(df_select)} "
                    f"({id_scan}, {id_sv}, {id_visit})"
                )

            # Create output directory path
            # Data is sorted by scan and visit (swap field and visit directories
            # compared to web server)
            out_dir = (
                f"{root_data_dir}/"
                f"{id_scan}/{id_scan}_{id_sv}/"
                f"{id_scan}_{id_sv}_{id_visit}"
            )

            # Create the output directory if it doesn't exist
            os.makedirs(out_dir, exist_ok=True)

            # Loops over data files
            for idx, file_row in df_select.iterrows():
                # File name
                file_name = file_row.L13

                file_url = file_row.path

                # Construct the output file path
                out_file_path = os.path.join(out_dir, file_name)

                # Download the file if it doesn't already exist or download is forced
                if not dry_run:
                    if not os.path.isfile(out_file_path) or refresh:
                        response = requests.get(file_url)
                        if response.status_code == 200:
                            with open(out_file_path, "wb") as f:
                                f.write(response.content)
                            print(f"Downloaded: {file_url}")
                        else:
                            print(f"Failed to download: {file_url}")
                    else:
                        print(f"File exists: {file_url}")

            # counts number of files
            n_files += len(df_select)

    # Counts number of visits
    n_vis += 1

print(
    f"Downloaded {n_files/n_files_select:1.1f} exposures "
    f"({n_files} files) for {n_vis} visits."
)

Downloaded 3249.4 exposures (16247 files) for 298 visits.


In [13]:
# Average number of visits
n_fields = list()
for id_scan in np.unique(df_manifest["L8"]):
    
    num = len(np.unique(df_manifest.query("L8 == @id_scan")["L11"]))
    n_fields.append(num)

n_fields = np.array(n_fields)
n_fields.mean()

19.866666666666667