## HST ACS-WFC Dataset Preparation

In this notebook, we prepare the **HST ACS-WFC Cosmic Ray dataset**.

1. **Download and Extract Data**  
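   The training and test datasets are downloaded from Zenodo (https://zenodo.org/record/4295902) as compressed tar archives and extracted into a base directory.  
   The extracted data are organized into separate training and test folders (`npy_train` and `npy_test` under `data/`). Note that the download and extraction calls in the code cell below are currently commented out, so the archives must already have been extracted there before the indexing step is run.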

2. **Dataset Organization**  
   After extraction, the data are stored as NumPy files (`.npy`) organized hierarchically by:
   - filter
   - proposal ID
   - visit number  

   Each directory contains image patches, corresponding CR masks, and auxiliary files (e.g., `sky.npy`).

3. **Collect Training and Test File Paths**  
   We recursively traverse the directory structure to gather all valid `.npy` files (excluding sky background files) for:
   - the training set
   - the test set  

4. **Save File Lists**  
   The collected file paths are saved as:
   - `train_dirs.npy`
   - `test_dirs.npy`  

   These files provide a convenient index for loading the HST data during training and evaluation.

After this step, the HST ACS-WFC dataset is fully indexed and ready to be used with CRNet or deepCR-style PyTorch data pipelines.


In [1]:
import numpy as np
import os
import gc
import matplotlib.pyplot as plt
import sys
import shutil
import tarfile
import urllib
%matplotlib inline

# Root of the deepCR ACS-WFC working tree; the index files are written here.
# NOTE: machine-specific absolute path -- edit before running elsewhere.
base_dir = '/scratch/srinadb/CRNet/CRNet/deepCR.ACS-WFC'
# Holds the extracted npy_train/npy_test trees. Neither directory is created
# by this cell; on a fresh machine create them first, e.g.
# os.makedirs(data_base).
data_base = os.path.join(base_dir, 'data')

def download():
    """Walk through the download / sort / extract stages for the archives.

    Every network and filesystem action is currently disabled; each one is
    kept as a comment next to the stage it belongs to. As written, the
    function therefore only prints its progress messages.

    Returns:
        None
    """
    def announce(title):
        # Progress banner: the title framed by two dashed rules.
        rule = '------------------------------------------------------------'
        for line in (rule, title, rule):
            print(line)

    # Stage 1: fetch the training archive (disabled).
    announce('Downloading training data')
    # urllib.request.urlretrieve('https://zenodo.org/record/4295902/files/deepCR.ACS-WFC.train.tar?download=1')

    # Stage 2: fetch the test archive (disabled).
    announce('Downloading test data')
    # urllib.request.urlretrieve('https://zenodo.org/record/4295902/files/deepCR.ACS-WFC.test.tar?download=1')

    # Stage 3: move both archives into the data directory (disabled).
    for message in ('Datasets downloaded', 'Sorting...'):
        print(message)
    # shutil.move('deepCR.ACS-WFC.train.tar',data_base)
    # shutil.move('deepCR.ACS-WFC.test.tar',data_base)
    print('Complete')

    # Stage 4: unpack both archives in place (disabled).
    print('Extracting tar files...')
    # train_tar = tarfile.open(os.path.join(data_base,'deepCR.ACS-WFC.train.tar'))
    # test_tar = tarfile.open(os.path.join(data_base,'deepCR.ACS-WFC.test.tar'))
    # train_tar.extractall(data_base)
    # test_tar.extractall(data_base)
    print('Complete')

#Directories
def _list_subdirs(parent):
    """Return the sub-directories of *parent* as full paths, sorted by name."""
    candidates = (os.path.join(parent, name) for name in os.listdir(parent))
    return sorted(path for path in candidates if os.path.isdir(path))


def _collect_npy_files(split_root):
    """Collect the `.npy` paths stored under <filter>/<proposal>/<visit>/.

    Only entries exactly three directory levels below *split_root* are
    considered. Entries named `sky.npy` are skipped.

    Raises:
        FileNotFoundError: if *split_root* is not an existing directory.
    """
    if not os.path.isdir(split_root):
        raise FileNotFoundError(
            'Dataset directory not found: %s (extract the archives first)'
            % split_root
        )

    paths = []
    for filter_dir in _list_subdirs(split_root):
        for prop_id_dir in _list_subdirs(filter_dir):
            for vis_num_dir in _list_subdirs(prop_id_dir):
                # os.listdir returns entries in arbitrary order; sorting
                # keeps the saved index identical from run to run.
                for fname in sorted(os.listdir(vis_num_dir)):
                    # endswith, not a substring test, so names such as
                    # 'x.npy.bak' are not indexed by mistake.
                    if fname.endswith('.npy') and fname != 'sky.npy':
                        paths.append(os.path.join(vis_num_dir, fname))
    return paths


def get_dirs(data_dir=None, out_dir=None):
    """Index the extracted test and training files and save both path lists.

    Scans `<data_dir>/npy_test` and `<data_dir>/npy_train` and writes the
    collected file paths to `test_dirs.npy` and `train_dirs.npy` inside
    *out_dir*.

    Args:
        data_dir: directory holding `npy_test` and `npy_train`. Defaults to
            the module-level `data_base`.
        out_dir: directory the two index files are written to. Defaults to
            the module-level `base_dir`.

    Returns:
        None

    Raises:
        FileNotFoundError: if either dataset directory does not exist.
    """
    # Resolve defaults at call time so later changes to the globals are seen.
    if data_dir is None:
        data_dir = data_base
    if out_dir is None:
        out_dir = base_dir

    print('------------------------------------------------------------')
    print('Fetching directories for the test set')
    print('------------------------------------------------------------')
    test_dirs = _collect_npy_files(os.path.join(data_dir, 'npy_test'))

    print('------------------------------------------------------------')
    print('Fetching directories for the training set')
    print('------------------------------------------------------------')
    train_dirs = _collect_npy_files(os.path.join(data_dir, 'npy_train'))

    np.save(os.path.join(out_dir, 'test_dirs.npy'), test_dirs)
    np.save(os.path.join(out_dir, 'train_dirs.npy'), train_dirs)

    return None

# Run the preparation steps in order. download() currently only prints its
# progress messages (its download/extract calls are commented out), so
# get_dirs() relies on the npy_train/npy_test trees already existing under
# data_base.
download()
get_dirs()

------------------------------------------------------------
Downloading training data
------------------------------------------------------------
------------------------------------------------------------
Downloading test data
------------------------------------------------------------
Datasets downloaded
Sorting...
Complete
Extracting tar files...
Complete
------------------------------------------------------------
Fetching directories for the test set
------------------------------------------------------------
------------------------------------------------------------
Fetching directories for the training set
------------------------------------------------------------
