In [1]:
import pandas as pd
import numpy as np
from PIL import Image
import requests
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

In [2]:
# Input catalogue (CSV file) that the loading step reads with pandas
data_path = "ZooSpecPhotoDR19_torradeflot.csv"

In [3]:
# Read the catalogue, then keep only rows whose model-magnitude errors are
# small and whose p_cs_debiased or p_el_debiased value is at least 0.9
df = pd.read_csv(data_path)

# The u band is allowed a larger error (0.5) than g, r, i and z (0.05 each)
small_mag_err = df["modelMagErr_u"] < 0.5
for band in "griz":
    small_mag_err = small_mag_err & (df[f"modelMagErr_{band}"] < 0.05)

confident_class = (df["p_cs_debiased"] >= 0.9) | (df["p_el_debiased"] >= 0.9)

filter1 = small_mag_err & confident_class
df_filtered = df[filter1]

In [4]:
# Function to remove outliers based on quantiles
def remove_outliers(df, low_q=0.05, high_q=0.95, cols=None):
    """Drop rows whose values fall outside a quantile band, column by column.

    The columns are processed in order, and the quantiles for each column are
    computed on the rows that survived the previous columns. The result
    therefore depends on the order of ``cols``, and the overall fraction kept
    is smaller than ``high_q - low_q``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; a filtered copy is returned.
    low_q, high_q : float
        Lower and upper quantiles passed to ``Series.quantile``. Rows equal
        to either bound are kept.
    cols : iterable of str, optional
        Columns to trim on. Defaults to ``modelMag_u`` ... ``modelMag_z``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that lie inside the band for every column.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``df``.
    """
    # Build the default inside the call rather than sharing one list object
    # across every invocation (mutable default argument).
    if cols is None:
        cols = [f'modelMag_{f}' for f in "ugriz"]

    filtered_df = df.copy()

    for col in cols:
        low_q_value = filtered_df[col].quantile(low_q)
        high_q_value = filtered_df[col].quantile(high_q)
        # between() is inclusive on both ends, matching >= low and <= high
        filtered_df = filtered_df[filtered_df[col].between(low_q_value, high_q_value)]

    return filtered_df

# Trim the filtered catalogue using remove_outliers' default arguments
# (quantile bounds and column list as declared in its signature)
df_filtered_2 = remove_outliers(df_filtered)

In [5]:
# Column subset carried forward into the sample
cols = ['dr7objid', 'ra', 'dec', 'p_el_debiased', 'p_cs_debiased', 'spiral', 'elliptical']
cols += ['petroR50_r', 'petroR90_r']
# Per-band columns: all five modelMag_* first, then all five extinction_*
for prefix in ('modelMag', 'extinction'):
    cols.extend(f'{prefix}_{band}' for band in "ugriz")

df_filtered_3 = df_filtered_2[cols]

In [6]:
# Draw a fixed-seed random subset of 20000 rows and renumber them from 0
n_samples = 20000
sample = df_filtered_3.sample(n=n_samples, random_state=42).reset_index(drop=True)
print(sample)

                 dr7objid         ra        dec  p_el_debiased  p_cs_debiased  \
0      587725470131224792  126.81580  48.720720          0.000          1.000   
1      588016892780216485  139.67630  33.748310          0.029          0.932   
2      588848898835349649  171.87960  -0.982389          0.040          0.960   
3      588017947199864959  170.77540  40.697470          0.925          0.075   
4      588018055128154220  250.62700  27.515810          0.068          0.932   
...                   ...        ...        ...            ...            ...   
19995  587742629594333315  212.56470  16.515470          0.023          0.940   
19996  587724240159768697   57.60279  -6.982139          0.008          0.992   
19997  588013383262994670  129.69650  36.718530          1.000          0.000   
19998  587732136993161383  170.48320  52.851080          0.023          0.956   
19999  587736585505996897  226.90250  34.481190          0.000          0.983   

       spiral  elliptical  

In [None]:
# Catalogue of targets to fetch; the download step reads its 'ra' and 'dec' columns
df = sample

# Cutouts are grouped into sub-folders holding this many images each
n_images_per_folder = 1000

# HiPS2FITS API base URL
# NOTE(review): the recorded output of the download cell shows connection
# timeouts against this host; confirm the endpoint is still current.
url = "https://alasky.u-strasbg.fr/hips-image-services/hips2fits"

# Root folder for the downloaded cutouts (created here if it does not exist)
base_out_dir = Path("galaxy_monoc_jpgs")
base_out_dir.mkdir(exist_ok=True)

In [None]:
# Function to download a single cutout
def download_cutout(index, ra, dec):
    """Fetch one cutout from the HiPS2FITS service and store it as a JPEG.

    Relies on the module-level ``base_out_dir``, ``n_images_per_folder`` and
    ``url``. The image is saved to
    ``base_out_dir/batch_XX/galaxy_<index>.jpg`` with
    ``XX = index // n_images_per_folder``.

    Parameters
    ----------
    index : int
        Position of the object in the catalogue; determines the sub-folder
        and the file name.
    ra, dec : float
        Sent unchanged as the ``ra`` and ``dec`` query parameters.

    Returns
    -------
    str
        A status message beginning with ``"Skipped"`` (file already present),
        ``"Downloaded"``, ``"Failed"`` (non-200 status or empty body) or
        ``"Error"`` (any exception raised while requesting or writing).

    Notes
    -----
    Request and write errors are reported through the return value rather
    than raised, so one bad object does not abort a batch run. Errors from
    creating the output folder are still raised.
    """
    # Determine subfolder
    folder_idx = index // n_images_per_folder
    out_dir = base_out_dir / f"batch_{folder_idx:02d}"
    # parents=True so the function also works when base_out_dir is absent
    out_dir.mkdir(parents=True, exist_ok=True)

    filename = out_dir / f"galaxy_{index}.jpg"
    if filename.exists():  # skip if already downloaded
        return f"Skipped {index}"

    params = {
        "hips": "CDS/P/SDSS9/r",
        "ra": ra,
        "dec": dec,
        "fov": 0.01,
        "width": 64,
        "height": 64,
        "format": "jpg"
    }

    # Write to a sibling ".part" file and rename it once complete. Otherwise
    # an interrupted write would leave a truncated galaxy_<index>.jpg that the
    # exists() check above would skip on every later run.
    partial = filename.with_name(filename.name + ".part")

    try:
        response = requests.get(url, params=params, timeout=30)
        if response.status_code != 200:
            return f"Failed {index}, status {response.status_code}"
        if not response.content:
            return f"Failed {index}, empty response body"

        partial.write_bytes(response.content)
        partial.replace(filename)  # atomic rename within one filesystem
        return f"Downloaded {index}"
    except Exception as e:
        # Deliberately broad: this function is best-effort and reports through
        # its return value. Remove any half-written temporary file first.
        try:
            partial.unlink()
        except OSError:
            pass  # nothing was written, or it is already gone
        return f"Error {index}: {e}"

# Number of parallel threads
# This is the only limit on concurrent requests to the server; sleeping in the
# result loop below would not slow the worker threads down.
n_threads = 10  # adjust based on your network and server load

# Tally of outcomes keyed by the first word of each status message
# ("Downloaded", "Skipped", "Failed", "Error")
outcome_counts = {}

# Start parallel downloading
with ThreadPoolExecutor(max_workers=n_threads) as executor:
    # Iterate over the index and the two coordinate columns directly instead
    # of building a full row Series per object with iterrows()
    futures = [
        executor.submit(download_cutout, i, ra, dec)
        for i, ra, dec in zip(df.index, df["ra"], df["dec"])
    ]

    for future in as_completed(futures):
        message = future.result()
        status = message.split(maxsplit=1)[0]
        outcome_counts[status] = outcome_counts.get(status, 0) + 1
        # Only report problems; successes are summarised at the end
        if status not in ("Downloaded", "Skipped"):
            print(message)

print(outcome_counts)



Error 2: HTTPSConnectionPool(host='alasky.u-strasbg.fr', port=443): Max retries exceeded with url: /hips-image-services/hips2fits?hips=CDS%2FP%2FSDSS9%2Fr&ra=171.8796&dec=-0.9823889&fov=0.01&width=64&height=64&format=jpg (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000020B099F5B50>, 'Connection to alasky.u-strasbg.fr timed out. (connect timeout=30)'))
Error 0: HTTPSConnectionPool(host='alasky.u-strasbg.fr', port=443): Max retries exceeded with url: /hips-image-services/hips2fits?hips=CDS%2FP%2FSDSS9%2Fr&ra=126.8158&dec=48.72072&fov=0.01&width=64&height=64&format=jpg (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000020B0AACE190>, 'Connection to alasky.u-strasbg.fr timed out. (connect timeout=30)'))
Error 3: HTTPSConnectionPool(host='alasky.u-strasbg.fr', port=443): Max retries exceeded with url: /hips-image-services/hips2fits?hips=CDS%2FP%2FSDSS9%2Fr&ra=170.7754&dec=40.69747&fov=0.01&width=64&height=64&format=jpg (Caus

In [None]:
# Total number of images to process
read_range = len(sample)
base_out_dir = Path("galaxy_monoc_jpgs")

# Expected location of every cutout, using the same batch layout as the download step
image_paths = [
    base_out_dir / f"batch_{idx // n_images_per_folder:02d}" / f"galaxy_{idx}.jpg"
    for idx in range(read_range)
]

# Fail before loading anything if a cutout is absent. Row k of pixel_data
# must correspond to row k of `sample`, so missing images cannot be skipped.
missing_paths = [path for path in image_paths if not path.exists()]
if missing_paths:
    raise FileNotFoundError(
        f"{len(missing_paths)} of {read_range} cutouts are missing "
        f"(first: {missing_paths[0]}); re-run the download cell before loading pixels"
    )

#List to hold pixel data
pixel_data = []

for idx, filename in enumerate(image_paths):
    # Open image and convert to grayscale (1 channel); the context manager
    # closes the underlying file as soon as the pixels have been copied out
    with Image.open(filename) as img:
        img_arr = np.array(img.convert("L"))

    # Flatten the 2D image array to 1D
    flat_pixels = img_arr.flatten()
    pixel_data.append(flat_pixels)

    # Print progress every 1000 images
    if idx % 1000 == 0:
        print(f"Processed {idx} images")


# Convert list to numpy array (one row per image)
pixel_data = np.array(pixel_data)
# Scale 8-bit grey levels into [0, 1]
pixel_data_normalized = pixel_data.astype(np.float32) / 255.0
print(pixel_data_normalized.shape)
print(pixel_data_normalized)

Processed 0 images
Processed 1000 images
Processed 2000 images
Processed 3000 images
Processed 4000 images
Processed 5000 images
Processed 6000 images
Processed 7000 images
Processed 8000 images
Processed 9000 images
Processed 10000 images
Processed 11000 images
Processed 12000 images
Processed 13000 images
Processed 14000 images
Processed 15000 images
Processed 16000 images
Processed 17000 images
Processed 18000 images
Processed 19000 images
(20000, 4096)
[[0.05098039 0.07450981 0.05490196 ... 0.04705882 0.03921569 0.04705882]
 [0.03137255 0.02745098 0.02352941 ... 0.00784314 0.01960784 0.03137255]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.01960784 0.02352941 0.02352941 ... 0.02745098 0.01960784 0.02352941]
 [0.04313726 0.04313726 0.03137255 ... 0.05098039 0.07450981 0.04313726]
 [0.03921569 0.03137255 0.03921569 ... 0.06666667 0.04705882 0.03529412]]
