CELL 1 - This cell imports every library the rest of the notebook needs, including support for the matplotlib widget backend used for interactive plots.

In [None]:
# cell 1 - imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
import time
from datetime import datetime
import requests
import math

from astropy.io import fits
from astropy.visualization import (ZScaleInterval, ImageNormalize)
from astropy.wcs import WCS
from photutils.detection import DAOStarFinder
from photutils.aperture import (aperture_photometry, CircularAperture)
from astroquery.vizier import Vizier
from reproject import reproject_interp
from astroalign import register
from lightkurve import search_lightcurvefile
import aplpy

from skimage import filters, measure, morphology
from skimage.measure import label, regionprops
from skimage.morphology import binary_closing, disk
from matplotlib.patches import Ellipse
import matplotlib.pyplot as plt
import cv2

import plotly.graph_objects as go

horizons_url = 'https://ssd.jpl.nasa.gov/api/horizons_file.api'

In [None]:
#ONLY RUN WIDGET AFTER MAIN SCRIPT OR IT WILL BLOW UP YOUR RAM CHIPS
#%matplotlib widget

CELL 2 - This cell defines the timing utilities that report which parts of the code take the longest. Keep in mind that the timing log is only reset when you re-run this cell.

In [None]:
#cell 2 - timer utilities

# Module-level timing log shared by start_section(), end_section() and
# print_time_summary(). Each entry is a dict with the keys 'section', 'start',
# 'end' and 'duration'. Re-running this cell resets the log to empty.
time_records = []

def start_section(section_name):
    """Open a new timing record for ``section_name``.

    A dict is appended to the module-level ``time_records`` list. It stores
    the section name and the current ``time.time()`` value as 'start'; the
    'end' and 'duration' fields stay ``None`` until the section is closed.

    Args:
        section_name: Label shown for this section in the timing summary.
    """
    record = dict(section=section_name, start=time.time(), end=None, duration=None)
    time_records.append(record)

def end_section():
    """Close the most recently opened section that is still running.

    Sections are used in a nested fashion (an outer "Script Start" section
    wraps the per-step sections), so the record to close is the newest one
    whose 'end' is still ``None`` -- not simply the last record in the list.
    The record's 'end' is set to the current ``time.time()`` value and its
    'duration' to ``end - start``.

    Calling this when no section is open (or when nothing has been recorded)
    is a no-op, so a surplus call can never overwrite the timestamps of a
    section that was already closed.
    """
    now = time.time()
    # Walk backwards so that nested sections are closed innermost-first.
    for record in reversed(time_records):
        if record['end'] is None:
            record['end'] = now
            record['duration'] = now - record['start']
            return

def print_time_summary():
    """Print a table of every recorded section with start, end and duration.

    Reads the module-level ``time_records`` list and prints nothing when it
    is empty. Sections that were never closed show "N/A" for their end time
    and duration instead of raising. The total runtime spans from the start
    of the first record to the latest end time of any closed record; it is
    reported as "N/A" when no section has been closed yet.
    """
    if not time_records:
        return

    def _clock(timestamp):
        # HH:MM:SS.mmm in local time (microseconds truncated to milliseconds).
        return datetime.fromtimestamp(timestamp).strftime("%H:%M:%S.%f")[:-3]

    print("\n=== PROCESSING TIME SUMMARY ===")
    max_len = max(len(r['section']) for r in time_records)
    name_col_width = max(20, max_len + 2)
    print(f"{'Section':<{name_col_width}}  {'Start':>15}   {'End':>15}   {'Duration (s)':>14}")
    print("-" * (name_col_width + 50))

    for r in time_records:
        section = r['section']
        start_dt = _clock(r['start'])
        # Compare against None explicitly: a 0.0 duration is a valid value.
        end_dt = _clock(r['end']) if r['end'] is not None else "N/A"
        duration = f"{r['duration']:.3f}" if r['duration'] is not None else "N/A"
        print(f"{section:<{name_col_width}}  {start_dt:>15}   {end_dt:>15}   {duration:>14}")

    print("-" * (name_col_width + 50))

    # The last record may still be open, so use the latest end time that
    # actually exists rather than time_records[-1]['end'].
    closed_ends = [r['end'] for r in time_records if r['end'] is not None]
    if closed_ends:
        total_text = f"{max(closed_ends) - time_records[0]['start']:.3f} seconds"
    else:
        total_text = "N/A"
    print(f"{'Total Runtime':<{name_col_width}}                       {total_text}\n")

CELL 3 - The Horizons API helpers: they build the query for Jupiter, send it to the JPL Horizons service, and parse the values the rest of the notebook needs from the response.

In [None]:
#Cell 3 - horizons api

def create_input_content(dateobs, timeobs):
    """Build the Horizons batch-file text for a single observation epoch.

    The text is an ``!$$SOF`` ... ``!$$EOF`` block requesting an OBSERVER
    table for body ``599`` (labelled Jupiter elsewhere in this notebook)
    as seen from ``500@399``, with quantities 9, 20, 23 and 24, evaluated
    at the single time ``'<dateobs> <timeobs>'``.

    Args:
        dateobs: Date string inserted verbatim into ``TLIST``.
        timeobs: Time string inserted verbatim into ``TLIST`` after a space.

    Returns:
        The batch-file text as a string. The literal's leading newline and
        per-line indentation are part of the returned value.
    """
    batch_text = f"""
    !$$SOF
    COMMAND='599'
    OBJ_DATA='YES'
    MAKE_EPHEM='YES'
    TABLE_TYPE='OBSERVER'
    CENTER='500@399'
    TLIST='{dateobs} {timeobs}'
    QUANTITIES='9,20,23,24'
    !$$EOF
    """
    return batch_text

def parse_horizons_text_for_delta_and_sbrt(horizons_text):
    """Extract two numeric columns from the ephemeris part of a Horizons reply.

    Only the lines strictly between the first line containing ``$$SOE`` and
    the first line containing ``$$EOE`` are examined. The first of those
    lines that has at least nine whitespace-separated tokens, and whose
    tokens at index 3 and 4 both parse as floats, supplies the result.

    NOTE(review): token 3 is treated as surface brightness and token 4 as
    the observer distance in AU; this presumably matches the column order
    produced by the QUANTITIES requested in create_input_content -- confirm
    against a real response.

    Args:
        horizons_text: Full text body returned by the Horizons API.

    Returns:
        ``(dist_au, s_brt)`` as floats, or ``(None, None)`` when a marker is
        missing (a message is printed in that case) or no line qualifies.
    """
    rows = horizons_text.splitlines()

    def _first_row_containing(marker):
        # Index of the first row holding `marker`, or None when absent.
        for position, row in enumerate(rows):
            if marker in row:
                return position
        return None

    soe_at = _first_row_containing('$$SOE')
    eoe_at = _first_row_containing('$$EOE')
    if soe_at is None or eoe_at is None:
        print("Could not find $$SOE/$$EOE in Horizons output.")
        return None, None

    for row in rows[soe_at + 1:eoe_at]:
        fields = row.split()
        # Blank rows yield no fields and are rejected by the same length test.
        if len(fields) < 9:
            continue
        try:
            brightness = float(fields[3])
            distance = float(fields[4])
        except ValueError:
            continue
        return distance, brightness

    return None, None

def get_horizons_data(dateobs, timeobs, timeout=30):
    """Query the Horizons file API for one epoch and parse the reply.

    The batch text from ``create_input_content`` is uploaded as the
    ``input`` file of a multipart POST to ``horizons_url`` with
    ``format=text``; the response body is handed to
    ``parse_horizons_text_for_delta_and_sbrt``.

    Args:
        dateobs: Observation date string forwarded to the batch file.
        timeobs: Observation time string forwarded to the batch file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        ``(dist_au, s_brt)`` from the parser, or ``(None, None)`` when the
        request fails (network error, timeout, or non-200 status). Failures
        are reported with a printed message instead of an exception so that
        a single bad request does not abort a multi-file run.
    """
    content = create_input_content(dateobs, timeobs)

    try:
        resp = requests.post(
            horizons_url,
            data={'format': 'text'},
            files={'input': ('input.txt', content)},
            timeout=timeout,
        )
    except requests.exceptions.RequestException as exc:
        # Connection errors and timeouts take the same "no data" path as an
        # HTTP error status.
        print(f"Failed Horizons request: {exc}")
        return None, None

    if resp.status_code != 200:
        print(f"Failed Horizons request: {resp.status_code}")
        return None, None

    return parse_horizons_text_for_delta_and_sbrt(resp.text)


CELL 4 - The image-processing tools Python needs to "see" the image: building elliptical aperture masks and rotating the image.

In [None]:
# Cell 4 : Image Processing Utilities

def elliptical_mask(shape, center, major_axis, minor_axis, angle_deg):
    """Return a boolean mask that is True inside (or on) a rotated ellipse.

    Args:
        shape: ``(height, width)`` of the mask to build.
        center: ``(cx, cy)`` ellipse centre, x (column) first, y (row) second.
        major_axis: Full length of the axis aligned with the rotated x
            direction; it is halved internally.
        minor_axis: Full length of the other axis; it is halved internally.
        angle_deg: Rotation of the ellipse axes in degrees.

    Returns:
        Boolean array of shape ``(height, width)``; an element is True where
        the pixel's coordinates, expressed in the ellipse frame, satisfy
        ``(x'/a)**2 + (y'/b)**2 <= 1``.
    """
    height, width = shape
    cx, cy = center
    semi_major = major_axis / 2.0
    semi_minor = minor_axis / 2.0

    # Column vector of row offsets and row vector of column offsets; they
    # broadcast to the full (height, width) grid in the expressions below.
    dy = np.arange(height).reshape(-1, 1) - cy
    dx = np.arange(width).reshape(1, -1) - cx

    angle_rad = np.deg2rad(angle_deg)
    cos_a, sin_a = np.cos(angle_rad), np.sin(angle_rad)

    # Pixel offsets expressed in the ellipse's own axes.
    along_major = dx * cos_a + dy * sin_a
    along_minor = -dx * sin_a + dy * cos_a

    normalised = (along_major**2) / (semi_major * semi_major) + (along_minor**2) / (semi_minor * semi_minor)
    return normalised <= 1.0

def rotate_image_full(data, cx, cy, angle_deg):
    """Rotate a 2-D image about ``(cx, cy)`` without changing its size.

    Builds an OpenCV rotation matrix (scale 1.0) for ``angle_deg`` degrees
    around the given centre and applies it with linear interpolation. The
    output has the same number of rows and columns as the input.

    Args:
        data: 2-D array; its shape is unpacked as ``(rows, cols)``.
        cx: x (column) coordinate of the rotation centre.
        cy: y (row) coordinate of the rotation centre.
        angle_deg: Rotation angle in degrees, passed to
            ``cv2.getRotationMatrix2D``.

    Returns:
        The rotated image as returned by ``cv2.warpAffine``.
    """
    n_rows, n_cols = data.shape
    rotation = cv2.getRotationMatrix2D((cx, cy), angle_deg, 1.0)
    # warpAffine takes the output size as (width, height).
    output_size = (n_cols, n_rows)
    return cv2.warpAffine(data, rotation, output_size, flags=cv2.INTER_LINEAR)


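CELL 5 - The main code. It does too much to list in full: it reads the FITS files, queries Horizons for each observation, rotates and measures each image, prints a parameter table, and plots the results.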

In [None]:
# cell 5 - main code

# Open the outer "Script Start" timing record, then time the file discovery.
start_section("Script Start")
start_section("Glob FITS files")
# Relative pattern: every *.fits file exactly one directory below HST/.
fits_files = glob.glob("jupiter_data_WFC3_F631DRC/MAST_2023_08_29T0311/HST/*/*.fits")
#fits_files = glob.glob("icpf19f7q_drc.fits")
end_section()

# Guard: the whole processing pipeline lives in the else-branch below.
if len(fits_files) == 0:
    print("No FITS files found in the specified directory.")
    # No end_section() here if we didn't start processing
    print_time_summary() # Print whatever timing we have so far
    # Exit or handle appropriately if no files found, added guard below
else:
    print(f"Found {len(fits_files)} FITS files.\n")

    # Collect one tuple per readable FITS file:
    # (file_name, data, total_flux, date_obs, time_obs, telescope,
    #  instrument, exposure_time, t_filter)
    file_details = []
    start_section("Reading FITS metadata")

    for fits_file in fits_files:
        try:
            with fits.open(fits_file) as hdul:
                # Header keywords come from HDU 0, pixel data from HDU 1.
                header = hdul[0].header
                raw_data = hdul[1].data
                if raw_data is None:
                    # 1x1 placeholder; later steps test data.size > 1 to skip it.
                    data = np.zeros((1, 1), dtype=np.float32)
                else:
                    # Ensure raw_data is treated as float before nan_to_num
                    # NaN and +/-inf are replaced by 0.0, then stored as float32.
                    data = np.nan_to_num(raw_data.astype(np.float64), nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
                total_flux = float(np.sum(data))
                file_name = os.path.basename(fits_file)
                # Missing header keywords fall back to the string 'N/A'.
                date_obs = header.get('DATE-OBS', 'N/A')
                time_obs = header.get('TIME-OBS', 'N/A')
                telescope = header.get('TELESCOP', 'N/A')
                instrument = header.get('INSTRUME', 'N/A')
                exposure_time = header.get('EXPTIME', 'N/A')
                t_filter = header.get('FILTER', 'N/A')
                file_details.append((file_name, data, total_flux, date_obs, time_obs, telescope, instrument, exposure_time, t_filter))
        except Exception as e:
            # Best effort: a file that cannot be read is reported and skipped.
            print(f"Error processing file {fits_file}: {e}")
    end_section()

    # Order the files by (DATE-OBS, TIME-OBS); both are compared as stored in
    # the header, and files lacking a keyword sort by the 'N/A' placeholder.
    start_section("Sort file_details")
    file_details = sorted(file_details, key=lambda x: (x[3], x[4]))
    end_section()

    start_section("Processing images")
    # Per-file result containers, filled inside the loop below.
    ellipse_summaries = []
    cropped_images = [] # Keep original list for Cell 11 etc.
    image_storage = []

    # ---- ADDED: Initialize lists for NumPy arrays ----
    all_param_tables = []
    all_normalized_images = []
    all_rotated_images = []
    all_contour_images = []
    all_cropped_images_for_npy = []
    # ---- END ADDED ----

    # Main per-file loop; i is the 1-based position in the time-sorted list.
    for i, (file_name, data, total_flux, date_obs, time_obs, telescope, instrument, exposure_time, t_filter) in enumerate(file_details, start=1):
        start_section(f"Process {file_name}")
        dist_au = None
        s_brt = None
        # Horizons is only queried when both header keywords were present.
        if date_obs != 'N/A' and time_obs != 'N/A':
            dist_au, s_brt_val = get_horizons_data(date_obs, time_obs)
            # Truthiness test: both None and 0.0 leave s_brt as None.
            if s_brt_val:
                s_brt = s_brt_val

        # Initialize image variables for this iteration
        # 1x1 / empty placeholders; later code tests .size > 1 to detect them.
        norm_8u_orig = np.zeros((1, 1), dtype=np.uint8)
        norm_8u_rot = np.zeros((1, 1), dtype=np.uint8)
        overlay_img = np.zeros((1, 1, 3), dtype=np.uint8) # Assuming BGR for overlay
        norm_8u_crop = None # Initialize as None
        cropped_data = np.array([]) # Initialize as empty

        # Build an 8-bit display version of the original image: clip to
        # [0, 99th percentile] and stretch that range to 0..255.
        if data.size > 1:
            high_cut_orig = np.percentile(data[np.isfinite(data)], 99) if np.any(np.isfinite(data)) else 0
            clipped_orig = np.clip(data, 0, high_cut_orig)
            # Check if clipped_orig has valid range before normalizing
            min_val, max_val = clipped_orig.min(), clipped_orig.max()
            if max_val > min_val:
                 norm_8u_orig = cv2.normalize(clipped_orig, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
            elif max_val > 0: # Handle case where image is flat but not zero
                 norm_8u_orig = np.full(clipped_orig.shape, 255, dtype=np.uint8)
            else: # Handle case where image is all zero
                 norm_8u_orig = np.zeros(clipped_orig.shape, dtype=np.uint8)

        # Estimate the orientation of the largest bright region on the 8-bit
        # image, then rotate the full-precision data by that angle.
        rotate_angle = 0.0
        # Ensure data has dimensions before accessing shape
        if data.ndim >= 2:
            # Rotation centre is the geometric centre of the frame (x, y).
            cx, cy = (data.shape[1] / 2.0, data.shape[0] / 2.0)

            tmp_thresh_val = 5
            # Ensure norm_8u_orig is valid before thresholding
            if norm_8u_orig.size > 1:
                # Pixels above 5 (of 255) count as foreground.
                _, tmp_thresh = cv2.threshold(norm_8u_orig, tmp_thresh_val, 255, cv2.THRESH_BINARY)
                tmp_contours, _ = cv2.findContours(tmp_thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                if len(tmp_contours) > 0:
                    largest_tmp_contour = max(tmp_contours, key=cv2.contourArea)
                    # The ellipse fit is only attempted with at least 5 contour points.
                    if len(largest_tmp_contour) >= 5:
                        (temp_cx, temp_cy), (w, h), angle = cv2.fitEllipse(largest_tmp_contour)
                        # Ensure dimensions are valid before comparison
                        if w > 0 and h > 0:
                            # When the second axis is the longer one, add 90
                            # degrees so the angle refers to the long axis.
                            if h > w:
                                angle += 90
                                w, h = h, w # Swap width and height for consistency if needed
                        rotate_angle = angle

            # Rotate original high-bit data
            # NOTE(review): rotate_image_full unpacks data.shape into two
            # values, so data with ndim > 2 would fail here -- confirm inputs
            # are always 2-D.
            rotated_data = rotate_image_full(data, cx, cy, rotate_angle)
            rotated_flux = float(np.sum(rotated_data))

            # Normalize rotated data
            # Same clip-to-99th-percentile stretch as for the original image.
            high_cut_rot = np.percentile(rotated_data[np.isfinite(rotated_data)], 99) if np.any(np.isfinite(rotated_data)) else 0
            clipped_rot = np.clip(rotated_data, 0, high_cut_rot)
            min_val_rot, max_val_rot = clipped_rot.min(), clipped_rot.max()
            if max_val_rot > min_val_rot:
                 norm_8u_rot = cv2.normalize(clipped_rot, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
            elif max_val_rot > 0:
                 norm_8u_rot = np.full(clipped_rot.shape, 255, dtype=np.uint8)
            else:
                 norm_8u_rot = np.zeros(clipped_rot.shape, dtype=np.uint8)


            # Prepare overlay image (ensure it's 3-channel BGR)
            if norm_8u_rot.ndim == 2:
                overlay_img = cv2.cvtColor(norm_8u_rot, cv2.COLOR_GRAY2BGR)
            elif norm_8u_rot.ndim == 3: # Should already be BGR if loaded from file, but check anyway
                overlay_img = norm_8u_rot
            else: # Fallback for unexpected dimensions
                overlay_img = np.zeros((norm_8u_rot.shape[0], norm_8u_rot.shape[1], 3), dtype=np.uint8)

            # Contours of the rotated image feed the ellipse/crop step below.
            threshold_value = 5
            _, thr_rot = cv2.threshold(norm_8u_rot, threshold_value, 255, cv2.THRESH_BINARY)
            rot_contours, _ = cv2.findContours(thr_rot, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        else: # Handle case where initial data was not valid
             # No rotation possible: pass the inputs through unchanged.
             rotated_data = data
             rotated_flux = total_flux
             norm_8u_rot = norm_8u_orig # Will be zeros
             overlay_img = cv2.cvtColor(norm_8u_rot, cv2.COLOR_GRAY2BGR)
             rot_contours = []


        # Defaults reported when no contour is found in the rotated image.
        ellipse_flux = 0.0
        cropped_flux = 0.0
        flux_avg = 0.0
        ef_err = 0.0
        fa_err = 0.0

        if len(rot_contours) > 0:
            rot_largest_contour = max(rot_contours, key=cv2.contourArea)
            if len(rot_largest_contour) >= 5:
                (rcx, rcy), (rw, rh), rangle = cv2.fitEllipse(rot_largest_contour)
                # Check for valid ellipse dimensions
                if rw > 0 and rh > 0:
                    # Make rw the longer axis; rangle is adjusted but not used
                    # afterwards (mask and drawing both use angle 0).
                    if rh > rw:
                        rangle += 90
                        rw, rh = rh, rw # Swap if needed
                    # Ensure mask dimensions match rotated_data
                    if rotated_data.shape[0] > 0 and rotated_data.shape[1] > 0:
                        mask_ell = elliptical_mask(rotated_data.shape, (rcx, rcy), rw, rh, 0) # Use 0 angle for mask if ellipse was already rotated
                        # Ensure mask is boolean before indexing
                        mask_ell = mask_ell.astype(bool)
                        if mask_ell.shape == rotated_data.shape:
                             # Sum of the full-precision pixels inside the ellipse.
                             ellipse_flux = float(np.sum(rotated_data[mask_ell]))
                        else:
                             print(f"Warning: Mask shape {mask_ell.shape} mismatch with data shape {rotated_data.shape} for {file_name}")

                    # Draw ellipse on overlay_img (ensure overlay_img is valid)
                    if overlay_img.size > 1:
                         # Ensure ellipse parameters are integers/floats as expected by cv2.ellipse
                         center_pt = (int(round(rcx)), int(round(rcy)))
                         axes_len = (int(round(rw)), int(round(rh)))
                         cv2.ellipse(overlay_img, (center_pt, axes_len, 0), (0, 255, 0), 2) # Use 0 angle here
                else:
                    print(f"Warning: Invalid ellipse dimensions (rw={rw}, rh={rh}) found for {file_name}")


            # Bounding Rect and Cropping
            rx, ry, rW, rH = cv2.boundingRect(rot_largest_contour)
            # Ensure crop dimensions are valid and within bounds
            if rW > 0 and rH > 0 and ry+rH <= rotated_data.shape[0] and rx+rW <= rotated_data.shape[1]:
                # Slice of rotated_data covering the contour's bounding box.
                cropped_data = rotated_data[ry:ry + rH, rx:rx + rW]
                cropped_flux = float(np.sum(cropped_data))
                # Draw rectangle on overlay_img
                if overlay_img.size > 1:
                     cv2.rectangle(overlay_img, (rx, ry), (rx + rW, ry + rH), (255, 0, 0), 2)
            else:
                print(f"Warning: Invalid bounding rect dimensions or out of bounds for {file_name}. Cropping skipped.")
                cropped_data = np.array([]) # Reset cropped_data if invalid


            # Calculate errors and average flux
            # Error estimate is the square root of the summed ellipse flux.
            ef_err = np.sqrt(ellipse_flux) if ellipse_flux > 0 else 0.0
            # Ensure exposure_time is valid number before division
            valid_exptime = False
            try:
                exp_time_float = float(exposure_time)
                if exp_time_float > 0:
                    valid_exptime = True
            except (ValueError, TypeError):
                # exposure_time may be the 'N/A' placeholder string.
                valid_exptime = False

            # Flux and its error are divided by (exposure time * distance in AU);
            # both stay 0.0 when either quantity is unavailable.
            if valid_exptime and dist_au is not None:
                flux_avg = ellipse_flux / (exp_time_float * dist_au)
                fa_err = ef_err / (exp_time_float * dist_au)

        # One summary tuple per file:
        # (index, name, s_brt, ellipse_flux, dist_au, ef_err, flux_avg, fa_err, date, time)
        ellipse_summaries.append((i, file_name, s_brt, ellipse_flux, dist_au, ef_err, flux_avg, fa_err, date_obs, time_obs))

        # (label, formatted value) pairs; every value is already a string.
        param_values = [
            ("File #", str(i)),
            ("File Name", file_name),
            ("DATE-OBS", date_obs),
            ("TIME-OBS", time_obs),
            ("Telescope", telescope),
            ("Instrument", instrument),
            ("Filter", t_filter),
            ("surface brightness (Jupiter)", f"{s_brt:.3f}" if s_brt is not None else "N/A"),
            ("Total Flux", f"{total_flux:.2f}"),
            ("Flux (Rotated)", f"{rotated_flux:.2f}"),
            ("Ellipse Flux", f"{ellipse_flux:.2f}"),
            ("Cropped Flux", f"{cropped_flux:.2f}" if cropped_data.size > 0 else "N/A"),
            ("Exposure Time", f"{exposure_time}"),
            ("Delta (AU)", f"{dist_au:.5f}" if dist_au is not None else "N/A"),
            ("Flux Average", f"{flux_avg:.2f}" if flux_avg != 0.0 else "N/A"), # Display N/A if flux_avg is 0
            ("Ellipse Flux Err", f"{ef_err:.2f}"),
            ("Flux Avg Err", f"{fa_err:.2f}")
        ]
        # Print the table as a two-column text block framed by '=' rules.
        print("\n" + "=" * 60)
        print(f"  Parameter-Value Table for {file_name}")
        print("=" * 60)

        # Label column is left-aligned and padded to this width.
        col_width = 35

        for (p, v) in param_values:
            print(f"{p:<{col_width}} {v}")
        print("=" * 60)

        # Four-panel figure per file: original, rotated, overlay, cropped.
        fig, axes = plt.subplots(1, 4, figsize=(22, 5))
        # Ensure images are valid before showing
        if norm_8u_orig.size > 1:
            axes[0].imshow(norm_8u_orig, cmap='gray', origin='lower')
        axes[0].set_title(f"Normalized 8-bit\n(Total Flux={total_flux:.2f})", fontsize=10)
        axes[0].axis('off')

        if norm_8u_rot.size > 1:
            axes[1].imshow(norm_8u_rot, cmap='gray', origin='lower')
        axes[1].set_title(f"Rotated Angle={rotate_angle:.1f}°\n(Flux={rotated_flux:.2f})", fontsize=10)
        axes[1].axis('off')

        if overlay_img.size > 1:
            # OpenCV loads as BGR, Matplotlib expects RGB
            axes[2].imshow(cv2.cvtColor(overlay_img, cv2.COLOR_BGR2RGB), origin='lower')
        axes[2].set_title(f"Contour+Ellipse\n(EllipseFlux={ellipse_flux:.2f})", fontsize=10)
        axes[2].axis('off')

        if cropped_data.size > 1:
            # Normalize cropped data for display
            # Same clip-to-99th-percentile stretch used for the other panels.
            high_cut_crop = np.percentile(cropped_data[np.isfinite(cropped_data)], 99) if np.any(np.isfinite(cropped_data)) else 0
            clipped_crop = np.clip(cropped_data, 0, high_cut_crop)
            min_val_crop, max_val_crop = clipped_crop.min(), clipped_crop.max()
            if max_val_crop > min_val_crop:
                norm_8u_crop = cv2.normalize(clipped_crop, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
            elif max_val_crop > 0:
                norm_8u_crop = np.full(clipped_crop.shape, 255, dtype=np.uint8)
            else:
                norm_8u_crop = np.zeros(clipped_crop.shape, dtype=np.uint8)

            axes[3].imshow(norm_8u_crop, cmap='gray', origin='lower')
            axes[3].set_title(f"Cropped\n(Flux={cropped_flux:.2f})", fontsize=10)
            axes[3].axis('off')

            # Append to original list for other cells
            cropped_images.append((i, file_name, date_obs, time_obs, norm_8u_crop))
        else:
            # If no cropped data, show the rotated image again or blank
            if norm_8u_rot.size > 1:
                axes[3].imshow(norm_8u_rot, cmap='gray', origin='lower')
            axes[3].set_title("Cropped\n(No contour/crop)", fontsize=10)
            axes[3].axis('off')
            # Append None to keep lists aligned for other cells that use it
            cropped_images.append((i, file_name, date_obs, time_obs, None))
            norm_8u_crop = None # Ensure it remains None for image_storage

        plt.suptitle(f"File: {file_name}", fontsize=12)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust layout to prevent title overlap
        plt.show()
        # Release the figure once it has been displayed. One figure is created
        # per FITS file, and interactive backends keep every open figure in
        # memory until it is explicitly closed.
        plt.close(fig)

        # Keep the 8-bit images for later cells; 1x1 placeholders are stored
        # as None so consumers can tell "no image" apart from a real one.
        image_storage.append({
            "index": i,
            "file_name": file_name,
            "date_obs": date_obs,
            "time_obs": time_obs,
            "orig_image": norm_8u_orig if norm_8u_orig.size > 1 else None,
            "rotated_image": norm_8u_rot if norm_8u_rot.size > 1 else None,
            "contour_image": overlay_img if overlay_img.size > 1 else None,
            "cropped_image": norm_8u_crop if norm_8u_crop is not None and norm_8u_crop.size > 1 else None
        })

        # ---- ADDED: Append data to lists for NumPy arrays ----
        # Parallel lists: position k in each list belongs to the k-th file.
        all_param_tables.append(param_values)
        all_normalized_images.append(norm_8u_orig if norm_8u_orig.size > 1 else None)
        all_rotated_images.append(norm_8u_rot if norm_8u_rot.size > 1 else None)
        all_contour_images.append(overlay_img if overlay_img.size > 1 else None)
        all_cropped_images_for_npy.append(norm_8u_crop if norm_8u_crop is not None and norm_8u_crop.size > 1 else None)
        # ---- END ADDED ----

        end_section() # End processing for this file

    # ---- ADDED: Save collected data as NumPy arrays ----
    start_section("Saving NumPy arrays")
    output_dir = "arrays"
    os.makedirs(output_dir, exist_ok=True)
    print(f"\nSaving NumPy arrays to '{output_dir}' directory...")

    # One entry per output file: (file name, per-file payload list, enabled?).
    # Every save is switched off, which matches the previous state where all
    # np.save calls were commented out. Set a flag to True to write that file.
    npy_outputs = [
        ("parameter_tables.npy", all_param_tables, False),
        ("normalized_images.npy", all_normalized_images, False),
        ("rotated_images.npy", all_rotated_images, False),
        ("contour_ellipse_images.npy", all_contour_images, False),
        ("cropped_images.npy", all_cropped_images_for_npy, False),
    ]

    try:
        saved_count = 0
        for npy_name, npy_payload, npy_enabled in npy_outputs:
            if not npy_enabled:
                # Report honestly what was not written, so a later
                # "file not found" in the inspection cells is no surprise.
                print(f"- Skipped {npy_name} (saving disabled)")
                continue

            # Build a 1-D object array with one slot per FITS file. Filling it
            # slot by slot keeps each image as a single array object (or None)
            # regardless of whether the image shapes happen to match.
            npy_array = np.empty(len(npy_payload), dtype=object)
            for slot, item in enumerate(npy_payload):
                npy_array[slot] = item

            np.save(os.path.join(output_dir, npy_name), npy_array)
            saved_count += 1
            print(f"- Saved {npy_name}")

        print(f"NumPy array saving complete ({saved_count} of {len(npy_outputs)} files written).")

    except Exception as e:
        print(f"Error saving NumPy arrays: {e}")
    end_section() # End saving arrays
    # ---- END ADDED ----

    # Two further end_section() calls intended for the two outer timing
    # sections ("Processing images" and "Script Start") opened earlier.
    end_section() # End "Processing images" section
    end_section() # End "Script Start" section (implicitly ends the last open section)

# Always print time summary, even if no files were found initially
# (in the no-files case the summary has already been printed once above).
print_time_summary()


Cell 5 (parallelized) - A much faster version of Cell 5. It only works when parallel_worker.py is in the same directory as the notebook.

In [None]:
# ========== Parallelized Cell 5 (Notebook Version - Reduced Workers & Optional Saves) ==========

# Ensure necessary imports from previous cells are available:
# Make sure these are run in cells ABOVE this one:
import numpy as np
# import pandas as pd              # Not strictly needed in *this* cell if helpers are moved
# import matplotlib.pyplot as plt  # Plotting is removed from parallel part
import os
import glob
import time
# from datetime import datetime   # Not directly used here, maybe by timing funcs?
# import requests                # Used by helpers in worker file
# import math                    # Used by helpers in worker file
from astropy.io import fits     # Needed for initial metadata read
# import cv2                     # Used by helpers in worker file
# Helper functions assumed to be defined in previous cells:
# start_section, end_section, print_time_summary

# Import the parallelization library and the worker function
import concurrent.futures
import traceback # For detailed error printing if needed

# **** IMPORT THE WORKER FUNCTION FROM THE .py FILE ****
try:
    from parallel_worker import process_single_file
    print("Successfully imported 'process_single_file' from parallel_worker.py")
except ImportError as e:
    print(f"ERROR: Could not import 'process_single_file' from parallel_worker.py.")
    print(f"Please ensure 'parallel_worker.py' exists in the same directory as the notebook.")
    print(f"Import Error: {e}")
    # Optionally raise the error or exit if the import fails
    raise e

# --- Main Execution Block ---
# Open the outer timing record for the whole parallel run.
start_section("Script Start")

# --- Serial Part 1: File Discovery and Metadata Reading ---
start_section("Glob FITS files")
# Ensure the path is correct for your system
# Relative pattern: every *.fits file exactly one directory below HST/.
fits_files_pattern = "jupiter_data_WFC3_F631DRC/MAST_2023_08_29T0311/HST/*/*.fits"
# fits_files_pattern = "icpf19f7q_drc.fits" # Example for single file testing
fits_files = glob.glob(fits_files_pattern)
end_section()

# Guard: all further processing lives in the else-branch below.
if len(fits_files) == 0:
    print(f"No FITS files found matching pattern: '{fits_files_pattern}'")
    print_time_summary()
else:
    print(f"Found {len(fits_files)} FITS files.\n")

    # Collect one tuple per readable FITS file (layout documented at the
    # append call below); the pixel data is loaded here, in the parent process.
    file_details = []
    start_section("Reading FITS metadata")
    processed_filenames = set() # Keep track of files already added

    for fits_file_path in fits_files:
        file_name = os.path.basename(fits_file_path)
        # Skip if duplicate filename encountered (can happen with glob patterns sometimes)
        # NOTE(review): the check uses the base name only, so two different
        # files with the same name in different sub-directories count as
        # duplicates -- confirm this is intended.
        if file_name in processed_filenames:
             print(f"Skipping duplicate file: {file_name}")
             continue

        try:
            # Use memory mapping for potentially large files (optional)
            with fits.open(fits_file_path, memmap=True) as hdul:
                try:
                     header = hdul[0].header
                     # Data usually in HDU 1 for HST WFC3/UVIS DRC files
                     raw_data = hdul[1].data
                     if raw_data is None: # Check if data exists in HDU 1
                          print(f"Warning: No data found in HDU 1 of {fits_file_path}, trying HDU 0.")
                          raw_data = hdul[0].data # Fallback to HDU 0

                except IndexError:
                     # Raised when the file has no HDU 1 at all.
                     print(f"Warning: Could not access HDU 1 in {fits_file_path}. Trying HDU 0.")
                     header = hdul[0].header
                     raw_data = hdul[0].data


                if raw_data is None:
                    print(f"Warning: No data found in {fits_file_path}. Skipping.")
                    continue # Skip this file if no data found

                # --- Data conversion happens here before passing to worker ---
                # NaN and +/-inf become 0.0; the astype calls copy the pixels
                # out of the memory-mapped file before it is closed.
                data = np.nan_to_num(raw_data.astype(np.float64), nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
                total_flux = float(np.sum(data))
                # --- End data conversion ---

                # Missing header keywords fall back to the string 'N/A'.
                date_obs = header.get('DATE-OBS', 'N/A')
                time_obs = header.get('TIME-OBS', 'N/A')
                telescope = header.get('TELESCOP', 'N/A')
                instrument = header.get('INSTRUME', 'N/A')
                exposure_time = header.get('EXPTIME', 'N/A')
                t_filter = header.get('FILTER', 'N/A')

                # Append tuple: (file_name, data_array, total_flux, date, time, tel, inst, exptime, filt)
                file_details.append((file_name, data, total_flux, date_obs, time_obs, telescope, instrument, exposure_time, t_filter))
                processed_filenames.add(file_name) # Mark as processed

        except FileNotFoundError:
             print(f"Error: FITS file not found at {fits_file_path}. Skipping.")
        except OSError as e_os:
             print(f"Error reading file {fits_file_path} (OSError: {e_os}). Skipping.")
        except Exception as e:
            # Any other failure is reported with a traceback and the file skipped.
            print(f"Error reading metadata/data from file {fits_file_path}: {e}")
            traceback.print_exc()
    end_section() # End Reading FITS metadata

    # Check if any file details were successfully read
    if not file_details:
         print("No file details could be read. Exiting processing.")
    else:
        # --- Serial Part 2: Sorting ---
        # Order the files by (DATE-OBS, TIME-OBS) as stored in the header.
        start_section("Sort file_details")
        file_details = sorted(file_details, key=lambda x: (x[3], x[4]))
        end_section() # End Sort file_details

        # --- Parallel Part: Processing Images ---
        start_section("Processing images (Parallel)")

        # Result containers with the same names the serial Cell 5 produces.
        ellipse_summaries = []
        image_storage = []
        all_param_tables = []
        all_normalized_images = []
        all_rotated_images = []
        all_contour_images = []
        all_cropped_images_for_npy = []
        cropped_images = [] # For compatibility

        # Prepare arguments: Tuple of (index, detail_tuple) for each file
        # The index is 1-based, like the loop counter of the serial cell.
        worker_args = list(enumerate(file_details, start=1))

        results = []
        # *** REDUCED NUMBER OF WORKERS TO MITIGATE 503 ERRORS ***
        # Adjust this number based on testing (e.g., 4, 3, 2)
        max_workers_to_use = 4
        # Ensure we don't request more workers than available cores or tasks
        max_workers_to_use = min(max_workers_to_use, os.cpu_count() if os.cpu_count() else 1, len(worker_args))
        print(f"Using up to {max_workers_to_use} worker processes.")

        with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers_to_use) as executor:
            print(f"Submitting {len(worker_args)} file processing tasks...")
            # Map the imported worker function
            # executor.map yields results in input order, so results[k] belongs
            # to worker_args[k]; the aggregation step relies on that. An
            # exception raised inside a worker is re-raised here by list().
            results = list(executor.map(process_single_file, worker_args))
            print("All parallel processing tasks complete.")

        end_section() # End Processing images (Parallel)

        # --- Serial Part 3: Aggregating Results and Saving ---
        start_section("Aggregating results & Saving NumPy arrays")
        print("Aggregating results...")
        successful_results = 0
        failed_files = []

        # results[i] is matched with worker_args[i] to recover the file name
        # of a task whose worker returned None.
        for i, result in enumerate(results):
            original_args = worker_args[i]
            original_index = original_args[0]
            original_filename = original_args[1][0]

            if result is not None:
                successful_results += 1
                # NOTE(review): the worker is expected to return this 8-tuple,
                # or None on failure; process_single_file lives in
                # parallel_worker.py, which is not visible here -- confirm.
                (idx, ellipse_summary, img_storage_entry, param_values,
                 norm_orig, norm_rot, overlay, norm_crop_npy) = result

                ellipse_summaries.append(ellipse_summary)
                image_storage.append(img_storage_entry)
                all_param_tables.append(param_values)
                all_normalized_images.append(norm_orig)
                all_rotated_images.append(norm_rot)
                all_contour_images.append(overlay)
                all_cropped_images_for_npy.append(norm_crop_npy)
                # Same 5-tuple layout the serial cell stores in cropped_images.
                cropped_images.append((idx, img_storage_entry["file_name"],
                                       img_storage_entry["date_obs"], img_storage_entry["time_obs"],
                                       norm_crop_npy))
            else:
                print(f"Result aggregation skipped for task index {i} (File original index: {original_index}, Name: {original_filename}) due to worker error.")
                failed_files.append(original_filename)
                # Optionally append placeholders if needed by subsequent cells
                # ellipse_summaries.append(None)
                # ... etc ...


        print(f"Successfully aggregated results for {successful_results} out of {len(results)} tasks.")
        if failed_files:
             print(f"Failed files: {failed_files}")

        # Save collected data as NumPy arrays (only if there are results)
        if successful_results > 0:
            output_dir = "arrays"
            # *** OPTIONAL: Change this path if saving to a different drive ***
            # output_dir = "/path/to/larger/disk/arrays"
            os.makedirs(output_dir, exist_ok=True)
            print(f"\nSaving NumPy arrays to '{output_dir}' directory...")

            # One entry per output file: (file name, per-file payload list, enabled?).
            # Every save is switched off, which matches the previous state where
            # all np.save calls were commented out. Set a flag to True to write
            # that file; parameter_tables.npy and cropped_images.npy are the two
            # files the inspection cells (5a/5b) look for.
            npy_outputs = [
                ("parameter_tables.npy", all_param_tables, False),
                ("normalized_images.npy", all_normalized_images, False),
                ("rotated_images.npy", all_rotated_images, False),
                ("contour_ellipse_images.npy", all_contour_images, False),
                ("cropped_images.npy", all_cropped_images_for_npy, False),
            ]

            try:
                saved_count = 0
                for npy_name, npy_payload, npy_enabled in npy_outputs:
                    if not npy_enabled:
                        # Only claim "Saved" for files that were really written.
                        print(f"- Skipped {npy_name} (saving disabled)")
                        continue

                    # Build a 1-D object array with one slot per result. Filling
                    # it slot by slot keeps each image as a single array object
                    # (or None) regardless of whether the image shapes match.
                    npy_array = np.empty(len(npy_payload), dtype=object)
                    for slot, item in enumerate(npy_payload):
                        npy_array[slot] = item

                    np.save(os.path.join(output_dir, npy_name), npy_array)
                    saved_count += 1
                    print(f"- Saved {npy_name}")

                print(f"NumPy array saving complete ({saved_count} of {len(npy_outputs)} files written).")
            except OSError as e_os:
                # Specifically catch disk space error
                if e_os.errno == 28: # Errno 28: No space left on device
                    print(f"\nError saving NumPy arrays: [Errno 28] No space left on device.")
                    print("Please free up disk space or disable optional saves in npy_outputs.")
                else:
                    print(f"Error saving NumPy arrays (OS Error): {e_os}")
                traceback.print_exc()
            except Exception as e:
                print(f"Error saving NumPy arrays: {e}")
                traceback.print_exc()
        else:
            print("Skipping NumPy array saving as no results were successfully processed.")


        # Closes the timing record opened at the start of the aggregation step.
        end_section() # End Aggregating results & Saving NumPy arrays

# Runs on every path through the cell, including the "no files found" branch
# (where the summary has already been printed once).
end_section() # End Script Start section

# Always print time summary
print_time_summary()

# --- End of Parallelized Cell 5 ---

Cells 5a and 5b - Inspect the saved arrays to confirm that the files were written and load correctly.

In [None]:
import numpy as np
import os

# Location of the parameter tables this cell inspects.
file_path = os.path.join("arrays", "parameter_tables.npy")

if not os.path.exists(file_path):
    print(f"Error: File not found at '{file_path}'")
    print("Please ensure you have run Cell 5 and the 'arrays' folder exists with the .npy file.")
else:
    try:
        # Object arrays can only be read back with pickling enabled.
        # NOTE(review): unpickling runs arbitrary code -- only load files that
        # this notebook itself produced.
        all_tables = np.load(file_path, allow_pickle=True)

        print(f"Successfully loaded {len(all_tables)} parameter tables from '{file_path}'.\n")

        # One table per loaded entry; each table is expected to be an iterable
        # of (parameter_name, parameter_value) pairs.
        for i, table_data in enumerate(all_tables):
            header = f"--- Parameter Table for File {i+1} ---"
            print(header)

            # str/bytes are iterable as well, but they are never a table of
            # pairs, so report them instead of unpacking their characters.
            if isinstance(table_data, (str, bytes)) or not hasattr(table_data, '__iter__'):
                print("Unexpected data format in table:", table_data)
            else:
                for entry in table_data:
                    # A single malformed entry is reported and skipped so that
                    # it cannot abort the remaining entries and tables.
                    try:
                        param_name, param_value = entry
                    except (TypeError, ValueError):
                        print("Unexpected entry format in table:", entry)
                        continue
                    print(f"{param_name}: {param_value}")

            print("-" * (len(header) + 1))  # separator, one char wider than the header
            print()  # blank line between tables

    except Exception as e:
        print(f"An error occurred while loading or reading the file: {e}")
        

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2 # Still potentially useful if any images were unexpectedly saved in color

# Location of the cropped-image array this cell inspects.
file_path = os.path.join("arrays", "cropped_images.npy")

if os.path.exists(file_path):
    try:
        # Object arrays can only be read back with pickling enabled.
        all_images = np.load(file_path, allow_pickle=True)

        print(f"Successfully loaded {len(all_images)} images from '{file_path}'.\n")
        print("Displaying images one by one in grayscale...")

        for position, image in enumerate(all_images, start=1):
            print(f"\n--- Cropped Image {position} ---")

            # Entries that are not multi-element arrays cannot be drawn:
            # report what they are and move on.
            if not (isinstance(image, np.ndarray) and image.size > 1):
                if image is None:
                    print("Image data is None (likely skipped during processing).")
                else:
                    print(f"Item #{position} is not a valid image array. Data: {type(image)}")
                continue

            try:
                # Size of the third axis, when there is exactly one.
                channels = image.shape[2] if image.ndim == 3 else None

                if image.ndim == 2:
                    gray = image
                    print("(Image is 2D, displaying as grayscale)")
                elif channels == 3:
                    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
                    print("(Image was 3-channel, converted to grayscale for display)")
                elif channels == 1:
                    gray = np.squeeze(image, axis=2)
                    print("(Image had singleton 3rd dimension, squeezed to 2D)")
                else:
                    print(f"(Image has unexpected shape {image.shape}, skipping display)")
                    continue

                # One figure per image, forced to a gray colormap, axes hidden.
                plt.figure(figsize=(6, 6))
                plt.imshow(gray, cmap='gray', origin='lower')
                plt.title(f"Cropped Image #{position}")
                plt.axis('off')
                plt.show()
                print("Displayed.")

            except Exception as display_error:
                # A failure on one image must not stop the rest of the loop.
                print(f"Error displaying image #{position}: {display_error}")

    except Exception as e:
        print(f"An error occurred while loading or processing the file: {e}")
else:
    print(f"Error: File not found at '{file_path}'")
    print("Please ensure you have run Cell 5 and the 'arrays' folder exists with the .npy file.")

print("\nScript finished.")

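CELL 6 - this block checks that the Horizons values (delta and surface brightness) used in the outputs above are the right ones, by printing the raw Horizons response next to what get_horizons_data() returns. It is disabled by being wrapped in a triple-quoted string; to run it, remove the ''' lines at the top and bottom of the cell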

In [None]:
'''

# TEST CASE Cell 6 : Debug / Test Horizons

def test_horizons_api(file_index=0):
    if not file_details:
        print("No file_details found. Make sure you've run the main analysis cell(s).")
        return

    if file_index < 0 or file_index >= len(file_details):
        print(f"Invalid file_index {file_index}; valid range is [0..{len(file_details)-1}]")
        return

    (
        file_name, data, total_flux, date_obs, time_obs,
        telescope, instrument, exposure_time, t_filter
    ) = file_details[file_index]

    print("=== Selected FITS File Information ===")
    print(f"Index:         {file_index}")
    print(f"File Name:     {file_name}")
    print(f"DATE-OBS:      {date_obs}")
    print(f"TIME-OBS:      {time_obs}")
    print(f"Telescope:     {telescope}")
    print(f"Instrument:    {instrument}")
    print(f"Filter:        {t_filter}")
    print(f"Exposure Time: {exposure_time}")
    print("======================================\n")

    horizons_input = create_input_content(date_obs, time_obs)

    print("=== Content Sent to Horizons API ===")
    print(horizons_input)
    print("=====================================\n")

    import requests

    horizons_url_debug = 'https://ssd.jpl.nasa.gov/api/horizons_file.api'
    
    debug_response = requests.post(
        horizons_url_debug,
        data={'format': 'text'},
        files={'input': ('input.txt', horizons_input)}
    )

    if debug_response.status_code != 200:
        print(f"Failed Horizons request: {debug_response.status_code}")
        return

    horizons_full_text = debug_response.text

    print("=== Full Horizons API Response ===")
    print(horizons_full_text)
    print("==================================\n")

    lines = horizons_full_text.splitlines()
    
    try:
        start_index = next(i for i, line in enumerate(lines) if '$$SOE' in line)
        end_index = next(i for i, line in enumerate(lines) if '$$EOE' in line)
    except StopIteration:
        print("Could not find $$SOE/$$EOE in Horizons output—parsing aborted.")
        return

    ephem_lines = lines[start_index+1 : end_index]

    dist_au_extracted = None
    s_brt_extracted = None
    relevant_line = None

    for ln in ephem_lines:
        ln = ln.strip()
        if not ln:
            continue
        tokens = ln.split()
        relevant_line = ln
        try:
            s_brt_extracted = float(tokens[3])
            dist_au_extracted = float(tokens[4])
        except:
            pass
        break

    print("=== Parsing Ephemeris Lines ===")
    if relevant_line:
        print(f"Line used for parsing:\n{relevant_line}\n")
        print(f"Extracted distance (delta) [OLD WAY]: {dist_au_extracted}")
        print(f"Extracted surface brightness (S-brt) [OLD WAY]: {s_brt_extracted}")
    else:
        print("No valid line found with ephemeris data.")
    print("================================\n")

    debug_delta, debug_sbrt = get_horizons_data(date_obs, time_obs)

    print("=== Comparison with get_horizons_data() ===")
    print(f"[CORRECT] get_horizons_data returned delta = {debug_delta}, s_brt = {debug_sbrt}")
    print("===========================================\n")

test_horizons_api(file_index=0)

'''

Cell 7 - Final Data Summaries & Plots, given as a single table that's easy to read

In [None]:
# Cell 7 : Final Data Summaries & Plots

def print_dynamic_table(rows):
    """Print ``rows`` to stdout as a centred, column-aligned text table.

    Args:
        rows: Sequence of rows, the first of which is the header. Cells may be
            any object; each is rendered with ``str()``. Rows shorter than the
            widest row are padded with empty cells.

    Returns:
        None. Nothing is printed when ``rows`` is empty.
    """
    str_rows = [[str(item) for item in row] for row in rows]
    if not str_rows:
        # No header row to size the columns from -- nothing to print.
        return

    # Size the table from the widest row so ragged input cannot index past
    # the end of a row or of the width list.
    num_cols = max(len(row) for row in str_rows)
    str_rows = [row + [""] * (num_cols - len(row)) for row in str_rows]
    col_widths = [max(len(row[i]) for row in str_rows) for i in range(num_cols)]

    def _render(row):
        # Centre every cell in its column; columns are separated by two spaces.
        return "  ".join(cell.center(width) for cell, width in zip(row, col_widths))

    print(_render(str_rows[0]))
    print("  ".join("-" * width for width in col_widths))

    for row in str_rows[1:]:
        print(_render(row))

# Header row; every data row appended below follows the same column order.
table_rows = [
    ["File #", "File Name", "SB (Jupiter)", "Ellipse Flux", "delta",
     "Error Bars EF", "Flux Average", "Error Bars FA", "DATE-OBS", "TIME-OBS"]
]


def _format_or_na(value, spec):
    """Return ``value`` formatted with ``spec``, or "N/A" when it is missing.

    Only ``None`` counts as missing, so a legitimate 0.0 (for example a zero
    error bar) is shown as a number rather than as "N/A". A value the format
    spec cannot handle (such as a text placeholder) is also shown as "N/A"
    instead of raising.
    """
    if value is None:
        return "N/A"
    try:
        return format(value, spec)
    except (TypeError, ValueError):
        return "N/A"


# Each summary is a 10-tuple unpacked in the order used by the header above.
for (file_num, f_name, s_brt, e_flux, delta, ef_err, f_avg, fa_err, d_obs, t_obs) in ellipse_summaries:
    table_rows.append([
        str(file_num),
        f_name,
        _format_or_na(s_brt, ".3f"),
        _format_or_na(e_flux, ".2f"),
        _format_or_na(delta, ".5f"),
        _format_or_na(ef_err, ".2f"),
        _format_or_na(f_avg, ".3f"),
        _format_or_na(fa_err, ".3f"),
        d_obs,
        t_obs,
    ])

print_dynamic_table(table_rows)

CELL 8 - The apparent magnitude and the delta of the Sun from 2015-01-19 to 2019-07-21 (about 4.5 years), to use in the upcoming graphs

In [None]:
# Cell 8 : The sun graph (Modified for dot markers for apparent magnitude)

url = "https://ssd.jpl.nasa.gov/api/horizons_file.api"

# Horizons batch request: an OBSERVER table for COMMAND='10' (the Sun, per
# this cell's title) seen from CENTER='500@-48', hourly steps, quantities 9
# and 20.
input_content = """
!$$SOF
COMMAND='10'
CENTER='500@-48'
MAKE_EPHEM='YES'
EPHEM_TYPE='OBSERVER'
START_TIME='2015-01-19 00:00:00'
STOP_TIME='2019-07-21 23:59:00'
STEP_SIZE='60m'
QUANTITIES='9,20'
OUT_UNITS='KM-S'
CSV_FORMAT='NO'
!$$EOF
"""

params = {"format": "text"}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an immediate, readable failure.
response = requests.post(
    url,
    data=params,
    files={'input': ('input.txt', input_content)},
    timeout=120,
)
response.raise_for_status()

lines = response.text.splitlines()

# The ephemeris rows sit between the $$SOE and $$EOE marker lines.
start_idx, end_idx = None, None
for i, line in enumerate(lines):
    if "$$SOE" in line:
        start_idx = i
    if "$$EOE" in line:
        end_idx = i
        break

if start_idx is None or end_idx is None:
    # Without the markers there is no table at all (e.g. Horizons rejected
    # the request); stop here rather than failing later on an empty frame.
    raise RuntimeError("Horizons response contains no $$SOE/$$EOE ephemeris block.")

timestamps, apmag_vals, delta_vals = [], [], []
skipped_rows = 0

for line in lines[start_idx+1:end_idx]:
    tokens = line.split()
    if not tokens:
        continue  # blank line
    if len(tokens) < 5:
        skipped_rows += 1
        continue
    # Row layout relied on here: tokens[0..1] = date and time,
    # tokens[2] = apparent magnitude, tokens[4] = delta (tokens[3] is unused).
    try:
        dt = pd.to_datetime(tokens[0] + " " + tokens[1])
        apmag = float(tokens[2])
        delta = float(tokens[4])
    except (ValueError, OverflowError):
        # Only parse failures are skipped; anything else should surface.
        skipped_rows += 1
        continue
    timestamps.append(dt)
    apmag_vals.append(apmag)
    delta_vals.append(delta)

if skipped_rows:
    print(f"Skipped {skipped_rows} ephemeris rows that could not be parsed.")

if not timestamps:
    raise RuntimeError("No ephemeris rows could be parsed from the Horizons response.")

df = pd.DataFrame({
    "datetime": timestamps,
    "apmag": apmag_vals,
    "delta": delta_vals
}).sort_values("datetime")

# Time axis in days, measured from the first ephemeris row.
start_time = df["datetime"].min()
df["days_since_start"] = (df["datetime"] - start_time).dt.total_seconds() / 86400.0

# Delta on the left axis (line), apparent magnitude on the right axis (dots).
fig, ax_left = plt.subplots(figsize=(12, 6))
ax_right = ax_left.twinx()

ax_left.plot(df["days_since_start"], df["delta"], color='blue', label='Delta')
ax_left.set_xlabel("Days Since Start")
ax_left.set_ylabel("Delta", color='blue')
ax_left.tick_params(axis='y', labelcolor='blue')

ax_right.scatter(df["days_since_start"], df["apmag"], color='red', s=0.5, label='Apparent Mag')
ax_right.set_ylabel("Apparent Magnitude", color='red')
ax_right.tick_params(axis='y', labelcolor='red')

plt.title("Apparent Magnitude (Right Axis) and Delta (Left Axis) vs. Time")

plt.show()

CELL 8 TEST CASE

In [None]:
# Cell 8 (TEST CASE): The sun graph (Modified for dot markers for apparent magnitude)
'''

import requests
import pandas as pd
import matplotlib.pyplot as plt

# Define the Horizons API endpoint
url = "https://ssd.jpl.nasa.gov/api/horizons_file.api"

# Define the input content for Horizons API
input_content = """
!$$SOF
COMMAND='10'
CENTER='500@-48'
MAKE_EPHEM='YES'
EPHEM_TYPE='OBSERVER'
START_TIME='2015-01-19 00:00:00'
STOP_TIME='2019-07-21 23:59:00'
STEP_SIZE='60m'
QUANTITIES='9,20'
OUT_UNITS='KM-S'
CSV_FORMAT='NO'
!$$EOF
"""

# Define request parameters
params = {"format": "text"}

# Send request to Horizons API
print("=== Sending Request to Horizons API ===")
response = requests.post(url, data=params, files={'input': ('input.txt', input_content)})

# Check if request was successful
if response.status_code != 200:
    print(f"Failed Horizons request: {response.status_code}")
    exit()

# Read response text
response_text = response.text
lines = response_text.splitlines()

# Display first 1500 characters of the response for debugging
#print("=== Full Horizons API Response Preview ===")
#print(response_text[:1500])
#print("===========================================\n")

# Find start and end indices for data extraction
start_idx, end_idx = None, None
for i, line in enumerate(lines):
    if "$$SOE" in line:
        start_idx = i
    if "$$EOE" in line:
        end_idx = i
        break

# Initialize lists for extracted data
timestamps, apmag_vals, delta_vals = [], [], []

# Extract data if valid start and end indices found
if start_idx is not None and end_idx is not None:
#    print("=== Extracted Data Points ===")
    for line in lines[start_idx+1:end_idx]:
        line = line.strip()
        if not line:
            continue
        tokens = line.split()
        if len(tokens) >= 5:
            dt_str = tokens[0] + " " + tokens[1]
            try:
                dt = pd.to_datetime(dt_str)
                apmag = float(tokens[2])
                delta = float(tokens[4])
                timestamps.append(dt)
                apmag_vals.append(apmag)
                delta_vals.append(delta)
#                print(f"Timestamp: {dt}, Apparent Mag: {apmag}, Delta: {delta}")  # Print extracted values
            except Exception as e:
                print(f"Skipping line due to error: {e}")
#    print("========================================\n")
else:
    print("Error: Could not find valid $$SOE or $$EOE markers in response.")
    exit()

# Convert extracted data into a DataFrame
df = pd.DataFrame({
    "datetime": timestamps,
    "apmag": apmag_vals,
    "delta": delta_vals
})

# Sort values by datetime
df.sort_values("datetime", inplace=True)

# Compute days since start time
start_time = df["datetime"].min()
df["days_since_start"] = (df["datetime"] - start_time).dt.total_seconds() / 86400.0

# Display the first few extracted values for verification
print("=== List of Delta Values ===")
print(df["delta"].tolist())
print("================================\n")

print("=== List of Apparent Magnitude Values ===")
print(df["apmag"].tolist())
print("=========================================\n")

# Generate side-by-side scatter plots
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Scatter plot for Delta
axes[0].scatter(df["days_since_start"], df["delta"], color='blue', s=1)
axes[0].set_xlabel("Days Since Start")
axes[0].set_ylabel("Delta")
axes[0].set_title("Delta vs. Time")

# Scatter plot for Apparent Magnitude
axes[1].scatter(df["days_since_start"], df["apmag"], color='red', s=1)
axes[1].set_xlabel("Days Since Start")
axes[1].set_ylabel("Apparent Magnitude")
axes[1].set_title("Apparent Magnitude vs. Time")

# Adjust layout for better viewing
plt.tight_layout()
plt.show()

'''

Cell 9 : 2. DELTA, Average Flux, and SB vs. Time Plots (Side by Side)

In [None]:
# Cell 9 : 2. DELTA, Average Flux, and SB vs. Time Plots (Side by Side)

# One record per summary row that has a usable observation timestamp.
delta_records, flux_records, sb_records = [], [], []

for file_num, f_name, s_brt, e_flux, dist_au, ef_err, f_avg, fa_err, d_obs, t_obs in ellipse_summaries:
    if d_obs == "N/A" or t_obs == "N/A":
        continue  # no timestamp, so the row cannot be placed on the time axis

    dt_parsed = pd.to_datetime(f"{d_obs} {t_obs}")
    delta_records.append((dt_parsed, dist_au))
    flux_records.append((dt_parsed, f_name, f_avg))
    sb_records.append((dt_parsed, s_brt))

if not delta_records:
    # Empty frames would otherwise fail further down with an obscure
    # ".dt accessor" error; say what is actually wrong instead.
    raise ValueError("No entry in ellipse_summaries has both DATE-OBS and TIME-OBS; nothing to plot.")

df_delta = pd.DataFrame(delta_records, columns=["datetime", "delta_au"]).sort_values("datetime")
df_flux = pd.DataFrame(flux_records, columns=["datetime", "filename", "f_avg"]).sort_values("datetime")
df_sb = pd.DataFrame(sb_records, columns=["datetime", "s_brt"]).sort_values("datetime")

# All three frames share one time origin: the earliest observation.
start_time = df_delta["datetime"].min()
for frame in (df_delta, df_flux, df_sb):
    frame["days_since_start"] = (frame["datetime"] - start_time).dt.total_seconds() / 86400.0

# What to draw for each quantity: source frame, value column, y-axis label,
# colour, and the panel title used in the side-by-side figure.
_delta_series = {"frame": df_delta, "column": "delta_au", "label": "Delta (AU)",
                 "color": "black", "title": "Distance (Delta) vs. Time"}
_flux_series = {"frame": df_flux, "column": "f_avg", "label": "Average Flux",
                "color": "blue", "title": "Average Flux vs. Time"}
_sb_series = {"frame": df_sb, "column": "s_brt", "label": "Surface Brightness",
              "color": "green", "title": "Surface Brightness vs. Time"}


def _scatter_series(ax, series):
    """Scatter one series on ``ax`` against days since start."""
    frame = series["frame"]
    ax.scatter(frame["days_since_start"], frame[series["column"]],
               color=series["color"], alpha=0.75)


def _overlay_scatter(left, right, title):
    """Draw two series on twin y-axes of one figure and show it.

    Args:
        left: Series dict drawn on the left y-axis.
        right: Series dict drawn on the right (twin) y-axis.
        title: Figure title.

    Returns:
        Tuple ``(fig, ax_left, ax_right)``.
    """
    fig, ax_left = plt.subplots(figsize=(12, 6))
    _scatter_series(ax_left, left)
    ax_left.set_xlabel("Days Since Start")
    ax_left.set_ylabel(left["label"], color=left["color"])
    ax_left.tick_params(axis='y', labelcolor=left["color"])

    ax_right = ax_left.twinx()
    _scatter_series(ax_right, right)
    ax_right.set_ylabel(right["label"], color=right["color"])
    ax_right.tick_params(axis='y', labelcolor=right["color"])

    ax_right.set_title(title)
    plt.show()
    return fig, ax_left, ax_right


# -------------------- PLOT 1: side-by-side figures --------------------
fig, axs = plt.subplots(ncols=3, figsize=(18, 6))

for panel, series in zip(axs, (_delta_series, _flux_series, _sb_series)):
    _scatter_series(panel, series)
    panel.set_xlabel("Days Since Start")
    panel.set_ylabel(series["label"])
    panel.set_title(series["title"])

plt.tight_layout()
plt.show()

# -------------------- PLOT 2: Overlay of Delta & Average Flux vs. Time --------------------
fig, ax1, ax1_right = _overlay_scatter(
    _delta_series, _flux_series, "Overlay: Delta and Average Flux vs. Time")

# -------------------- PLOT 3: Overlay of Delta & Surface Brightness vs. Time --------------------
fig, ax2, ax2_right = _overlay_scatter(
    _delta_series, _sb_series, "Overlay: Delta and Surface Brightness vs. Time")

# -------------------- PLOT 4: Overlay of Average Flux & Surface Brightness vs. Time --------------------
fig, ax3, ax3_right = _overlay_scatter(
    _flux_series, _sb_series, "Overlay: Average Flux and Surface Brightness vs. Time")


CELL 10 - overlay to see all the plots on a single graph

In [None]:
# Cell 10 : quadruple overlay

from mpl_toolkits.axes_grid1 import host_subplot
import mpl_toolkits.axisartist as AA
import numpy as np
import matplotlib.pyplot as plt

# Create figure and host subplot; the right margin leaves room for the
# additional y-axes stacked on the right-hand side.
fig = plt.figure(figsize=(14, 8))
host = host_subplot(111, axes_class=AA.Axes)
plt.subplots_adjust(right=0.75)

# One parasite axis per extra series, all sharing the host's x-axis.
par1, par2, par3, par4 = host.twinx(), host.twinx(), host.twinx(), host.twinx()

# par1 keeps the default right-hand position; each further axis is pushed
# another 60 points outwards so the scales do not overlap.
offset = 60
for par in [par2, par3, par4]:
    new_fixed_axis = par.get_grid_helper().new_fixed_axis
    par.axis["right"] = new_fixed_axis(loc="right", axes=par, offset=(offset, 0))
    offset += 60

# The Sun frame (df) and the Jupiter frames each store 'days_since_start'
# relative to their OWN first timestamp. Plotting those columns on one shared
# x-axis would misalign the series whenever the two data sets do not begin at
# the same instant, so the x values are re-derived here from a single origin:
# the earliest timestamp across all four frames. The frames are not modified.
time_origin = pd.concat(
    [df["datetime"], df_delta["datetime"], df_flux["datetime"], df_sb["datetime"]]
).min()


def _days_since_origin(frame):
    """Return the frame's 'datetime' column as days elapsed since ``time_origin``."""
    return (frame["datetime"] - time_origin).dt.total_seconds() / 86400.0


sun_days = _days_since_origin(df)
delta_days = _days_since_origin(df_delta)
flux_days = _days_since_origin(df_flux)
sb_days = _days_since_origin(df_sb)

# Plot data
p1, = host.plot(sun_days, df["delta"], color='blue', label='Sun Delta')
p2 = par1.scatter(sun_days, df["apmag"], color='red', s=1, label='Sun Apparent Mag')
p3 = par2.scatter(delta_days, df_delta["delta_au"], color='black', alpha=0.75, label='Jupiter Delta')
p4 = par3.scatter(flux_days, df_flux["f_avg"], color='blue', alpha=0.75, label='Jupiter Avg Flux')
p5 = par4.scatter(sb_days, df_sb["s_brt"], color='green', alpha=0.75, label='Jupiter SB')

# Axis labels
host.set_xlabel("Days Since Start")
host.set_ylabel("Sun Delta")
par1.set_ylabel("Sun Apparent Mag")
par2.set_ylabel("Jupiter Delta")
par3.set_ylabel("Jupiter Avg Flux")
par4.set_ylabel("Jupiter Surface Brightness")

# Colour each axis label like the series it belongs to
host.axis["left"].label.set_color('blue')
par1.axis["right"].label.set_color('red')
par2.axis["right"].label.set_color('black')
par3.axis["right"].label.set_color('blue')
par4.axis["right"].label.set_color('green')

# X-axis ticks every 100 days, covering the longest series
max_days = max(sun_days.max(), delta_days.max(), flux_days.max(), sb_days.max())
host.set_xticks(np.arange(0, max_days + 100, 100))

# Plot title
plt.title("Combined Quadruple Axis Plot (5 Y-Axes) vs. Time")
plt.draw()
plt.show()


CELL 11A - a series of subplots for Average flux vs time with the respective images to go along with the data points<br>
CELL 11B - a series of subplots for delta vs time with the respective images to go along with the data points<br>
CELL 11C - a series of subplots for Surface Brightness vs time with the respective images to go along with the data points

TEST CASE BEING CREATED FOR CELL 11

In [None]:
# ====================== GRAND CELL: 11A, 11B, and 11C ======================
# This single cell:
#   1) Forces each of df_flux, df_delta, df_sb to have "filename" columns
#   2) Then runs DBSCAN + plotting code for 11A, 11B, 11C in order
#   3) Displays images automatically if the new "filename" columns match cropped_images

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.cluster import DBSCAN

# ADDED IMPORT to control tick formatting
import matplotlib.ticker as mticker

###############################################################################
#          GLOBAL ADJUSTABLE SIZES (POINTS, ANNOTATIONS, FILE NUMBERS)
###############################################################################
POINT_SIZE = 50                 # marker size of the scatter points
ANNOTATION_FONTSIZE = 16        # font size of the numbers drawn next to points
FILE_NUMBER_FONTSIZE = 16       # font size of the "File #XX" image titles

###############################################################################
#                  GLOBAL FONT AND SIZE ADJUSTMENTS FOR PLOTS
###############################################################################
# rcParams are process-wide: every figure created after this point picks
# these values up, so they only need to be tuned here.
plt.rcParams.update({
    "font.size": 16,
    "axes.labelsize": 16,
    "axes.titlesize": 18,
    "legend.fontsize": 12,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "axes.formatter.useoffset": False,   # no offset notation on tick labels
})

###############################################################################
#             STEP 0: FORCE 'filename' COLUMN IN DF_DELTA AND DF_SB
###############################################################################
# df_flux is the only frame that carries 'filename'; it is copied onto the
# other two frames by a left merge on 'datetime'.

print("=== MERGING to ensure each DataFrame has a 'filename' column ===\n")

# Work on copies so the frames built earlier stay untouched.
df_flux_merged  = df_flux.copy()
df_delta_merged = df_delta.copy()
df_sb_merged    = df_sb.copy()

# df_flux must provide both the merge key and the column being propagated.
for _needed, _problem in (
    ("filename", "df_flux must have a 'filename' column so we can propagate it to df_delta & df_sb."),
    ("datetime", "df_flux must have a 'datetime' column so we can merge on it."),
):
    if _needed not in df_flux_merged.columns:
        raise ValueError(_problem)


def _with_filename(frame, frame_name):
    """Return ``frame`` left-merged with df_flux_merged's 'filename' on 'datetime'.

    Raises:
        ValueError: if ``frame`` has no 'datetime' column to merge on.
    """
    if "datetime" not in frame.columns:
        raise ValueError(f"{frame_name} must have a 'datetime' column for merging.")
    return frame.merge(
        df_flux_merged[["datetime", "filename"]],
        on="datetime",
        how="left",
    )


df_delta_merged = _with_filename(df_delta_merged, "df_delta")
df_sb_merged = _with_filename(df_sb_merged, "df_sb")

print("Merged 'filename' into df_delta_merged and df_sb_merged.\n")
print("=== Checking the first 3 rows of each merged DataFrame ===")
for _frame_label, _frame in (
    ("df_flux_merged", df_flux_merged),
    ("df_delta_merged", df_delta_merged),
    ("df_sb_merged", df_sb_merged),
):
    print(f"\n--> {_frame_label}:")
    print(_frame.head(3))
print("\n\n")


In [None]:
###############################################################################
#                             CELL 11A
###############################################################################

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.cluster import DBSCAN
import matplotlib.ticker as mticker # Ensure this is imported

###############################################################################
#          GLOBAL ADJUSTABLE SIZES (POINTS, ANNOTATIONS, IMAGE NUMBERS)
###############################################################################
POINT_SIZE = 50                # marker size of the scatter points
ANNOTATION_FONTSIZE = 16       # font size of the numbers next to scatter points
IMAGE_NUMBER_FONTSIZE = 16     # font size of the number overlaid ON each image
IMAGE_NUMBER_X_OFFSET = 0      # pixel offset of that number from the left edge
IMAGE_NUMBER_Y_OFFSET = 100    # pixel offset of that number from the top edge

###############################################################################
#                  GLOBAL FONT AND SIZE ADJUSTMENTS FOR PLOTS
###############################################################################
# Applied one key at a time; rcParams are process-wide, so every figure
# created after this point uses these values.
for _rc_key, _rc_value in (
    ("font.size", 16),
    ("axes.labelsize", 16),
    ("axes.titlesize", 18),
    ("legend.fontsize", 12),
    ("xtick.labelsize", 16),
    ("ytick.labelsize", 16),
    ("axes.formatter.useoffset", False),   # no offset notation on tick labels
):
    plt.rcParams[_rc_key] = _rc_value

###############################################################################
#                             MODIFIED FUNCTION
###############################################################################

def _group_cluster_labels(cluster_labels, merges=None, use_merge_plots=True):
    """Split cluster labels into the groups that each get one figure.

    Args:
        cluster_labels: Iterable of the (already shifted, noise-free) labels
            that DBSCAN produced, in the order they should be plotted.
        merges: Optional list of label collections to draw together.
        use_merge_plots: When False, `merges` is ignored and every label is
            plotted on its own.

    Returns:
        list[list[int]]: Sorted label lists, one per figure. Labels requested
        in `merges` that do not exist are dropped, and a label is never placed
        in more than one group even if the merge sets overlap.
    """
    ordered = [int(lbl) for lbl in cluster_labels]

    merge_sets = []
    if use_merge_plots and merges:
        for requested in merges:
            wanted = set(requested)
            # Keep only labels that really exist so titles/legends stay honest.
            merge_sets.append({lbl for lbl in ordered if lbl in wanted})

    groups = []
    assigned = set()
    for lbl in ordered:
        if lbl in assigned:
            continue
        group = {lbl}
        for mset in merge_sets:
            if lbl in mset:
                # Exclude labels already drawn by an earlier, overlapping merge.
                group = mset - assigned
                break
        groups.append(sorted(group))
        assigned.update(group)
    return groups


def _padded_limits(low, high, margin_scale, fallback_pad):
    """Return (low, high) widened by `margin_scale` of the span, or by
    `fallback_pad` on each side when the span is zero (single point)."""
    span = high - low
    pad = margin_scale * span if span > 0 else fallback_pad
    return low - pad, high + pad


def run_dbscan_and_plot_with_images(
    df_flux,
    eps=0.25,
    min_samples=3,
    margin_scale=0.1,
    use_merge_plots=True,
    merges=None,
    cropped_images=None
):
    """
    Cluster observations in time with DBSCAN and draw one figure per cluster
    (or merged group of clusters): a flux-vs-time scatter plot on top and the
    matching cropped images, numbered, underneath.

    Clustering is one-dimensional, on 'days_since_start' only. DBSCAN labels
    are shifted by +1 so the first cluster is 1; noise keeps label -1, is
    counted in the printed summary, and is not plotted.

    Args:
        df_flux (pd.DataFrame): Must contain 'datetime', 'f_avg' and
            'days_since_start'; 'filename' is optional and only needed to
            match images. The caller's frame is not modified.
        eps (float): DBSCAN neighbourhood radius, in the units of
            'days_since_start'.
        min_samples (int): DBSCAN core-point threshold.
        margin_scale (float): Fraction of the data span added as padding on
            each side of both axes.
        use_merge_plots (bool): If True, clusters listed together in `merges`
            share a figure. If False, `merges` is ignored.
        merges (list[list[int]], optional): Groups of (shifted) cluster labels
            to plot together. Labels that DBSCAN did not produce are ignored.
        cropped_images (list[tuple], optional): Tuples of
            (index, filename, date_obs, time_obs, image_array); images whose
            filename appears in a cluster are shown below its scatter plot.

    Returns:
        None. Progress, warnings and errors are reported with print(); on a
        fatal problem (missing column, no usable rows, DBSCAN failure, no
        clusters) the function prints the reason and returns early.
    """
    # Tick spacing used on the time axis while it stays readable; beyond
    # MAX_FINE_TICKS ticks an automatic locator is used instead.
    FINE_TICK_STEP = 0.1
    MAX_FINE_TICKS = 40

    print("\n==========================================")
    print("    DBSCAN + Image Plotting (Unified)")
    print("==========================================\n")

    required_cols = ["datetime", "f_avg", "days_since_start"]
    for c in required_cols:
        if c not in df_flux.columns:
            print(f"ERROR: Missing column '{c}' in df_flux! Cannot proceed.")
            return

    if len(df_flux) == 0:
        print("ERROR: df_flux is empty! Cannot proceed.")
        return

    print("STEP 1: Checking df_flux sample (first 5 rows):")
    print(df_flux.head(5))

    # Work on a copy. The index number is assigned BEFORE any row is dropped so
    # that it keeps referring to the row's position in the caller's frame.
    df_flux = df_flux.copy()
    df_flux["index_number"] = np.arange(len(df_flux)) + 1
    for col in ("f_avg", "days_since_start"):
        df_flux[col] = pd.to_numeric(df_flux[col], errors="coerce")

    print("\nSTEP 2: Checking for NaN in 'f_avg' or 'days_since_start':")
    favg_ok = np.isfinite(df_flux["f_avg"].to_numpy(dtype=float, na_value=np.nan))
    days_ok = np.isfinite(df_flux["days_since_start"].to_numpy(dtype=float, na_value=np.nan))
    nan_count_favg = int((~favg_ok).sum())
    nan_count_days = int((~days_ok).sum())
    if nan_count_favg > 0 or nan_count_days > 0:
        print(f"WARNING: Found {nan_count_favg} NaN/Inf in 'f_avg' and {nan_count_days} in 'days_since_start'.")
        # DBSCAN rejects non-finite input and NaN flux cannot be placed on the
        # plot, so such rows are removed instead of aborting the whole run.
        usable = favg_ok & days_ok
        print(f"         Dropping {int((~usable).sum())} unusable row(s) before clustering.")
        df_flux = df_flux.loc[usable].copy()
        if len(df_flux) == 0:
            print("ERROR: No usable rows remain after dropping NaN/Inf. Cannot proceed.")
            return
    else:
        print("No NaN values found.")

    print("\nSTEP 3: Range check for 'days_since_start':")
    days_min = df_flux["days_since_start"].min()
    days_max = df_flux["days_since_start"].max()
    print(f"days_since_start => min: {days_min:.3f}, max: {days_max:.3f}")

    print(f"\nSTEP 4: Running DBSCAN (eps={eps}, min_samples={min_samples})...")
    X = df_flux["days_since_start"].values.reshape(-1, 1)
    try:
        db = DBSCAN(eps=eps, min_samples=min_samples)
        cluster_labels = db.fit_predict(X)
    except Exception as e:
        print(f"ERROR: DBSCAN failed: {e}")
        return

    # Shift valid cluster labels to start from 1 (0 becomes 1, 1 becomes 2, ...);
    # noise stays at -1.
    shifted_labels = np.where(cluster_labels >= 0, cluster_labels + 1, -1)
    df_flux["cluster"] = shifted_labels
    clusters = np.unique(shifted_labels[shifted_labels != -1])  # sorted

    print("DBSCAN completed.")
    print(f"Total data points: {len(df_flux)}")
    n_noise = int(np.count_nonzero(cluster_labels == -1))
    print(f"Noise points: {n_noise}")
    print(f"Valid cluster labels (excluding -1, shifted +1): {clusters}")

    print("\nSTEP 5: Summarizing clusters:")
    if len(clusters) == 0:
        print("No valid clusters (all noise?).")
        return

    for cluster_id in clusters:
        cdata = df_flux[df_flux["cluster"] == cluster_id]
        cmin = cdata["days_since_start"].min()
        cmax = cdata["days_since_start"].max()
        csize = len(cdata)
        print(f"  Cluster {cluster_id}: size={csize}, time-range=({cmin:.2f}, {cmax:.2f})")

    # Decide which clusters share a figure (honours use_merge_plots / merges).
    plot_groups = _group_cluster_labels(clusters, merges, use_merge_plots)

    print("\nSTEP 6: Creating cluster subplots + images in one figure...")

    filename_col_present = "filename" in df_flux.columns
    if not filename_col_present:
        print("NOTE: 'filename' column not in df_flux. Will skip image matching.")

    have_cropped_images = (cropped_images is not None) and (len(cropped_images) > 0)
    if not have_cropped_images:
        print("NOTE: 'cropped_images' is None or empty. Will skip image display.\n")

    # --- Plotting Loop: one figure per group ---
    for label_list in plot_groups:
        sub_df = df_flux[df_flux["cluster"].isin(label_list)].copy()
        if len(sub_df) == 0:
            print(f"No data for clusters {label_list}, skipping.")
            continue

        snippet_data = sub_df.head(5)
        print(f"\n   => Plotting cluster(s) {label_list}, total points: {len(sub_df)}")
        print("   First 5 lines (with index_number):")
        print(snippet_data[["index_number","days_since_start","f_avg"]].to_string(index=False))

        # Find matching images for this cluster/merge group (matched by filename).
        matching_images = []
        if filename_col_present and have_cropped_images:
            valid_filenames = set(sub_df["filename"].unique())
            matching_images = [
                (fidx, fname, dobs, tobs, img_8u)
                for (fidx, fname, dobs, tobs, img_8u) in cropped_images
                if fname in valid_filenames
            ]
            # Sort matching images by file index to maintain order
            matching_images.sort(key=lambda x: x[0])

        # Setup figure grid: one tall row for the scatter plot, then image rows.
        n_img = len(matching_images)
        img_cols = 8  # Max images per row
        img_rows = math.ceil(n_img / img_cols) if n_img > 0 else 0
        fig_width = 25
        fig_height = 6 + (3 * img_rows)  # 6 for main plot + 3 per image row

        fig = plt.figure(figsize=(fig_width, fig_height))
        gs = GridSpec(nrows=img_rows + 1, ncols=img_cols,
                      height_ratios=[6] + [3] * img_rows,
                      figure=fig)

        # --- Scatter Plot (spans all columns of the first grid row) ---
        ax_scatter = fig.add_subplot(gs[0, :])
        colors = plt.cm.tab10(np.linspace(0, 1, len(label_list)))

        for i, lbl2 in enumerate(label_list):
            cdata = sub_df[sub_df["cluster"] == lbl2]
            ax_scatter.scatter(
                cdata["days_since_start"],
                cdata["f_avg"],
                color=colors[i],
                alpha=0.75,
                s=POINT_SIZE,
                label=f"Cluster {lbl2}"
            )
            # Annotate points with their index number. Columns are zipped
            # (instead of iterrows) so the integer is not upcast to a float
            # and rendered as e.g. "3.0".
            for x_pt, y_pt, idx_num in zip(cdata["days_since_start"],
                                           cdata["f_avg"],
                                           cdata["index_number"]):
                ax_scatter.text(
                    x_pt,
                    y_pt,
                    str(int(idx_num)),
                    fontsize=ANNOTATION_FONTSIZE,
                    ha='left',
                    va='bottom'
                )

        # Set title based on whether clusters were merged
        if len(label_list) > 1:
            ax_scatter.set_title(f"Merged Clusters {label_list}", fontsize=14)
            ax_scatter.legend()  # Show legend only for merged plots
        else:
            ax_scatter.set_title(f"Cluster {label_list[0]}", fontsize=14)

        # Plot limits with margins (fixed padding when there is a single value)
        t_lo, t_hi = _padded_limits(
            sub_df["days_since_start"].min(), sub_df["days_since_start"].max(),
            margin_scale, 0.1
        )
        ax_scatter.set_xlim(t_lo, t_hi)

        f_lo, f_hi = _padded_limits(
            sub_df["f_avg"].min(), sub_df["f_avg"].max(),
            margin_scale, 0.5
        )
        ax_scatter.set_ylim(f_lo, f_hi)

        # Labels and ticks
        ax_scatter.set_xlabel("Days Since Start")
        ax_scatter.set_ylabel("Average Flux")
        if (t_hi - t_lo) / FINE_TICK_STEP <= MAX_FINE_TICKS:
            ax_scatter.xaxis.set_major_locator(mticker.MultipleLocator(FINE_TICK_STEP))
        else:
            # A fixed 0.1-day step over a long (e.g. merged) time range would
            # request far too many ticks; let Matplotlib pick a sensible number.
            ax_scatter.xaxis.set_major_locator(mticker.MaxNLocator(nbins=10))
        ax_scatter.ticklabel_format(style='plain', useOffset=False, axis='x')
        ax_scatter.ticklabel_format(style='plain', useOffset=False, axis='y')
        ax_scatter.yaxis.set_major_locator(mticker.MaxNLocator(nbins='auto'))  # Auto Y ticks

        # --- Image Subplots ---
        if n_img > 0:
            print(f"   => Plotting {n_img} associated image(s)...")
            for i_img, (fidx2, fname2, dobs2, tobs2, img_8u) in enumerate(matching_images):
                row_i = 1 + (i_img // img_cols)  # Image rows start below the scatter row
                col_i = i_img % img_cols
                ax_img = fig.add_subplot(gs[row_i, col_i])
                ax_img.imshow(img_8u, cmap='gray', origin='lower')

                # Overlay the file number inside the image. The position is in
                # data (pixel) coordinates; because origin='lower', the Y value
                # counts rows upward from the BOTTOM edge of the image.
                ax_img.text(
                    IMAGE_NUMBER_X_OFFSET,  # pixel column of the text's left edge
                    IMAGE_NUMBER_Y_OFFSET,  # pixel row of the text's top edge
                    f"{fidx2}",
                    color='white',
                    fontsize=IMAGE_NUMBER_FONTSIZE,
                    ha='left',
                    va='top'
                    # Optional: Add a background box for contrast
                    #bbox=dict(facecolor='black', alpha=0.5, pad=0.3, boxstyle='round,pad=0.3')
                )

                ax_img.axis("off")  # Hide axes for the image subplot
        else:
            print("   => No matching images found or provided for this cluster group.")

        plt.tight_layout()  # Adjust spacing
        plt.show()  # Display the combined figure

        # Print list of files shown
        if filename_col_present and have_cropped_images and n_img > 0:
            print(f"   => Images shown correspond to files:")
            for (xxid, xxfn, ddt, ddt2, _) in matching_images:
                print(f"       * File #{xxid} => {xxfn} => {ddt} {ddt2}")

        print("   ----------------------------------------------------\n")

    print("===== End of DBSCAN + Image Plotting Process (11A) =====\n")

# Example call (needs df_flux_merged and cropped_images from earlier cells).
# Settings are collected in one dict so they are easy to tweak and re-run.
cell_11a_options = {
    # Neighbourhood radius, in the units of 'days_since_start'.
    "eps": 0.5,
    # With min_samples=1 every point is a core point, so DBSCAN reports no noise.
    "min_samples": 1,
    # Example merge: labels are the +1-shifted cluster numbers printed by the function.
    "merges": [[11,12,13]],
    # List of (index, filename, date_obs, time_obs, image_array) tuples.
    "cropped_images": cropped_images,
}
run_dbscan_and_plot_with_images(df_flux_merged, **cell_11a_options)
print("=== END CELL 11A ===\n\n\n")

In [None]:
###############################################################################
#                       CELL 11A (with Sine Wave Overlay)
###############################################################################

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.cluster import DBSCAN
import matplotlib.ticker as mticker # Ensure this is imported
from scipy.optimize import curve_fit # <-- ADDED IMPORT FOR SINE FIT

###############################################################################
#          GLOBAL ADJUSTABLE SIZES (POINTS, ANNOTATIONS, FILE NUMBERS)
###############################################################################
# NOTE: these names are the same module-level globals assigned in the previous
# cell, so running this cell overwrites the earlier values (the image offsets
# change from 0/100 to 5/15). This cell also re-defines
# run_dbscan_and_plot_with_images further down, replacing the previous cell's
# version for every later call.
POINT_SIZE = 50                # Size of the scatter data points
ANNOTATION_FONTSIZE = 16       # Font size for the numbers next to scatter points
# FILE_NUMBER_FONTSIZE = 16    # Font size for "File #XX" titles (REMOVED)
IMAGE_NUMBER_FONTSIZE = 16     # Font size for the number overlay ON the image
IMAGE_NUMBER_X_OFFSET = 5      # Pixel offset from left edge for image number (Adjusted slightly)
# NOTE(review): whether this offset is measured from the top or the bottom edge
# depends on the imshow origin used by the plotting function below - confirm.
IMAGE_NUMBER_Y_OFFSET = 15     # Pixel offset from top edge for image number (Adjusted from bottom)

###############################################################################
#                  GLOBAL FONT AND SIZE ADJUSTMENTS FOR PLOTS
###############################################################################
# Same rcParams values as the previous cell; repeated so this cell gives the
# same styling when it is run on its own. rcParams are process-wide.
plt.rcParams.update({
    "font.size": 16,
    "axes.labelsize": 16,
    "axes.titlesize": 18,
    "legend.fontsize": 12,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    # Show full tick values instead of a shared offset on the axis.
    "axes.formatter.useoffset": False
})

###############################################################################
#                           SINE WAVE FUNCTION (ADDED)
###############################################################################

def sine_func(x, amplitude, frequency, phase_shift, vertical_offset):
    """Evaluate the sinusoid ``amplitude * sin(frequency * x + phase_shift) + vertical_offset``.

    Args:
        x: Scalar or array-like sample positions; converted with np.asarray so
           the evaluation is vectorised.
        amplitude: Multiplier applied to the sine term.
        frequency: Angular frequency, i.e. radians per unit of ``x``.
        phase_shift: Phase added inside the sine, in radians.
        vertical_offset: Constant added to the result.

    Returns:
        NumPy scalar/array with the same shape as ``x``.
    """
    samples = np.asarray(x)
    phase = frequency * samples + phase_shift
    # No parameter clamping here: constraints (e.g. amplitude >= 0) are left to
    # the optimiser's bounds.
    return amplitude * np.sin(phase) + vertical_offset

###############################################################################
#                            MODIFIED FUNCTION
###############################################################################

def run_dbscan_and_plot_with_images(
    df_flux,
    eps=0.25,
    min_samples=3,
    margin_scale=0.1,
    use_merge_plots=True, # Kept for compatibility, but merging logic relies on 'merges'
    merges=None,
    cropped_images=None
):
    """
    Runs DBSCAN clustering on time-series data and plots clusters along with
    corresponding cropped images and an overlaid sine wave fit on the scatter plot.
    Image numbers are overlaid on the images.

    Args:
        df_flux (pd.DataFrame): DataFrame containing 'datetime', 'f_avg',
                                'days_since_start', and optionally 'filename'.
        eps (float): The maximum distance between two samples for one to be
                     considered as in the neighborhood of the other (DBSCAN).
        min_samples (int): The number of samples in a neighborhood for a point
                           to be considered as a core point (DBSCAN).
        margin_scale (float): Factor to extend plot limits beyond data range.
        use_merge_plots (bool): Kept for compatibility, but merging logic now
                                solely relies on the 'merges' argument.
        merges (list[list[int]], optional): List of lists, where each inner list
                                           contains cluster labels to merge.
        cropped_images (list[tuple], optional): List of tuples like
                                (index, filename, date_obs, time_obs, image_array).
                                Needed to display images.
    """
    print("\n=======================================================")
    print("   DBSCAN + Image Plotting + Sine Fit (Unified)")
    print("=======================================================\n")

    required_cols = ["datetime", "f_avg", "days_since_start"]
    for c in required_cols:
        if c not in df_flux.columns:
            print(f"ERROR: Missing column '{c}' in df_flux! Cannot proceed.")
            return

    # --- Data Validation (Added checks for NaN/Inf) ---
    print("STEP 1: Checking df_flux sample (first 5 rows):")
    print(df_flux.head(5))

    print("\nSTEP 2: Checking for NaN/Inf in 'f_avg' or 'days_since_start':")
    nan_inf_favg = df_flux["f_avg"].isna().sum() + np.isinf(df_flux["f_avg"]).sum()
    nan_inf_days = df_flux["days_since_start"].isna().sum() + np.isinf(df_flux["days_since_start"]).sum()

    if nan_inf_favg > 0 or nan_inf_days > 0:
        print(f"WARNING: Found {nan_inf_favg} NaN/Inf in 'f_avg' and {nan_inf_days} in 'days_since_start'.")
        print("Attempting to remove rows with NaN/Inf values...")
        original_len = len(df_flux)
        df_flux = df_flux.replace([np.inf, -np.inf], np.nan).dropna(subset=['f_avg', 'days_since_start'])
        print(f"Removed {original_len - len(df_flux)} rows.")
        if len(df_flux) == 0:
            print("ERROR: No valid data remaining after removing NaN/Inf. Cannot proceed.")
            return
    else:
        print("No NaN or Inf values found.")

    print("\nSTEP 3: Range check for 'days_since_start':")
    # Ensure data types are numeric before min/max
    df_flux["days_since_start"] = pd.to_numeric(df_flux["days_since_start"], errors='coerce')
    df_flux["f_avg"] = pd.to_numeric(df_flux["f_avg"], errors='coerce')
    df_flux = df_flux.dropna(subset=['days_since_start', 'f_avg']) # Drop if conversion failed

    if len(df_flux) == 0:
         print("ERROR: No numeric data remaining after type conversion. Cannot proceed.")
         return

    days_min = df_flux["days_since_start"].min()
    days_max = df_flux["days_since_start"].max()
    print(f"days_since_start => min: {days_min:.3f}, max: {days_max:.3f}")

    # --- DBSCAN ---
    # Make copy and add index number *after* cleaning
    df_flux = df_flux.copy()
    # Use reset_index to get a reliable sequential index if rows were dropped
    df_flux = df_flux.reset_index(drop=True)
    df_flux["index_number"] = df_flux.index + 1 # Sequential index for valid data

    print(f"\nSTEP 4: Running DBSCAN (eps={eps}, min_samples={min_samples})...")
    X = df_flux["days_since_start"].values.reshape(-1, 1)
    try:
        db = DBSCAN(eps=eps, min_samples=min_samples)
        cluster_labels = db.fit_predict(X)
    except Exception as e:
        print(f"ERROR: DBSCAN failed: {e}")
        return

    df_flux["cluster"] = cluster_labels
    # Shift valid cluster labels to start from 1 (0 becomes 1, 1 becomes 2, etc.)
    df_flux.loc[df_flux["cluster"] >= 0, "cluster"] = df_flux["cluster"] + 1

    clusters = df_flux[df_flux["cluster"] != -1]["cluster"].unique()
    clusters.sort()

    print("DBSCAN completed.")
    print(f"Total valid data points analyzed: {len(df_flux)}")
    n_noise = sum(cluster_labels == -1)
    print(f"Noise points (-1): {n_noise}")
    print(f"Valid cluster labels found (shifted +1): {clusters}")

    print("\nSTEP 5: Summarizing clusters:")
    if len(clusters) == 0 and n_noise == 0: # Check if there's no data at all
        print("No clusters or noise points found.")
        return
    elif len(clusters) == 0:
         print("No valid clusters found (only noise or too few points?).")
         # Continue if noise points exist

    cluster_summary = {}
    for label in clusters:
        cdata = df_flux[df_flux["cluster"] == label]
        cmin = cdata["days_since_start"].min()
        cmax = cdata["days_since_start"].max()
        csize = len(cdata)
        cluster_summary[label] = {'size': csize, 'min_t': cmin, 'max_t': cmax}
        print(f"   Cluster {label}: size={csize}, time-range=({cmin:.2f}, {cmax:.2f})")

    # --- Plotting Preparation ---
    if merges is None:
        merges = []
    merges = [set(m) for m in merges] # Ensure elements are sets for easier checking
    plot_items = []
    used_clusters = set()

    # Add merged clusters first
    for mset in merges:
        valid_merge_clusters = {lbl for lbl in mset if lbl in cluster_summary} # Only include clusters that actually exist
        if valid_merge_clusters:
            plot_items.append(valid_merge_clusters)
            used_clusters.update(valid_merge_clusters)

    # Add remaining individual clusters
    for lbl in clusters:
        if lbl not in used_clusters:
            plot_items.append({lbl}) # Store single clusters as sets too
            used_clusters.add(lbl)

    # Add noise points as a separate item if they exist
    if n_noise > 0:
        plot_items.append({-1}) # Use -1 to represent noise

    print("\nSTEP 6: Creating plots for clusters/merges/noise...")

    filename_col_present = "filename" in df_flux.columns
    if not filename_col_present:
        print("NOTE: 'filename' column not in df_flux. Will skip image matching.")

    have_cropped_images = (cropped_images is not None) and (len(cropped_images) > 0)
    if not have_cropped_images:
        print("NOTE: 'cropped_images' is None or empty. Will skip image display.\\n")
    elif not filename_col_present:
         print("NOTE: Cannot display images without 'filename' column in DataFrame.\\n")


    # --- Plotting Loop ---
    for item_set in plot_items:
        is_noise_plot = -1 in item_set
        if is_noise_plot:
             label_list = [-1]
             sub_df = df_flux[df_flux["cluster"] == -1].copy()
             plot_title = "Noise Points"
             is_merged = False # Noise is not considered merged
        else:
            label_list = sorted(list(item_set)) # Convert set back to sorted list
            sub_df = df_flux[df_flux["cluster"].isin(label_list)].copy()
            is_merged = len(label_list) > 1
            if is_merged:
                 plot_title = f"Merged Clusters {label_list}"
            else:
                 plot_title = f"Cluster {label_list[0]}"


        if len(sub_df) == 0:
            print(f"No data for {'noise' if is_noise_plot else f'cluster(s) {label_list}'}, skipping plot.")
            continue

        # Sort sub_df by time for potentially better sine fitting and consistent plotting
        sub_df = sub_df.sort_values(by="days_since_start")

        print(f"\n   => Plotting: {plot_title}, total points: {len(sub_df)}")
        # print("   First 5 lines (sorted by time, with index_number):")
        # print(sub_df[["index_number","days_since_start","f_avg"]].head(5).to_string(index=False))

        # Find matching images (only if not noise and requirements met)
        matching_images = []
        if not is_noise_plot and filename_col_present and have_cropped_images:
            valid_filenames = set(sub_df["filename"].unique())
            # Create a mapping from filename to original index_number for correct sorting
            # This assumes 'filename' uniquely maps to an 'index_number' in the *cleaned* df_flux
            fname_to_index = sub_df.set_index('filename')['index_number'].to_dict()

            temp_matching = []
            # The `cropped_images` list uses the *original* file index (fidx from Cell 5 enumerate)
            # We need to map this back to the `index_number` used in the plotting dataframe
            # Let's rebuild `fname_to_index` using the original `file_details` if available
            # or just map filename to the *current* index_number if `file_details` isn't passed
            # For simplicity here, we assume `fname_to_index` based on current `sub_df` is sufficient
            # for retrieving images associated with points *in this cluster*.

            # We need to link `cropped_images` (which has original fidx) to `sub_df` (which has index_number)
            # Best way is via filename if it's guaranteed unique
            sub_df_filenames = set(sub_df['filename'])
            for (fidx_orig, fname, dobs, tobs, img_8u) in cropped_images:
                 if fname in sub_df_filenames:
                      # Find the corresponding index_number in sub_df for this filename
                      matched_rows = sub_df[sub_df['filename'] == fname]
                      if not matched_rows.empty:
                           current_index_num = matched_rows['index_number'].iloc[0] # Get index_number for sorting plot items
                           # Store (current_index_num_for_sorting, original_fidx_for_display, fname, dobs, tobs, img_8u)
                           temp_matching.append((current_index_num, fidx_orig, fname, dobs, tobs, img_8u))

            # Sort based on the current index_number from the dataframe to match plot order
            temp_matching.sort(key=lambda x: x[0])
            # Final list in correct format (original_fidx, fname, dobs, tobs, img_8u)
            matching_images = [(fidx_orig, fname, dobs, tobs, img_8u) for _, fidx_orig, fname, dobs, tobs, img_8u in temp_matching]


        # Setup figure grid
        n_img = len(matching_images)
        img_cols = 8 # Max images per row
        img_rows = math.ceil(n_img / img_cols) if n_img > 0 else 0
        fig_width = 25
        fig_height = 6 + (3 * img_rows) # Adjust height: 6 for main plot + 3 per image row

        fig = plt.figure(figsize=(fig_width, fig_height))
        # GridSpec: 1 row for scatter plot, 'img_rows' for images
        gs_rows = 1 + img_rows
        height_ratios = [6] + ([3] * img_rows) if img_rows > 0 else [6]
        gs = GridSpec(nrows=gs_rows, ncols=img_cols,
                      height_ratios=height_ratios,
                      figure=fig)

        # --- Scatter Plot ---
        ax_scatter = fig.add_subplot(gs[0, :]) # Scatter plot spans all columns in the first row

        sine_fit_successful = False # Reset for each plot
        show_legend = False # Determine if legend should be shown

        if is_noise_plot:
             # Plot noise points
             ax_scatter.scatter(
                 sub_df["days_since_start"], sub_df["f_avg"],
                 color='gray', alpha=0.5, s=POINT_SIZE * 0.8, label="Noise"
             )
             show_legend = True # Show legend for noise plot
             # Annotate noise points
             for _, row in sub_df.iterrows():
                 ax_scatter.text(
                        row["days_since_start"], row["f_avg"], str(row["index_number"]),
                        fontsize=ANNOTATION_FONTSIZE * 0.8, color='gray', ha='left', va='bottom')
        else:
            # Plot valid cluster points
            colors = plt.cm.viridis(np.linspace(0, 0.9, max(1, len(label_list)))) # Use viridis, avoid yellow end

            for i_lbl, lbl2 in enumerate(label_list):
                cdata = sub_df[sub_df["cluster"] == lbl2]
                if not cdata.empty:
                    ax_scatter.scatter(
                        cdata["days_since_start"], cdata["f_avg"],
                        color=colors[i_lbl % len(colors)], alpha=0.8, s=POINT_SIZE,
                        label=f"Cluster {lbl2}" # Label each cluster
                    )
                    show_legend = True # Show legend if plotting clusters
                    # Annotate points
                    for _, row in cdata.iterrows():
                        ax_scatter.text(
                            row["days_since_start"], row["f_avg"], str(row["index_number"]),
                            fontsize=ANNOTATION_FONTSIZE, ha='left', va='bottom')

            # --- ADDED: Sine Wave Fitting and Plotting (only for non-noise) ---
            if len(sub_df) >= 4: # Need at least 4 points to fit 4 parameters
                x_data = sub_df["days_since_start"].values
                y_data = sub_df["f_avg"].values
                x_span = x_data.max() - x_data.min() if len(x_data) > 1 else 0
                y_span = y_data.max() - y_data.min() if len(y_data) > 1 else 0

                # Avoid fitting if points are too close, constant, or span is too small
                if x_span > 1e-6 and y_span > 1e-6:
                    print(f"   => Attempting sine fit for {plot_title}...")
                    try:
                        # Initial guesses - Refined
                        initial_amplitude = y_span / 2.0
                        initial_vertical_offset = np.median(y_data) # Median is robust to outliers
                        # Frequency guess: More robust - check if span is large enough for a cycle
                        # Assume at least half a cycle might be present if span > 0.1 days? Adjust as needed.
                        initial_frequency = (np.pi / x_span) if x_span > 0.1 else 1.0
                        initial_phase_shift = 0 # Keep phase guess simple

                        initial_guesses = [initial_amplitude, initial_frequency, initial_phase_shift, initial_vertical_offset]
                        # Bounds to help guide the fit and prevent nonsensical results
                        bounds = (
                            [0, 0, -np.inf, -np.inf], # Lower bounds: Amplitude >= 0, Frequency >= 0
                            [y_span * 1.5, np.inf, np.inf, np.inf] # Upper bounds: Limit amplitude guess reasonably
                        )

                        popt, pcov = curve_fit(sine_func, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=5000)

                        # Check quality of fit using covariance: Check diagonal elements
                        if np.any(np.isinf(np.diag(pcov))):
                             print(f"   WARNING: Sine fit converged for {plot_title}, but covariance indicates high uncertainty. Fit may be unreliable.")
                             # Decide whether to plot uncertain fits or not - here we'll plot but warn
                             pass # Continue to plot

                        # Generate points for the fitted curve
                        x_fit = np.linspace(x_data.min(), x_data.max(), 200) # Smooth curve over data range
                        y_fit = sine_func(x_fit, *popt)

                        # Plot the fitted sine wave
                        ax_scatter.plot(x_fit, y_fit, color='red', linestyle='--', linewidth=2, label='Sine Fit')
                        print(f"   Sine fit successful. Params (A, ω, φ, D): {np.round(popt, 3)}")
                        sine_fit_successful = True
                        show_legend = True # Ensure legend is shown if fit is added

                    except RuntimeError as e:
                        print(f"   WARNING: Sine fit failed for {plot_title}. Reason: Optimal parameters not found. ({e})")
                    except ValueError as e:
                         print(f"   WARNING: Sine fit failed for {plot_title}. Reason: {e}") # Often due to bounds/input data mismatch
                    except Exception as e:
                        print(f"   WARNING: An unexpected error occurred during sine fit for {plot_title}: {e}")
                else:
                    print(f"   => Skipping sine fit for {plot_title}: Data points have minimal span in X ({x_span:.2e}) or Y ({y_span:.2e}).")
            else:
                print(f"   => Skipping sine fit for {plot_title}: Not enough data points (need >= 4). Found {len(sub_df)}.")
            # --- END ADDED SINE FIT SECTION ---

        # --- Final Scatter Plot Formatting ---
        ax_scatter.set_title(plot_title, fontsize=18)

        # --- MODIFIED LEGEND DISPLAY ---
        if show_legend: # Show legend if noise, merged clusters, or successful fit
             # Place legend outside plot area to avoid overlap, adjust position slightly if needed
             ax_scatter.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0.1)
        # --- END MODIFIED LEGEND ---

        # Set plot limits with margins (DO NOT CHANGE THIS PART's LOGIC for limits)
        t_min_data = sub_df["days_since_start"].min()
        t_max_data = sub_df["days_since_start"].max()
        t_span_data = t_max_data - t_min_data if len(sub_df) > 1 else 0
        t_margin = margin_scale * t_span_data if t_span_data > 1e-6 else 0.1
        t_min_lim = t_min_data - t_margin
        t_max_lim = t_max_data + t_margin
        ax_scatter.set_xlim(t_min_lim, t_max_lim)

        favg_min_data = sub_df["f_avg"].min()
        favg_max_data = sub_df["f_avg"].max()
        y_span_data = favg_max_data - favg_min_data if len(sub_df) > 1 else 0
        y_margin = margin_scale * y_span_data if y_span_data > 1e-6 else 0.5
        favg_min_lim = favg_min_data - y_margin * 1.1 # Add slightly more margin below
        favg_max_lim = favg_max_data + y_margin * 1.1 # Add slightly more margin above
        ax_scatter.set_ylim(favg_min_lim, favg_max_lim)

        # Labels and ticks (DO NOT CHANGE THIS PART's LOGIC for labels/ticks)
        ax_scatter.set_xlabel("Days Since Start")
        ax_scatter.set_ylabel("Average Flux")
        # Dynamic tick locator based on span
        t_locator = mticker.MaxNLocator(nbins=8, prune='both') # Auto ticks for time
        ax_scatter.xaxis.set_major_locator(t_locator)
        ax_scatter.ticklabel_format(style='plain', useOffset=False, axis='x')
        ax_scatter.ticklabel_format(style='plain', useOffset=False, axis='y')
        ax_scatter.yaxis.set_major_locator(mticker.MaxNLocator(nbins='auto', prune='both')) # Auto Y ticks
        ax_scatter.grid(True, linestyle=':', alpha=0.6) # Add light grid

        # Rotate x-axis labels if they might overlap
        plt.setp(ax_scatter.get_xticklabels(), rotation=30, ha="right")


        # --- Image Subplots ---
        if n_img > 0: # Only relevant if not noise plot and images are available/matched
            print(f"   => Plotting {n_img} associated image(s)...")
            # Determine image value range for consistent contrast (optional)
            # all_img_data_values = []
            # for _, _, _, _, img in matching_images:
            #      if img is not None: all_img_data_values.extend(img.flatten())
            # if all_img_data_values:
            #     vmin, vmax = np.percentile(all_img_data_values, [1, 99]) # Example contrast stretch
            # else:
            #     vmin, vmax = 0, 255

            for i_img, (fidx2, fname2, dobs2, tobs2, img_8u) in enumerate(matching_images):
                 # Check if img_8u is valid before proceeding
                 if img_8u is None or img_8u.size <= 1:
                     print(f"      Skipping display for image index {fidx2} (fname: {fname2}) - Invalid image data.")
                     continue # Skip this iteration if image data is invalid

                 row_i = 1 + (i_img // img_cols) # Start from the second row of GridSpec
                 col_i = i_img % img_cols
                 if row_i < gs_rows: # Check bounds just in case
                     ax_img = fig.add_subplot(gs[row_i, col_i])
                     # Use vmin/vmax for consistent contrast if calculated
                     # ax_img.imshow(img_8u, cmap='gray', origin='lower', vmin=vmin, vmax=vmax)
                     ax_img.imshow(img_8u, cmap='gray', origin='lower') # Default contrast

                     # Add text overlay inside the image (using original fidx2)
                     ax_img.text(
                         IMAGE_NUMBER_X_OFFSET,    # X position (pixels from left)
                         #img_8u.shape[0] - IMAGE_NUMBER_Y_OFFSET, # Y pos (pixels from BOTTOM) - Adjusted from original
                         IMAGE_NUMBER_Y_OFFSET, # Y position (pixels from TOP) - Reverted to original request
                         f"{fidx2}",              # Text to display (Original file index)
                         color='white',           # Text color
                         fontsize=IMAGE_NUMBER_FONTSIZE,
                         ha='left',               # Horizontal alignment
                         #va='bottom',             # Vertical alignment (relative to Y pos from bottom)
                         va='top',                # Vertical alignment (relative to Y pos from top)
                         bbox=dict(facecolor='black', alpha=0.6, pad=0.2, boxstyle='round,pad=0.2') # Background
                     )
                     ax_img.axis("off") # Hide axes for the image subplot
        # Note: Removed the separate "No matching images" print here, handled earlier/implicitly

        # --- Final Figure Adjustments ---
        # Adjust layout to prevent labels/titles overlapping
        # tight_layout might interfere with external legend, use subplots_adjust
        # Adjust 'right' based on whether legend is shown
        right_margin = 0.82 if show_legend else 0.95
        fig.subplots_adjust(left=0.08, right=right_margin, bottom=0.15, top=0.92, wspace=0.1, hspace=0.1)

        plt.show() # Display the combined figure

        # Optional: Print list of files shown
        if not is_noise_plot and filename_col_present and have_cropped_images and n_img > 0:
            print(f"   => Images shown correspond to files (Sorted by time within cluster):")
            for (fidx2, xxfn, ddt, ddt2, _) in matching_images:
                print(f"       * File #{fidx2} => {xxfn} => {ddt} {ddt2}")

        print("   ----------------------------------------------------\\n")

    print(f"===== End of Plotting Process ({len(plot_items)} plots generated) =====")


# =============================================================================
#                       EXAMPLE USAGE SECTION
# =============================================================================
# IMPORTANT: Replace placeholder data generation with your actual data loading
# Ensure df_flux_merged and cropped_images are defined before this call

# --- GENERATE PLACEHOLDER DATA (if needed for testing) ---
# Placeholder inputs so the example call below can run without real data.
# The module is imported under a private alias on purpose: a bare
# `import datetime` would rebind the name `datetime` (imported as the class
# in cell 1 via `from datetime import datetime`) to the module for every
# cell that runs afterwards.
import datetime as _dt
if 'df_flux_merged' not in locals() or 'cropped_images' not in locals():
    print("\n--- WARNING: Generating Placeholder Data for Example ---")
    # Placeholder df_flux_merged DataFrame
    base_time = _dt.datetime(2024, 1, 1)
    # First cluster: 20 points roughly every half day, with a little jitter
    time_points = [base_time + _dt.timedelta(days=x*0.5 + np.random.rand()*0.1) for x in range(20)]
    # Second cluster: 15 points starting 15 days in
    time_points.extend([base_time + _dt.timedelta(days=15+x*0.4 + np.random.rand()*0.1) for x in range(15)])
    # A few points scattered over the first 25 days
    time_points.extend([base_time + _dt.timedelta(days=np.random.uniform(0, 25)) for x in range(5)])
    start_time = min(time_points)
    days_since_start = [(t - start_time).total_seconds() / (24 * 3600) for t in time_points]
    # Noisy sine before day 12, a different noisy sine between days 14 and 22,
    # uniform random values everywhere else.
    f_avg = [10 + 1.5*np.sin(2*np.pi*d/5 + 1) + np.random.normal(0, 0.2) if d < 12 else
             (8 - 1.0*np.sin(2*np.pi*d/4 + 3) + np.random.normal(0, 0.15) if d > 14 and d < 22 else
              np.random.uniform(7, 12))
             for d in days_since_start]
    filenames = [f"image_{i+1:03d}.fits" for i in range(len(time_points))]
    df_flux_merged = pd.DataFrame({
        'datetime': time_points, 'f_avg': f_avg,
        'days_since_start': days_since_start, 'filename': filenames
    })
    # Placeholder cropped_images list: (original_fidx, filename, date, time, image_array)
    cropped_images = []
    for i, row in df_flux_merged.iterrows():
        img_array = (np.random.rand(50, 50) * 255).astype(np.uint8)
        date_obs = row['datetime'].strftime('%Y-%m-%d')
        time_obs = row['datetime'].strftime('%H:%M:%S')
        cropped_images.append((i + 1, row['filename'], date_obs, time_obs, img_array))
    print("--- Placeholder Data Generated ---\n")
# --- END PLACEHOLDER DATA ---


# --- Run the flux clustering / plotting pass ---
# eps, min_samples and merges depend on the data set: tune them after looking
# at the cluster summary printed by the function.
run_dbscan_and_plot_with_images(
    cropped_images=cropped_images,  # image tuples: (original_fidx, fname, date, time, img)
    df_flux=df_flux_merged,         # table to cluster and plot
    merges=None,                    # e.g. [[1, 2], [4, 5]] to draw those clusters together
    min_samples=3,                  # minimum points to form a cluster (ADJUST THIS)
    eps=1.0,                        # cluster radius in 'days_since_start' (ADJUST THIS)
)
# =============================================================================

print("=== END CELL 11A (with Sine Fit) ===\n\n\n")

In [None]:
###############################################################################
#                             CELL 11B
###############################################################################
print("=== BEGIN CELL 11B: DBSCAN + Image Plotting (Delta) ===\n")

def run_dbscan_and_plot_with_images_delta(
    df_delta,
    eps=0.25,
    min_samples=3,
    margin_scale=0.1,
    use_merge_plots=True,
    merges=None,
    cropped_images=None
):
    """
    Cluster observations in time with DBSCAN and plot delta_au per cluster.

    One figure is drawn per cluster (or per merged group of clusters): a
    scatter of "delta_au" against "days_since_start" in the top row, and the
    cropped images whose filename occurs in that cluster underneath, eight
    per row.

    Parameters
    ----------
    df_delta : pandas.DataFrame
        Needs the columns "datetime", "delta_au" and "days_since_start".
        A "filename" column is optional and only used to match rows against
        ``cropped_images``. The frame is copied before columns are added.
    eps, min_samples
        Forwarded to ``DBSCAN``; clustering uses "days_since_start" only.
    margin_scale : float
        Fraction of the data span added as padding on each side of an axis.
    use_merge_plots : bool
        Kept so existing calls keep working; this function never reads it.
    merges : list of iterables of int, optional
        Groups of cluster numbers (1-based, as printed in the summary) to
        draw together in a single figure.
    cropped_images : list of tuples, optional
        Tuples of (file index, filename, date, time, image).

    Returns
    -------
    None
        Results are reported through print() and matplotlib figures. The
        function returns early, after printing the reason, when a required
        column is missing, when DBSCAN raises, or when no cluster is found.
    """
    images_per_row = 8
    # X ticks are placed every 0.1 day as long as that needs at most this many
    # ticks; wider plots fall back to automatic spacing (see below).
    x_tick_step = 0.1
    max_fixed_x_ticks = 100

    print("\n==========================================")
    print("    DBSCAN + Image Plotting (Delta)")
    print("==========================================\n")

    required_cols = ["datetime", "delta_au", "days_since_start"]
    for c in required_cols:
        if c not in df_delta.columns:
            print(f"ERROR: Missing column '{c}' in df_delta! Cannot proceed.")
            return

    print("STEP 1: Checking df_delta sample (first 5 rows):")
    print(df_delta.head(5))

    print("\nSTEP 2: Checking for NaN in 'delta_au' or 'days_since_start':")
    nan_count_delta = df_delta["delta_au"].isna().sum()
    nan_count_days  = df_delta["days_since_start"].isna().sum()
    if nan_count_delta > 0 or nan_count_days > 0:
        print(f"WARNING: Found {nan_count_delta} NaN in 'delta_au' and {nan_count_days} in 'days_since_start'.")
    else:
        print("No NaN values found.")

    print("\nSTEP 3: Range check for 'days_since_start':")
    days_min = df_delta["days_since_start"].min()
    days_max = df_delta["days_since_start"].max()
    print(f"days_since_start => min: {days_min:.3f}, max: {days_max:.3f}")

    # Work on a copy so the caller's frame does not gain the helper columns.
    df_delta = df_delta.copy()
    df_delta["index_number"] = np.arange(len(df_delta)) + 1

    print(f"\nSTEP 4: Running DBSCAN (eps={eps}, min_samples={min_samples})...")
    X = df_delta["days_since_start"].values.reshape(-1, 1)
    try:
        db = DBSCAN(eps=eps, min_samples=min_samples)
        cluster_labels = np.asarray(db.fit_predict(X))
    except Exception as e:
        print(f"ERROR: DBSCAN failed: {e}")
        return

    # DBSCAN numbers clusters from 0 and uses -1 for noise; shift the real
    # clusters to start at 1 and leave -1 untouched.
    df_delta["cluster"] = np.where(cluster_labels >= 0, cluster_labels + 1, cluster_labels)
    clusters = np.unique(df_delta.loc[df_delta["cluster"] != -1, "cluster"])

    print("DBSCAN completed.")
    print(f"Total data points: {len(df_delta)}")
    n_noise = int(np.sum(cluster_labels == -1))
    print(f"Noise points: {n_noise}")
    print(f"Valid cluster labels (excluding -1): {clusters}")

    print("\nSTEP 5: Summarizing clusters:")
    if len(clusters) == 0:
        print("No valid clusters (all noise?).")
        return

    for cluster_id in clusters:
        cdata = df_delta[df_delta["cluster"] == cluster_id]
        cmin = cdata["days_since_start"].min()
        cmax = cdata["days_since_start"].max()
        csize = len(cdata)
        print(f"  Cluster {cluster_id}: size={csize}, time-range=({cmin:.2f}, {cmax:.2f})")

    if merges is None:
        merges = []
    merges = [set(m) for m in merges]

    # One plot item per figure: a merge group the first time one of its
    # members is met, otherwise the cluster on its own.
    plot_items = []
    used = set()
    for lbl in clusters:
        if lbl in used:
            continue
        belongs_to = next((mset for mset in merges if lbl in mset), None)
        if belongs_to is not None:
            plot_items.append(belongs_to)
            used.update(belongs_to)
        else:
            plot_items.append({lbl})
            used.add(lbl)

    print("\nSTEP 6: Creating cluster subplots + images in one figure...")

    filename_col_present = "filename" in df_delta.columns
    if not filename_col_present:
        print("NOTE: 'filename' column not in df_delta. Will skip image matching.")

    have_cropped_images = (cropped_images is not None) and (len(cropped_images) > 0)
    if not have_cropped_images:
        print("NOTE: 'cropped_images' is None or empty. Will skip image display.\n")

    for item in plot_items:
        # Plain ints so printed lists and titles read [11, 12, 13] regardless
        # of how NumPy chooses to repr its scalar types.
        label_list = sorted(int(lbl) for lbl in item)
        sub_df = df_delta[df_delta["cluster"].isin(label_list)].copy()
        if len(sub_df) == 0:
            print(f"No data for clusters {label_list}, skipping.")
            continue

        snippet_data = sub_df.head(5)
        print(f"\n   => Plotting cluster(s) {label_list}, total points: {len(sub_df)}")
        print("   First 5 lines (with index_number):")
        print(snippet_data[["index_number","days_since_start","delta_au"]].to_string(index=False))

        matching_images = []
        if filename_col_present and have_cropped_images:
            valid_filenames = set(sub_df["filename"].unique())
            matching_images = [
                (fidx,fname,dobs,tobs,img_8u)
                for (fidx,fname,dobs,tobs,img_8u) in cropped_images
                if fname in valid_filenames
            ]

        n_img = len(matching_images)
        img_rows = math.ceil(n_img / images_per_row)
        fig_width = 25
        fig_height = 6 + 3 * img_rows

        # Row 0 holds the scatter plot across all columns; image rows follow.
        fig = plt.figure(figsize=(fig_width, fig_height))
        gs = GridSpec(nrows=img_rows + 1, ncols=images_per_row,
                      height_ratios=[6] + [3] * img_rows,
                      figure=fig)

        ax_scatter = fig.add_subplot(gs[0, :])
        colors = plt.cm.tab10(np.linspace(0,1,len(label_list)))

        for i, lbl2 in enumerate(label_list):
            cdata = sub_df[sub_df["cluster"] == lbl2]
            ax_scatter.scatter(
                cdata["days_since_start"],
                cdata["delta_au"],
                color=colors[i],
                alpha=0.75,
                s=POINT_SIZE,                # <-- SIZE OF DATA POINTS
                label=f"Cluster {lbl2}"
            )
            # Label every point with its 1-based row number in df_delta
            for _, row in cdata.iterrows():
                ax_scatter.text(
                    row["days_since_start"],
                    row["delta_au"],
                    str(row["index_number"]),
                    fontsize=ANNOTATION_FONTSIZE,  # <-- FONT SIZE OF POINT ANNOTATION
                    ha='left',
                    va='bottom'
                )

        if len(label_list)>1:
            ax_scatter.set_title(f"Merged Clusters {label_list}", fontsize=14)
        else:
            ax_scatter.set_title(f"Cluster {label_list[0]}", fontsize=14)

        # x-limits: pad by margin_scale of the span, or a fixed 0.1 when all
        # points share the same time
        t_min = sub_df["days_since_start"].min()
        t_max = sub_df["days_since_start"].max()
        span = t_max - t_min
        if span <= 0:
            t_min -= 0.1
            t_max += 0.1
        else:
            margin = margin_scale * span
            t_min -= margin
            t_max += margin
        ax_scatter.set_xlim(t_min, t_max)

        # y-limits: same idea, with a fixed 0.5 pad for a flat series
        d_min = sub_df["delta_au"].min()
        d_max = sub_df["delta_au"].max()
        y_span = d_max - d_min
        if y_span <= 0:
            d_min -= 0.5
            d_max += 0.5
        else:
            y_margin = margin_scale * y_span
            d_min -= y_margin
            d_max += y_margin
        ax_scatter.set_ylim(d_min, d_max)

        ax_scatter.set_xlabel("Days Since Start")
        ax_scatter.set_ylabel("Delta (AU)")
        if len(label_list)>1:
            ax_scatter.legend()

        # Ticks every 0.1 day on X while that stays manageable. A merged group
        # spanning many days would otherwise request hundreds or thousands of
        # ticks, so wide plots get automatically spaced ticks instead.
        if (t_max - t_min) / x_tick_step <= max_fixed_x_ticks:
            ax_scatter.xaxis.set_major_locator(mticker.MultipleLocator(x_tick_step))
        else:
            ax_scatter.xaxis.set_major_locator(mticker.MaxNLocator(nbins=20))
        # No scientific notation / offsets on either axis
        ax_scatter.ticklabel_format(style='plain', useOffset=False, axis='x')
        ax_scatter.ticklabel_format(style='plain', useOffset=False, axis='y')
        ax_scatter.yaxis.set_major_locator(mticker.MaxNLocator(nbins='auto'))

        # Place images
        for i_img, (fidx2, fname2, dobs2, tobs2, img_8u) in enumerate(matching_images):
            # Same validity check as the flux version of this function: an
            # entry without usable pixel data is reported and its slot is
            # left empty instead of crashing imshow().
            if img_8u is None or np.size(img_8u) <= 1:
                print(f"      Skipping display for image index {fidx2} (fname: {fname2}) - Invalid image data.")
                continue
            row_i = 1 + (i_img // images_per_row)
            col_i = i_img % images_per_row
            ax_img = fig.add_subplot(gs[row_i, col_i])
            ax_img.imshow(img_8u, cmap='gray', origin='lower')
            ax_img.set_title(f"File #{fidx2}", fontsize=FILE_NUMBER_FONTSIZE)  # <-- FILE # FONT SIZE
            ax_img.axis("off")

        plt.tight_layout()
        plt.show()

        if filename_col_present and have_cropped_images:
            print(f"   => Found {len(matching_images)} image(s) for clusters {label_list}.")
            for (xxid, xxfn, xxdt, xxt2, _) in matching_images:
                print(f"       * File #{xxid} => {xxfn} => {xxdt} {xxt2}")

        print("   ----------------------------------------------------\n")

    print("===== End of DBSCAN + Image Plotting (Delta) (11B) =====\n")


# Run the 11B delta_au pass on df_delta_merged.
# DBSCAN clusters on 'days_since_start', so eps is a time distance in days;
# the merges entry draws clusters 11, 12 and 13 together in one figure.
run_dbscan_and_plot_with_images_delta(
    df_delta=df_delta_merged,
    cropped_images=cropped_images,
    merges=[[11, 12, 13]],
    use_merge_plots=True,
    margin_scale=0.1,
    min_samples=1,
    eps=0.5,
)
print("=== END CELL 11B ===\n\n\n")

In [None]:
###############################################################################
#                             CELL 11C
###############################################################################
print("=== BEGIN CELL 11C: DBSCAN + Image Plotting (Surface Brightness) ===\n")

def run_dbscan_and_plot_with_images_sb(
    df_sb,
    eps=0.25,
    min_samples=3,
    margin_scale=0.1,
    use_merge_plots=True,
    merges=None,
    cropped_images=None
):
    """
    Cluster observations in time with DBSCAN and plot s_brt per cluster.

    One figure is drawn per cluster (or per merged group of clusters): a
    scatter of "s_brt" against "days_since_start" in the top row, and the
    cropped images whose filename occurs in that cluster underneath, eight
    per row.

    Parameters
    ----------
    df_sb : pandas.DataFrame
        Needs the columns "datetime", "s_brt" and "days_since_start".
        A "filename" column is optional and only used to match rows against
        ``cropped_images``. The frame is copied before columns are added.
    eps, min_samples
        Forwarded to ``DBSCAN``; clustering uses "days_since_start" only.
    margin_scale : float
        Fraction of the data span added as padding on each side of an axis.
    use_merge_plots : bool
        Kept so existing calls keep working; this function never reads it.
    merges : list of iterables of int, optional
        Groups of cluster numbers (1-based, as printed in the summary) to
        draw together in a single figure.
    cropped_images : list of tuples, optional
        Tuples of (file index, filename, date, time, image).

    Returns
    -------
    None
        Results are reported through print() and matplotlib figures. The
        function returns early, after printing the reason, when a required
        column is missing, when DBSCAN raises, or when no cluster is found.
    """
    images_per_row = 8
    # X ticks are placed every 0.1 day as long as that needs at most this many
    # ticks; wider plots fall back to automatic spacing (see below).
    x_tick_step = 0.1
    max_fixed_x_ticks = 100

    print("\n==========================================")
    print("    DBSCAN + Image Plotting for S_BRT")
    print("==========================================\n")

    required_cols = ["datetime", "s_brt", "days_since_start"]
    for c in required_cols:
        if c not in df_sb.columns:
            print(f"ERROR: Missing column '{c}' in df_sb! Cannot proceed.")
            return

    print("STEP 1: Checking df_sb sample (first 5 rows):")
    print(df_sb.head(5))

    nan_count_sbrt = df_sb["s_brt"].isna().sum()
    nan_count_days = df_sb["days_since_start"].isna().sum()
    if nan_count_sbrt > 0 or nan_count_days > 0:
        print(f"WARNING: Found {nan_count_sbrt} NaN in 's_brt' and {nan_count_days} in 'days_since_start'.")
    else:
        print("No NaN values found.")

    days_min = df_sb["days_since_start"].min()
    days_max = df_sb["days_since_start"].max()
    print(f"\nTime range => min: {days_min:.3f}, max: {days_max:.3f}")

    # Work on a copy so the caller's frame does not gain the helper columns.
    df_sb = df_sb.copy()
    df_sb["index_number"] = np.arange(len(df_sb)) + 1

    print(f"\nRunning DBSCAN (eps={eps}, min_samples={min_samples}) ...")
    X = df_sb["days_since_start"].values.reshape(-1,1)
    # Handled the same way as in the delta version (11B): if DBSCAN rejects the
    # input (for example the NaN case warned about above), report it and stop
    # instead of letting the exception abort the cell.
    try:
        db = DBSCAN(eps=eps, min_samples=min_samples)
        cluster_labels = np.asarray(db.fit_predict(X))
    except Exception as e:
        print(f"ERROR: DBSCAN failed: {e}")
        return

    # DBSCAN numbers clusters from 0 and uses -1 for noise; shift the real
    # clusters to start at 1 and leave -1 untouched.
    df_sb["cluster"] = np.where(cluster_labels >= 0, cluster_labels + 1, cluster_labels)
    clusters = np.unique(df_sb.loc[df_sb["cluster"] != -1, "cluster"])

    print("DBSCAN done. Total points:", len(df_sb))
    n_noise = int(np.sum(cluster_labels == -1))
    print(f"Noise points: {n_noise}")
    print("Valid cluster labels:", clusters)

    if len(clusters) == 0:
        print("No valid clusters.")
        return

    print("\nCluster summaries:")
    for cluster_id in clusters:
        cdata = df_sb[df_sb["cluster"] == cluster_id]
        cmin = cdata["days_since_start"].min()
        cmax = cdata["days_since_start"].max()
        print(f"  Cluster {cluster_id}: {len(cdata)} points, time-range=({cmin:.2f},{cmax:.2f})")

    if merges is None:
        merges = []
    merges = [set(m) for m in merges]

    # One plot item per figure: a merge group the first time one of its
    # members is met, otherwise the cluster on its own.
    plot_items = []
    used = set()
    for lbl in clusters:
        if lbl in used:
            continue
        belongs_to = next((mset for mset in merges if lbl in mset), None)
        if belongs_to is not None:
            plot_items.append(belongs_to)
            used.update(belongs_to)
        else:
            plot_items.append({lbl})
            used.add(lbl)

    filename_col_present = ("filename" in df_sb.columns)
    have_cropped_images = (cropped_images is not None) and (len(cropped_images) > 0)

    for item in plot_items:
        # Plain ints so printed lists and titles read [11, 12, 13] regardless
        # of how NumPy chooses to repr its scalar types.
        label_list = sorted(int(lbl) for lbl in item)
        sub_df = df_sb[df_sb["cluster"].isin(label_list)].copy()
        if sub_df.empty:
            continue

        print(f"\nPlotting cluster(s) {label_list}, total points: {len(sub_df)}")
        print("First 5 lines:")
        print(sub_df[["index_number","days_since_start","s_brt"]].head(5))

        matching_images = []
        if filename_col_present and have_cropped_images:
            valid_filenames = set(sub_df["filename"].unique())
            matching_images = [
                (fidx,fname,dobs,tobs,img_8u)
                for (fidx,fname,dobs,tobs,img_8u) in cropped_images
                if fname in valid_filenames
            ]

        n_img = len(matching_images)
        img_rows = math.ceil(n_img / images_per_row)
        # Row 0 holds the scatter plot across all columns; image rows follow.
        fig = plt.figure(figsize=(25, 6 + 3 * img_rows))
        gs = GridSpec(nrows=img_rows + 1, ncols=images_per_row,
                      height_ratios=[6] + [3] * img_rows,
                      figure=fig)

        ax_scatter = fig.add_subplot(gs[0,:])
        colors = plt.cm.tab10(np.linspace(0,1,len(label_list)))

        for i, lbl3 in enumerate(label_list):
            cdata = sub_df[sub_df["cluster"] == lbl3]
            ax_scatter.scatter(
                cdata["days_since_start"],
                cdata["s_brt"],
                color=colors[i],
                alpha=0.75,
                s=POINT_SIZE,              # <-- SIZE OF DATA POINTS
                label=f"Cluster {lbl3}"
            )
            # Label every point with its 1-based row number in df_sb
            for _, row in cdata.iterrows():
                ax_scatter.text(
                    row["days_since_start"],
                    row["s_brt"],
                    str(row["index_number"]),
                    fontsize=ANNOTATION_FONTSIZE,  # <-- FONT SIZE OF POINT ANNOTATION
                    ha='left',
                    va='bottom'
                )

        if len(label_list)>1:
            ax_scatter.set_title(f"Merged Clusters {label_list} (SB vs Time)", fontsize=14)
        else:
            ax_scatter.set_title(f"Cluster {label_list[0]} (SB vs Time)", fontsize=14)

        # X-limits: pad by margin_scale of the span, or a fixed 0.1 when all
        # points share the same time
        t_min = sub_df["days_since_start"].min()
        t_max = sub_df["days_since_start"].max()
        span = t_max - t_min
        if span <= 0:
            t_min -= 0.1
            t_max += 0.1
        else:
            t_min -= margin_scale * span
            t_max += margin_scale * span
        ax_scatter.set_xlim(t_min, t_max)

        # Y-limits: same idea for the brightness axis
        sb_min = sub_df["s_brt"].min()
        sb_max = sub_df["s_brt"].max()
        sb_span = sb_max - sb_min
        if sb_span <= 0:
            sb_min -= 0.1
            sb_max += 0.1
        else:
            sb_min -= margin_scale * sb_span
            sb_max += margin_scale * sb_span
        ax_scatter.set_ylim(sb_min, sb_max)

        ax_scatter.set_xlabel("Days Since Start")
        ax_scatter.set_ylabel("Surface Brightness (s_brt)")
        if len(label_list)>1:
            ax_scatter.legend()

        # Ticks every 0.1 day on X while that stays manageable. A merged group
        # spanning many days would otherwise request hundreds or thousands of
        # ticks, so wide plots get automatically spaced ticks instead.
        if (t_max - t_min) / x_tick_step <= max_fixed_x_ticks:
            ax_scatter.xaxis.set_major_locator(mticker.MultipleLocator(x_tick_step))
        else:
            ax_scatter.xaxis.set_major_locator(mticker.MaxNLocator(nbins=20))
        # No scientific notation / offsets on either axis
        ax_scatter.ticklabel_format(style='plain', useOffset=False, axis='x')
        ax_scatter.ticklabel_format(style='plain', useOffset=False, axis='y')
        ax_scatter.yaxis.set_major_locator(mticker.MaxNLocator(nbins='auto'))

        # Place images
        for i_img, (fidx3, fname3, dobs3, tobs3, img_8u) in enumerate(matching_images):
            # Same validity check as the flux version of this function: an
            # entry without usable pixel data is reported and its slot is
            # left empty instead of crashing imshow().
            if img_8u is None or np.size(img_8u) <= 1:
                print(f"      Skipping display for image index {fidx3} (fname: {fname3}) - Invalid image data.")
                continue
            row_i = 1 + (i_img // images_per_row)
            col_i = i_img % images_per_row
            ax_img = fig.add_subplot(gs[row_i,col_i])
            ax_img.imshow(img_8u, cmap='gray', origin='lower')
            ax_img.set_title(f"File #{fidx3}", fontsize=FILE_NUMBER_FONTSIZE)  # <-- FILE # FONT SIZE
            ax_img.axis("off")

        plt.tight_layout()
        plt.show()

        print(f" => Found {len(matching_images)} matching image(s).")
        for (xid, xfn, xdt, xdt2, _) in matching_images:
            print(f"     * File #{xid} => {xfn} => {xdt} {xdt2}")

    print("\n===== End of DBSCAN + Image Plotting for S_BRT (Cell 11C) =====\n")


# Run the 11C surface-brightness pass on df_sb_merged.
# DBSCAN clusters on 'days_since_start', so eps is a time distance in days;
# the merges entry draws clusters 11, 12 and 13 together in one figure.
run_dbscan_and_plot_with_images_sb(
    df_sb=df_sb_merged,
    cropped_images=cropped_images,
    merges=[[11, 12, 13]],
    min_samples=1,
    eps=0.5,
)

print("=== END CELL 11C ===\n")

# ====================== END OF GRAND CELL ======================

CELL 12 - an overlay of the three graphs made in cell 11 (f_avg, delta_au and s_brt), all in one big cell, to see how the points move relative to each other over time and to spot any correlations

In [None]:
# CELL 12: Triple-Axis Overlay: f_avg, delta_au, s_brt vs. Time
# Requirements:
#   df_all MUST have columns: ['datetime','days_since_start','f_avg','delta_au','s_brt','filename'(opt)]
#   'filename' aligns with cropped_images to see images.

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.cluster import DBSCAN

# Build the combined table for the overlay: inner-join flux, delta and
# surface brightness on their shared 'datetime' column, so only timestamps
# present in all three tables survive.
# NOTE(review): the cell 11 calls use df_flux_merged / df_delta_merged /
# df_sb_merged - confirm that df_flux, df_delta and df_sb are the intended
# inputs here.
df_all = pd.merge(
    pd.merge(
        df_flux[["datetime", "days_since_start", "f_avg", "filename"]],
        df_delta[["datetime", "delta_au"]],
        on="datetime",
        how="inner",
    ),
    df_sb[["datetime", "s_brt"]],
    on="datetime",
    how="inner",
)

def run_dbscan_and_plot_with_images_triple(
    df_all,
    eps=0.25,
    min_samples=3,
    margin_scale=0.1,
    use_merge_plots=True,
    merges=None,
    cropped_images=None
):
    """
    Cluster observations in time and overlay f_avg, delta_au and s_brt.

    One figure is drawn per cluster (or per merged group of clusters). The
    top row shows the three series against "days_since_start", each on its
    own y-axis (left axis plus two twinned right axes); the cropped images
    whose filename occurs in that cluster are laid out underneath, eight per
    row.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Needs the columns "datetime", "days_since_start", "f_avg",
        "delta_au" and "s_brt". A "filename" column is optional and only
        used to match rows against ``cropped_images``. The frame is copied
        before columns are added.
    eps, min_samples
        Forwarded to ``DBSCAN``; clustering uses "days_since_start" only.
    margin_scale : float
        Fraction of the time span added as padding on each side of the x-axis.
    use_merge_plots : bool
        Kept so existing calls keep working; this function never reads it.
    merges : list of iterables of int, optional
        Groups of cluster numbers (1-based, as printed in the summary) to
        draw together in a single figure.
    cropped_images : list of tuples, optional
        Tuples of (file index, filename, date, time, image).

    Returns
    -------
    None
        Results are reported through print() and matplotlib figures. The
        function returns early, after printing the reason, when a required
        column is missing, when DBSCAN raises, or when no cluster is found.
    """
    images_per_row = 8

    print("\n==========================================")
    print("    DBSCAN + Triple-Axis Plot + Images")
    print("==========================================\n")

    needed = ["datetime", "days_since_start", "f_avg", "delta_au", "s_brt"]
    for c in needed:
        if c not in df_all.columns:
            print(f"ERROR: missing '{c}' column in df_all.")
            return

    print("STEP 1: Checking df_all sample (first 5 rows):")
    print(df_all.head(5))

    # Check for NaNs
    nan_favg  = df_all["f_avg"].isna().sum()
    nan_delta = df_all["delta_au"].isna().sum()
    nan_sbrt  = df_all["s_brt"].isna().sum()
    nan_days  = df_all["days_since_start"].isna().sum()
    if (nan_favg + nan_delta + nan_sbrt + nan_days) > 0:
        print(f"WARNING: NaNs => f_avg({nan_favg}), delta({nan_delta}), s_brt({nan_sbrt}), days({nan_days}).")

    # Work on a copy so the caller's frame does not gain the helper columns.
    df_all = df_all.copy()
    df_all["index_number"] = np.arange(len(df_all)) + 1

    print(f"\nSTEP 2: DBSCAN on 'days_since_start' (eps={eps}, min_samples={min_samples})...")
    X = df_all["days_since_start"].values.reshape(-1,1)
    # Handled the same way as in the delta version (11B): if DBSCAN rejects the
    # input (for example NaN times, which are only warned about above), report
    # it and stop instead of letting the exception abort the cell.
    try:
        db = DBSCAN(eps=eps, min_samples=min_samples)
        cluster_labels = np.asarray(db.fit_predict(X))
    except Exception as e:
        print(f"ERROR: DBSCAN failed: {e}")
        return

    # DBSCAN numbers clusters from 0 and uses -1 for noise; shift the real
    # clusters to start at 1 and leave -1 untouched.
    df_all["cluster"] = np.where(cluster_labels >= 0, cluster_labels + 1, cluster_labels)
    clusters = np.unique(df_all.loc[df_all["cluster"] != -1, "cluster"])

    print("DBSCAN done. Total points:", len(df_all))
    n_noise = int(np.sum(cluster_labels == -1))
    print(f"Noise points: {n_noise}")
    print("Valid cluster labels (excluding -1):", clusters)

    if len(clusters) == 0:
        print("No valid clusters found.")
        return

    print("\nSTEP 3: Summaries of clusters:")
    for cluster_id in clusters:
        cdata = df_all[df_all["cluster"] == cluster_id]
        cmin = cdata["days_since_start"].min()
        cmax = cdata["days_since_start"].max()
        print(f"  Cluster {cluster_id}: {len(cdata)} pts, time-range=({cmin:.2f}, {cmax:.2f})")

    if merges is None:
        merges = []
    merges = [set(m) for m in merges]

    # One plot item per figure: a merge group the first time one of its
    # members is met, otherwise the cluster on its own.
    plot_items = []
    used = set()
    for lbl in clusters:
        if lbl in used:
            continue
        belongs_to = next((mset for mset in merges if lbl in mset), None)
        if belongs_to is not None:
            plot_items.append(belongs_to)
            used.update(belongs_to)
        else:
            plot_items.append({lbl})
            used.add(lbl)

    filename_col_present = ("filename" in df_all.columns)
    have_cropped_images = (cropped_images is not None) and (len(cropped_images) > 0)

    for item in plot_items:
        # Plain ints so printed lists and titles read [11, 12, 13] regardless
        # of how NumPy chooses to repr its scalar types.
        label_list = sorted(int(lbl) for lbl in item)
        sub_df = df_all[df_all["cluster"].isin(label_list)].copy()
        if sub_df.empty:
            continue

        print(f"\n   => Plotting cluster(s) {label_list}, total points: {len(sub_df)}")
        snippet_data = sub_df.head(5)
        print("   First 5 lines (with index_number):")
        print(snippet_data[["index_number","days_since_start","f_avg","delta_au","s_brt"]])

        # images
        matching_images = []
        if filename_col_present and have_cropped_images:
            valid_filenames = set(sub_df["filename"].unique())
            matching_images = [
                (fidx,fname,dobs,tobs,img_8u)
                for (fidx,fname,dobs,tobs,img_8u) in cropped_images
                if fname in valid_filenames
            ]

        n_img = len(matching_images)
        img_rows = math.ceil(n_img/images_per_row)
        # Row 0 holds the overlay plot across all columns; image rows follow.
        fig = plt.figure(figsize=(25, 6 + 3*img_rows))
        gs = GridSpec(nrows=img_rows+1, ncols=images_per_row,
                      height_ratios=[6] + [3]*img_rows,
                      figure=fig)

        # triple-axis: ax1 on the left, ax2 and ax3 twinned on the right, with
        # ax3's spine pushed outwards so the two right-hand scales do not overlap
        ax1 = fig.add_subplot(gs[0,:])
        ax2 = ax1.twinx()
        ax3 = ax1.twinx()
        ax3.spines.right.set_position(("axes",1.035))

        col_favg  = 'blue'
        col_delta = 'orange'
        col_sbrt  = 'green'

        # Draw each series with a single scatter() call (one collection per
        # series rather than one per data point), then label every point with
        # its 1-based row number in df_all.
        series_specs = (
            (ax1, "f_avg", col_favg),
            (ax2, "delta_au", col_delta),
            (ax3, "s_brt", col_sbrt),
        )
        for axis, column, colour in series_specs:
            axis.scatter(sub_df["days_since_start"], sub_df[column],
                         color=colour, s=20, alpha=0.8)
            for x_val, y_val, idx in zip(sub_df["days_since_start"],
                                         sub_df[column],
                                         sub_df["index_number"]):
                axis.text(x_val, y_val, str(idx),
                          fontsize=8, ha='left', va='bottom', color=colour)

        # Titles
        if len(label_list)>1:
            ax1.set_title(f"Merged Clusters {label_list}: f_avg / delta / s_brt vs. Time", fontsize=14)
        else:
            ax1.set_title(f"Cluster {label_list[0]}: f_avg / delta / s_brt vs. Time", fontsize=14)

        ax1.set_xlabel("Days Since Start")
        ax1.set_ylabel("f_avg", color=col_favg)
        ax2.set_ylabel("delta_au", color=col_delta)
        ax3.set_ylabel("s_brt", color=col_sbrt)

        ax1.tick_params(axis='y', labelcolor=col_favg)
        ax2.tick_params(axis='y', labelcolor=col_delta)
        ax3.tick_params(axis='y', labelcolor=col_sbrt)

        # X-limits: pad by margin_scale of the span, or a fixed 0.1 when all
        # points share the same time
        t_min = sub_df["days_since_start"].min()
        t_max = sub_df["days_since_start"].max()
        span = t_max - t_min
        if span<=0:
            t_min-=0.1
            t_max+=0.1
        else:
            t_min-= margin_scale*span
            t_max+= margin_scale*span
        ax1.set_xlim(t_min, t_max)

        # place images below
        for i_img, (fidx,fname,dobs,tobs,img_8u) in enumerate(matching_images):
            # Same validity check as the flux version (11A): an entry without
            # usable pixel data is reported and its slot is left empty instead
            # of crashing imshow().
            if img_8u is None or np.size(img_8u) <= 1:
                print(f"      Skipping display for image index {fidx} (fname: {fname}) - Invalid image data.")
                continue
            row_i = 1 + (i_img//images_per_row)
            col_i = i_img % images_per_row
            ax_img = fig.add_subplot(gs[row_i,col_i])
            ax_img.imshow(img_8u, cmap='gray', origin='lower')
            ax_img.set_title(f"File #{fidx}", fontsize=8)
            ax_img.axis("off")

        plt.tight_layout()
        plt.show()

        print(f"   => Found {len(matching_images)} image(s) for clusters {label_list}.")
        for (fidx,fname,dobs,tobs,_) in matching_images:
            print(f"       * {fidx} => {fname} => {dobs} {tobs}")

    print("\n===== End of DBSCAN + Triple-Axis Plot + Images =====\n")

# Run the triple-axis overlay on df_all (built earlier in this cell).
# DBSCAN clusters on 'days_since_start', so eps is a time distance in days;
# the merges entry draws clusters 11, 12 and 13 together in one figure.
run_dbscan_and_plot_with_images_triple(
    df_all=df_all,
    cropped_images=cropped_images,
    merges=[[11, 12, 13]],
    min_samples=1,
    eps=0.4,
)


CELL 13 - an image saving system; the saved images are needed for the next part of my code, which runs independently of this analysis script

In [None]:
# Cell 13: Image Saving & Numbering System
#
# Writes every image held in `image_storage` (built in Cell 5) to disk as PNG,
# one subfolder per image type, so a later independent step can read the files.
# File names are "<prefix>_<NNN>_<dt_stamp>.png": NNN is a per-type running
# number, dt_stamp is built from the entry's date_obs/time_obs
# (e.g. "20230101_123456") or "NA" when either one is missing.

import os
import cv2

# Make sure you have access to the list "image_storage" from Cell 5

# 1) Create parent folder "images"
parent_folder = "images"
os.makedirs(parent_folder, exist_ok=True)  # doesn't fail if already exists

# 2) One row per image type: (key inside an image_storage entry, subfolder, file prefix)
image_kinds = [
    ("orig_image",    "1 Original_Images", "orig"),
    ("rotated_image", "2 Rotated_Images",  "rot"),
    ("contour_image", "3 Contour_Images",  "cont"),
    ("cropped_image", "4 Cropped_Images",  "crop"),
]
for kind in image_kinds:
    os.makedirs(os.path.join(parent_folder, kind[1]), exist_ok=True)

# 3) Per-type running numbers (each type is numbered independently, starting at 1)
next_number = {kind[2]: 1 for kind in image_kinds}

# cv2.imwrite signals failure by returning False instead of raising, so the
# outcome of every write is tracked and reported at the end.
saved_count = 0
failed_writes = []  # (entry index, source file name, target path)

# 4) Iterate through image_storage to save each type
for entry in image_storage:
    entry_index = entry["index"]
    file_name = entry["file_name"]
    date_obs = entry["date_obs"]
    time_obs = entry["time_obs"]

    # Convert date/time to a file-friendly format: e.g. "20230101_123456"
    if date_obs != "N/A" and time_obs != "N/A":
        date_str = date_obs.replace("-", "")  # "2023-01-01" -> "20230101"
        time_str = time_obs.replace(":", "")  # "12:34:56" -> "123456"
        dt_stamp = f"{date_str}_{time_str}"
    else:
        # If missing date/time, fallback to something like "NA"
        dt_stamp = "NA"

    for storage_key, subfolder, prefix in image_kinds:
        image = entry.get(storage_key)
        if image is None:
            continue  # this entry has no image of that type

        out_name = f"{prefix}_{next_number[prefix]:03d}_{dt_stamp}.png"
        out_path = os.path.join(parent_folder, subfolder, out_name)
        # The number advances even when the write fails, so the names of all
        # later files stay exactly what they were before this check existed.
        next_number[prefix] += 1

        if cv2.imwrite(out_path, image):
            saved_count += 1
        else:
            failed_writes.append((entry_index, file_name, out_path))

if failed_writes:
    print(f"WARNING: {len(failed_writes)} image(s) could not be written ({saved_count} saved):")
    for entry_index, file_name, out_path in failed_writes:
        print(f"   * entry {entry_index} ({file_name}) -> {out_path}")
else:
    print("All images have been saved into the 'images' folder with numbering and timestamps!")

Here's a breakdown of what each of the 48 data points in the **Horizons API output** represents:

---

**Position & Motion Data (1–6)**

1. **Astrometric RA & DEC** – Right Ascension and Declination of the target in the ICRF frame, corrected for light-time only (no stellar aberration or gravitational deflection).
2. **Apparent RA & DEC** – RA & DEC adjusted for light-time, stellar aberration, and gravitational deflection.
3. **Rates: RA & DEC** – Change in RA & DEC over time (arcsec/hour).
4. **Apparent AZ & EL** – Azimuth and elevation of the object from the observer’s location.
5. **Rates: AZ & EL** – Change in AZ & EL over time (arcsec/minute).
6. **Satellite X & Y, pos. angle** – Relative X and Y position of a satellite w.r.t. its primary body.

---

**Time & Local Observational Data (7–11)**

7.  **Local Apparent Sidereal Time** – Observer’s sidereal time at the local meridian.
8.  **Airmass & extinction** – Atmospheric extinction and relative optical airmass.
9.  **Visual mag. & Surface Brightness** – Apparent visual magnitude and surface brightness.
10. **Illuminated fraction** – Fraction of the object illuminated by the Sun (phase).
11. **Defect of illumination** – Angular width of the unilluminated portion of the object.

---

**Angular Data (12–16)**

12. **Satellite angular separation/visibility** – Angular separation of a satellite from its primary.
13. **Target angular diameter** – Angular size of the target object.
14. **Observer sub-lon & sub-lat** – Sub-observer latitude and longitude on the target body.
15. **Sun sub-longitude & sub-latitude** – Sub-solar latitude and longitude on the target body.
16. **Sub-Sun position angle & distance** – Angular position of the sub-solar point and its separation from the observer's sub-point.

---

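**Orbital & Range Data (17–22)**

17. **North Pole position angle & distance** – Position angle of the target’s north pole (measured from the direction of the celestial north pole) and its angular distance from the sub-observer point (center of the disc).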
18. **Heliocentric ecliptic lon. & lat.** – Longitude and latitude of the target in the heliocentric ecliptic J2000 frame.
19. **Heliocentric range & range-rate** – Distance of the target from the Sun and its radial velocity.
20. **Observer range & range-rate** – Distance of the target from the observer and its radial velocity.
21. **One-way (down-leg) light-time** – Time taken for light to travel from the object to the observer.
22. **Speed wrt Sun & observer** – Target’s velocity relative to the Sun and the observer.

---

**Elongation & Angles (23–28)**

23. **Sun-Observer-Target ELONG angle** – Angle between the Sun, observer, and target.
24. **Sun-Target-Observer ~PHASE angle** – Phase angle of the target as seen from the observer.
25. **Target-Observer-Moon angle/Illum%** – Angular separation between the target and the Moon and Moon's illumination fraction.
26. **Observer-Primary-Target angle** – Angle between the target, its primary body, and the observer.
27. **Sun-Target radial & velocity pos. angle** – Position angles, in the observer’s plane of sky, of the extended Sun-to-target radius vector and of the negative of the target’s heliocentric velocity vector.
28. **Orbit plane angle** – Angle between the observer and the target’s orbital plane.

---
 
**Constellation, Time & Reference Frames (29–34)**

29. **Constellation ID** – 3-letter abbreviation of the constellation in which the target appears.
30. **Delta-T (TDB - UT)** – Difference between Barycentric Dynamical Time (TDB) and Universal Time (UT).
31. **Observer ecliptic lon. & lat.** – Ecliptic longitude and latitude of the target as seen by the observer.
32. **North pole RA & DEC** – Right Ascension and Declination of the target’s north pole.
33. **Galactic longitude & latitude** – Target’s position in the galactic coordinate system.
34. **Local apparent SOLAR time** – Local solar time at the observer’s location.

---

**Uncertainty & Error Data (35–40)**

35. **Earth->obs. site light-time** – Light-time from Earth’s center to the observer’s site.
36. **RA & DEC uncertainty** – Uncertainty in the RA and DEC of the target.
37. **Plane-of-sky error ellipse** – Elliptical uncertainty region in the sky for the target’s position.
38. **POS uncertainty (RSS)** – Root-sum-square (RSS) uncertainty in the target’s position.
39. **Range & range-rate 3-sigmas** – 3σ uncertainty in the range and range rate of the target.
40. **Doppler & delay 3-sigmas** – 3σ uncertainty in the target’s Doppler and delay measurements.

---

**Orbital & Physical Data (41–44)**

41. **True anomaly angle** – Target’s position in its orbit relative to perihelion.
42. **Local apparent hour angle** – Hour angle of the target as seen by the observer.
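43. **PHASE angle & bisector** – Phase angle (Sun-Target-Observer) together with the longitude and latitude of the phase-angle bisector (the direction bisecting the Sun–target and observer–target lines).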
44. **Apparent longitude Sun (L_s)** – Solar longitude in the target’s coordinate system.

---

**Apparent Motion Data (45–48)**

45. **Inertial apparent RA & DEC** – RA & DEC in the inertial ICRF frame.
46. **Rate: Inertial RA & DEC** – Change in RA & DEC in the ICRF frame.
47. **Sky motion: rate & angles** – Rate of target’s motion in the sky.
48. **Lunar sky-brightness & sky SNR** – Brightness of the sky due to moonlight and signal-to-noise ratio of the target.

---

This breakdown summarizes what each of the 48 quantity codes represents in the Horizons API output. For the authoritative definitions (units, sign conventions, reference frames), see the JPL Horizons documentation.