In [23]:
import astropy
from astropy.io import fits
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from astropy.coordinates import SkyCoord
import astropy.units as u
from dustmaps.sfd import SFDQuery
import dustmaps.sfd

# RGB and Filtered RGB

In [2]:
sfd = SFDQuery()

In [3]:
merged = 'merged_data.fits'

with fits.open(merged) as hdul:
    hdul.info()
    columns = hdul[1].columns

Filename: merged_data.fits
No.    Name      Ver    Type      Cards   Dimensions   Format
  0  PRIMARY       1 PrimaryHDU       4   ()      
  1                1 BinTableHDU     80   17558141R x 36C   [K, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, L, D, D, K, D, D, D, D, D, D, D, D, I, D]   


In [15]:
pd.set_option('display.max_columns', None)

In [16]:
with fits.open(merged) as hdul:
    data = hdul[1].data  
    
    merged_df = pd.DataFrame({col.name: data[col.name].byteswap().newbyteorder() if data[col.name].dtype.byteorder == '>' else data[col.name]
                       for col in hdul[1].columns})

merged_df.head()

Unnamed: 0,source_id,l,b,ra,dec,parallax,parallax_error,pmra,pmra_error,pmdec,pmdec_error,ruwe,radial_velocity,radial_velocity_error,phot_g_mean_mag,phot_bp_mean_mag,phot_rp_mean_mag,catwise_w1,catwise_w2,mh_xgboost,teff_xgboost,logg_xgboost,in_training_sample,col1,col2,Source,RA_ICRS,DE_ICRS,rgeo,b_rgeo_x,B_rgeo_xa,rpgeo,b_rpgeo_x,B_rpgeo_xa,Flag,angDist
0,15741055975040,176.739184,-48.572035,45.136038,0.335043,1.439792,0.018947,-0.71128,0.017718,-1.412098,0.016528,1.036041,-0.738894,0.316921,10.254021,10.750235,9.595748,8.152,8.198,-0.144,5065.8,2.993,True,45.136038,0.335043,15741055975040,45.136038,0.335043,695.683899,683.627625,707.396423,696.27832,688.270874,707.143982,10033,0.0
1,25980257976960,176.369336,-48.732076,44.850926,0.398492,2.02193,0.225709,14.369971,0.287671,0.08376,0.203774,12.197247,47.34105,0.756424,11.72406,12.090828,10.976367,9.548,9.607,-0.083,5017.9,3.446,True,44.850926,0.398492,25980257976960,44.850926,0.398492,497.943695,440.956787,569.050537,486.625977,437.344543,566.133972,10033,0.0
2,66627828480768,176.483565,-48.171322,45.305053,0.736093,0.534038,0.020692,3.309832,0.022959,1.594356,0.022822,1.349013,41.60745,1.32343,10.50883,11.150953,9.741709,7.891,7.964,-0.401,4499.0,1.916,False,45.305053,0.736093,66627828480768,45.305053,0.736093,1884.27502,1821.24756,1947.30273,1883.14355,1807.3186,1944.66577,10033,0.0
3,82467667849472,176.209301,-48.607026,44.866246,0.561503,2.209985,0.016049,-4.869755,0.015797,-12.678339,0.01395,1.018742,-32.461674,0.205614,10.651456,11.169626,9.98061,8.496,8.558,0.114,4938.3,3.203,True,44.866246,0.561503,82467667849472,44.866246,0.561503,452.636078,448.701294,456.61554,452.602692,449.257355,455.432892,10033,0.0
4,101193725229056,175.755174,-48.727781,44.569524,0.689953,0.528788,0.024241,3.539184,0.027864,-1.599436,0.021891,1.176748,91.2959,4.830661,14.332739,14.816101,13.685076,12.22,12.275,-0.478,4980.2,3.319,False,44.569524,0.689953,101193725229056,44.569524,0.689953,1908.47827,1815.02417,2014.48389,1868.60388,1800.9408,1949.8313,10033,0.0


In [17]:
# Add fractional parallax uncertainty (fpu) column
merged_df['fpu'] = merged_df['parallax_error'] / merged_df['parallax']

# Filter based on fpu < 0.1
merged_df = merged_df[merged_df['fpu'] < 0.1]

In [18]:
# Filter based on Galactic latitude |b| > 10°
merged_df = merged_df[merged_df['b'].abs() > 10]

In [19]:
# Add E(B-V) values using the SFD dustmap and filter E(B-V) < 0.5
coords = SkyCoord(l=merged_df['l'].values * u.deg, b=merged_df['b'].values * u.deg, frame='galactic')
merged_df['E(B-V)'] = sfd(coords)
merged_df = merged_df[merged_df['E(B-V)'] < 0.5]

In [20]:
merged_df.shape

(3483206, 38)

In [22]:
merged_df.head()

Unnamed: 0,source_id,l,b,ra,dec,parallax,parallax_error,pmra,pmra_error,pmdec,pmdec_error,ruwe,radial_velocity,radial_velocity_error,phot_g_mean_mag,phot_bp_mean_mag,phot_rp_mean_mag,catwise_w1,catwise_w2,mh_xgboost,teff_xgboost,logg_xgboost,in_training_sample,col1,col2,Source,RA_ICRS,DE_ICRS,rgeo,b_rgeo_x,B_rgeo_xa,rpgeo,b_rpgeo_x,B_rpgeo_xa,Flag,angDist,fpu,E(B-V)
0,15741055975040,176.739184,-48.572035,45.136038,0.335043,1.439792,0.018947,-0.71128,0.017718,-1.412098,0.016528,1.036041,-0.738894,0.316921,10.254021,10.750235,9.595748,8.152,8.198,-0.144,5065.8,2.993,True,45.136038,0.335043,15741055975040,45.136038,0.335043,695.683899,683.627625,707.396423,696.27832,688.270874,707.143982,10033,0.0,0.01316,0.104689
2,66627828480768,176.483565,-48.171322,45.305053,0.736093,0.534038,0.020692,3.309832,0.022959,1.594356,0.022822,1.349013,41.60745,1.32343,10.50883,11.150953,9.741709,7.891,7.964,-0.401,4499.0,1.916,False,45.305053,0.736093,66627828480768,45.305053,0.736093,1884.27502,1821.24756,1947.30273,1883.14355,1807.3186,1944.66577,10033,0.0,0.038747,0.093026
3,82467667849472,176.209301,-48.607026,44.866246,0.561503,2.209985,0.016049,-4.869755,0.015797,-12.678339,0.01395,1.018742,-32.461674,0.205614,10.651456,11.169626,9.98061,8.496,8.558,0.114,4938.3,3.203,True,44.866246,0.561503,82467667849472,44.866246,0.561503,452.636078,448.701294,456.61554,452.602692,449.257355,455.432892,10033,0.0,0.007262,0.086075
4,101193725229056,175.755174,-48.727781,44.569524,0.689953,0.528788,0.024241,3.539184,0.027864,-1.599436,0.021891,1.176748,91.2959,4.830661,14.332739,14.816101,13.685076,12.22,12.275,-0.478,4980.2,3.319,False,44.569524,0.689953,101193725229056,44.569524,0.689953,1908.47827,1815.02417,2014.48389,1868.60388,1800.9408,1949.8313,10033,0.0,0.045842,0.078062
5,130399502833792,175.789759,-48.328584,44.868872,0.95508,0.847929,0.017479,3.382907,0.017619,-0.291235,0.015982,1.087555,1.959265,0.328298,11.215295,11.810178,10.481738,8.728,8.815,-0.159,4643.6,2.398,True,44.868872,0.95508,130399502833792,44.868872,0.95508,1178.40747,1155.31982,1203.71118,1177.68726,1154.2196,1201.32275,10033,0.0,0.020614,0.077163


In [23]:
data_array = np.array(
    list(merged_df.itertuples(index=False, name=None)),
    dtype=[(col, merged_df[col].dtype.type) for col in merged_df.columns]
)

hdu = fits.BinTableHDU(data_array)

output_fits_filename = 'high_quality_sample.fits'
hdu.writeto(output_fits_filename, overwrite=True)

print(f"FITS file saved as '{output_fits_filename}'")

FITS file saved as 'high_quality_sample.fits'


### high_quality_sample.fits is the dataset used in our analysis
### merged_data.fits is the RGB dataset
https://github.com/apace7/local_volume_database/tree/main

In [1]:
import numpy as np
from astropy.table import Table
from astropy.coordinates import SkyCoord
from astropy import units as u

# Step 1: Load your dataset
high_quality_sample = Table.read("high_quality_sample.fits")

In [2]:
dwarf_galaxies = Table.read("local_volume_database/data/dwarf_mw.csv")
globular_clusters = Table.read("local_volume_database/data/gc_harris.csv")

In [4]:
# Step 3: Extract RA, Dec from your dataset and known objects
sample_coords = SkyCoord(ra=high_quality_sample['ra'] * u.deg, dec=high_quality_sample['dec'] * u.deg)

# Dwarf galaxy coordinates
dwarf_coords = SkyCoord(ra=dwarf_galaxies['ra'] * u.deg, dec=dwarf_galaxies['dec'] * u.deg)

# Globular cluster coordinates
gc_coords = SkyCoord(ra=globular_clusters['ra'] * u.deg, dec=globular_clusters['dec'] * u.deg)

# Combine all known RA/Dec into one coordinate array
known_coords = SkyCoord(ra=np.concatenate([dwarf_coords.ra, gc_coords.ra]),
                        dec=np.concatenate([dwarf_coords.dec, gc_coords.dec]))


In [7]:
from tqdm import tqdm

# Compute angular separation for each star with respect to all known objects
angular_separations = np.array([
    sample_coords.separation(coord).to(u.deg).value  # Convert to degrees for clarity
    for coord in tqdm(known_coords, desc="Computing Angular Separations")
])

# Find the minimum separation for each star
min_separation = angular_separations.min(axis=0)  # Minimum separation for each star

# Filter: Keep only stars farther than 1 degree from known clusters/galaxies
filtered_sample = high_quality_sample[min_separation > 1]

Computing Angular Separations: 100%|██████████| 220/220 [00:21<00:00, 10.21it/s]


In [12]:
print(filtered_sample[:5])  # Displays the first 5 rows


   source_id            l          ...         fpu             E(B-V)  
--------------- ------------------ ... -------------------- -----------
 15741055975040 176.73918412075122 ... 0.013159694397383279 0.104689494
 66627828480768 176.48356548094657 ...  0.03874692865369786 0.093025796
 82467667849472  176.2093011960656 ... 0.007261927031374482   0.0860747
101193725229056 175.75517408467232 ...  0.04584230665742899  0.07806178
130399502833792  175.7897591660523 ... 0.020613814445839203   0.0771633


In [16]:
filtered_sample_df = filtered_sample.to_pandas()

In [21]:
filtered_sample_df.shape

(3404929, 38)

In [24]:
data_array = np.array(
    list(filtered_sample_df.itertuples(index=False, name=None)),
    dtype=[(col, filtered_sample_df[col].dtype.type) for col in filtered_sample_df.columns]
)

hdu = fits.BinTableHDU(data_array)

output_fits_filename = 'filtered_high_quality_sample.fits'
hdu.writeto(output_fits_filename, overwrite=True)

print(f"FITS file saved as '{output_fits_filename}'")

FITS file saved as 'filtered_high_quality_sample.fits'
