In [None]:
import pandas as pd
import numpy as np
import h5py
from astropy.io import fits
from astropy.table import Table
from astropy.coordinates import SkyCoord
import astropy.units as u
from tqdm import tqdm
import astropy
import matplotlib.pyplot as plt
import seaborn as sns
import galpy
from galpy.orbit import Orbit
from galpy.potential import MWPotential2014
from astropy.coordinates import SkyCoord, Galactocentric, CartesianDifferential, ICRS, Galactic, CylindricalRepresentation, CylindricalDifferential
from matplotlib.colors import Normalize
from galpy.util.coords import lbd_to_XYZ, XYZ_to_galcencyl
import warnings
pd.set_option('display.max_columns', None)

In [2]:
with h5py.File('../data/JDrgb_14m_hasrv.hdf5', 'r') as f:
    columns_group = f['table/columns']
    col_names = list(columns_group.keys())
    data_dict = {}

    for col in tqdm(col_names, desc="Reading HDF5 columns"):
        dataset = columns_group[col]
        subkey = list(dataset.keys())[0]
        data = dataset[subkey][:]
        if data.dtype.kind == 'S':
            data = data.astype(str)
        data_dict[col] = data

df_vis = pd.DataFrame(data_dict)
print(f"Loaded df_vis with shape {df_vis.shape}")

Reading HDF5 columns: 100%|██████████| 30/30 [00:00<00:00, 57.80it/s]


Loaded df_vis with shape (14485519, 30)


In [3]:
print("Loading FITS...")
with fits.open('../data/table_2_catwise.fits.gz', memmap=True) as hdul:
    data = hdul[1].data
    df_RGB = pd.DataFrame({
        col.name: data[col.name].byteswap().newbyteorder() if data[col.name].dtype.byteorder == '>' else data[col.name]
        for col in hdul[1].columns
    })

df_rgb_subset = df_RGB[['source_id', 'catwise_w1', 'catwise_w2', 'mh_xgboost', 'teff_xgboost', 'logg_xgboost']]


Loading FITS...


In [None]:
print("Merging dataframes...")
df_vis['source_id'] = df_vis['source_id'].astype(str)
df_rgb_subset = df_RGB[['source_id', 'catwise_w1', 'catwise_w2', 'mh_xgboost', 'teff_xgboost', 'logg_xgboost']].copy()
df_rgb_subset['source_id'] = df_rgb_subset['source_id'].astype(str)
df_merged = pd.merge(df_vis, df_rgb_subset, on='source_id', how='left')
print(f"Merged shape: {df_merged.shape}")

Merging dataframes...
Merged shape: (14485519, 35)


In [6]:
# Count how many rows have any missing values in the columns added from df_rgb_subset
num_missing = df_merged[['catwise_w1', 'catwise_w2', 'mh_xgboost', 'teff_xgboost', 'logg_xgboost']].isna().any(axis=1).sum()

print(f"[✓] Number of rows in df_vis without a match from df_rgb_subset: {num_missing}")


[✓] Number of rows in df_vis without a match from df_rgb_subset: 2507010


In [7]:
# Drop rows with missing values in any of the merged columns
df_merged = df_merged.dropna(subset=['catwise_w1', 'catwise_w2', 'mh_xgboost', 'teff_xgboost', 'logg_xgboost'])

print(f"[✓] Shape after dropping unmatched rows: {df_merged.shape}")


[✓] Shape after dropping unmatched rows: (11978509, 35)


In [8]:
print("Applying quality cuts...")

df_merged['parallax_over_error'] = df_merged['parallax'] / df_merged['parallax_error']

df_merged['MW1'] = df_merged['catwise_w1'] + 5 * np.log10(df_merged['parallax'] / 100)

df_merged['G'] = df_merged['phot_g_mean_mag']

df_merged['GBP'] = df_merged['phot_bp_mean_mag']

df_merged['W1'] = df_merged['catwise_w1']
df_merged['W2'] = df_merged['catwise_w2']

cut = (
    (df_merged['phot_g_mean_mag'] < 16) &
    (df_merged['parallax_over_error'] > 5) &
    (df_merged['teff_xgboost'] <= 5200) &
    (df_merged['logg_xgboost'] < 3.5) &
    (df_merged['MW1'] > (-0.3 - 0.006 * (5500 - df_merged['teff_xgboost']))) &
    (df_merged['MW1'] > (-0.01 * (5300 - df_merged['teff_xgboost']))) &
    ((df_merged['G'] - df_merged['W2']) < (0.2 + 0.77 * (df_merged['GBP'] - df_merged['W1'])))
)

df_cleaned = df_merged[cut].copy()
print(f"After cuts: {df_cleaned.shape}")

Applying quality cuts...
After cuts: (10456910, 41)


In [9]:
dwarf_galaxies = Table.read("../local_volume_database/data/dwarf_mw.csv")
globular_clusters = Table.read("../local_volume_database/data/gc_harris.csv")

dwarf_coords = SkyCoord(ra=dwarf_galaxies['ra'] * u.deg, dec=dwarf_galaxies['dec'] * u.deg)
gc_coords = SkyCoord(ra=globular_clusters['ra'] * u.deg, dec=globular_clusters['dec'] * u.deg)
known_coords = SkyCoord(
    ra=np.concatenate([dwarf_coords.ra.deg, gc_coords.ra.deg]) * u.deg,
    dec=np.concatenate([dwarf_coords.dec.deg, gc_coords.dec.deg]) * u.deg
)

print("Computing nearest matches...")
df_cleaned['ra'] = pd.to_numeric(df_cleaned['ra'], errors='coerce')
df_cleaned['dec'] = pd.to_numeric(df_cleaned['dec'], errors='coerce')
df_cleaned = df_cleaned.dropna(subset=['ra', 'dec'])

sample_coords = SkyCoord(ra=df_cleaned['ra'].values * u.deg, dec=df_cleaned['dec'].values * u.deg)

# Use efficient nearest neighbor match instead of all-pair separation
idx, sep2d, _ = sample_coords.match_to_catalog_sky(known_coords)

# Filter stars >1 degree away
df_final = df_cleaned[sep2d.deg > 1].copy()
print(f"After sky filtering: {df_final.shape}")

Computing nearest matches...
After sky filtering: (10060704, 41)


In [None]:
from astropy.table import Table

df_final['source_id'] = df_final['source_id'].astype(str)

# Convert to Astropy Table
table = Table.from_pandas(df_final)

# Save to FITS
output_fits_filename = '../data/vis_cleaned.fits'
table.write(output_fits_filename, format='fits', overwrite=True)

print(f"[✓] FITS file saved as '{output_fits_filename}'")


[✓] FITS file saved as '../data/vis_cleaned.fits'


In [None]:
table = Table.read("../data/vis_cleaned.fits", format='fits')

# Convert to a pandas DataFrame
df_vis = table.to_pandas()
df_vis['source_id'] = df_vis['source_id'].str.decode('utf-8').astype(np.int64)


In [3]:
df_vis.shape

(10060704, 41)

In [None]:
sample = pd.read_csv("../data/gedr3dist.dump.gz", sep="\t", nrows=5)
print("Columns in gedr3dist.dump.gz:")
print(sample.columns.tolist())

Columns in gedr3dist.dump.gz:
['source_id,r_med_geo,r_lo_geo,r_hi_geo,r_med_photogeo,r_lo_photogeo,r_hi_photogeo,flag']


In [None]:
# Paths
fits_path = "../data/vis_cleaned.fits"
dump_path = "../data/gedr3dist.dump.gz"
output_path = "../data/vis_with_distances.fits"

# Load vis_cleaned and prepare source_id set
df_vis = Table.read(fits_path, format='fits').to_pandas()
df_vis['source_id'] = df_vis['source_id'].astype(str)
source_set = set(df_vis['source_id'])
print(f"[✓] Loaded vis_cleaned: {len(df_vis):,} rows")

# Column names and types in the dump
columns = ['source_id', 'r_med_geo', 'r_lo_geo', 'r_hi_geo',
           'r_med_photogeo', 'r_lo_photogeo', 'r_hi_photogeo', 'flag']
dtype_dict = {
    "source_id": str,
    "r_med_geo": float,
    "r_lo_geo": float,
    "r_hi_geo": float,
    "r_med_photogeo": float,
    "r_lo_photogeo": float,
    "r_hi_photogeo": float,
    "flag": str
}

# Parameters
total_rows = 1_467_744_819  # known from gzcat | wc -l
chunk_size = 1_000_000
n_chunks = (total_rows - 1 + chunk_size - 1) // chunk_size  # -1 to exclude header

# Stream and match
print("Processing gedr3dist.dump.gz with source_id matching...")
matches = []
reader = pd.read_csv(
    dump_path, 
    sep=",", 
    names=columns, 
    dtype=dtype_dict, 
    skiprows=1, 
    chunksize=chunk_size
)

for chunk in tqdm(reader, total=n_chunks, desc="Matching chunks", unit="chunk"):
    matched = chunk[chunk['source_id'].isin(source_set)]
    matches.append(matched)

# Concatenate and merge
df_matched = pd.concat(matches, ignore_index=True)
print(f"[✓] Total matched rows: {len(df_matched):,}")

df_merged = pd.merge(df_vis, df_matched, on="source_id", how="left")
print(f"[✓] Final merged DataFrame shape: {df_merged.shape}")


[✓] Loaded vis_cleaned: 10,060,704 rows
Processing gedr3dist.dump.gz with source_id matching...


Matching chunks: 100%|██████████| 1468/1468 [1:12:25<00:00,  2.96s/chunk]


[✓] Total matched rows: 10,060,704
[✓] Final merged DataFrame shape: (10060704, 48)


In [9]:
# Drop rows where r_med_photogeo is NaN
df_filtered = df_merged.dropna(subset=['r_med_photogeo'])

print(f"[✓] Rows after dropping NaNs in r_med_photogeo: {len(df_filtered):,}")


[✓] Rows after dropping NaNs in r_med_photogeo: 10,060,164


In [None]:
# Save as FITS
table_out = Table.from_pandas(df_filtered)
table_out.write(output_path, format='fits', overwrite=True)
print(f"[✓] Saved merged file to: {output_path}")

[✓] Saved merged file to: ../data/vis_with_distances.fits


In [None]:
table = Table.read("../data/vis_with_distances.fits", format='fits')

# Convert to a pandas DataFrame
df_vis = table.to_pandas()

df_vis.head()


Unnamed: 0,aom_xp,b,bp_rp,dec,e_aom_xp,e_logg_xp,e_moh_xp,e_teff_xp,fake_MG,l,...,GBP,W1,W2,r_med_geo,r_lo_geo,r_hi_geo,r_med_photogeo,r_lo_photogeo,r_hi_photogeo,flag
0,0.0189,-48.572035,1.154534,0.335043,0.0222,0.1309,0.0717,53.5303,158.232347,176.739184,...,10.750277,8.152,8.198,695.683899,683.627625,707.396423,696.27832,688.270874,707.143982,b'10033'
1,0.1512,-48.171322,1.40929,0.736093,0.0502,0.1197,0.0759,44.4781,63.236201,176.483565,...,11.150994,7.891,7.964,1884.27502,1821.24756,1947.30273,1883.14355,1807.3186,1944.66577,b'10033'
2,0.0036,-48.607026,1.189063,0.561503,0.008,0.0742,0.0333,30.6331,293.932164,176.209301,...,11.169669,8.496,8.558,452.636078,448.701294,456.61554,452.602692,449.257355,455.432892,b'10033'
3,0.2962,-48.727781,1.131072,0.689953,0.0328,0.1724,0.0808,59.5872,356.97924,175.755174,...,14.816144,12.22,12.275,1908.47827,1815.02417,2014.48389,1868.60388,1800.9408,1949.8313,b'10033'
4,0.0726,-48.328584,1.328486,0.95508,0.0245,0.0762,0.0429,27.8866,143.555092,175.789759,...,11.81022,8.728,8.815,1178.40747,1155.31982,1203.71118,1177.68726,1154.2196,1201.32275,b'10033'


In [None]:
def calculate_galactocentric_manual(df):
    """
    Manually calculate Galactocentric coordinates (R and Z).

    Parameters:
    - df: DataFrame with necessary astrometric data.

    Returns:
    - DataFrame with added Galactocentric radius (R) and Z coordinates.
    """
    # Define constants
    R_sun = 8.122  # Distance of the Sun from the Galactic centre in kpc

    # Convert l and b from degrees to radians
    l_rad = np.radians(df['l'].values)
    b_rad = np.radians(df['b'].values)

    # Use rpgeo as distance in kpc
    d_kpc = df['r_med_photogeo'].values / 1000  # Convert from pc to kpc

    # Calculate Cartesian coordinates
    x = d_kpc * np.cos(b_rad) * np.cos(l_rad)
    y = d_kpc * np.cos(b_rad) * np.sin(l_rad)
    z = d_kpc * np.sin(b_rad)

    # Adjust for Sun's position relative to the Galactic centre
    x_galactocentric = x - R_sun

    # Calculate cylindrical radius R
    R = np.sqrt(x_galactocentric**2 + y**2)

    # Add results to the DataFrame
    df['R'] = R  # Galactocentric radius in kpc
    df['Z'] = z  # Height above the Galactic plane in kpc

    return df

# Apply the function to your DataFrame
df = calculate_galactocentric_manual(df_vis)

# Remove rows with NaN values in R or Z columns
df = df.dropna(subset=['R', 'Z'])

df.shape


(10060164, 50)

In [None]:
# Extract relevant columns
l = df['l'].values  # Galactic longitude in degrees
b = df['b'].values  # Galactic latitude in degrees
rpgeo = df['r_med_photogeo'].values  # Distance in parsecs

# Convert Galactic coordinates to Cartesian using galpy
xyz = lbd_to_XYZ(l, b, rpgeo, degree=True)
x, y, z = xyz.T  # Unpacking the array

# Transform to Galactocentric cylindrical coordinates
Xsun = 8.2  # Distance of the Sun from the Galactic Centre in kpc
Zsun = 0.025  # Sun's height above the midplane in kpc
R_phi_z = XYZ_to_galcencyl(x, y, z, Xsun=Xsun, Zsun=Zsun, _extra_rot=True)
R, phi, z_gal = R_phi_z.T  # Unpacking the results

# Add Galactocentric cylindrical coordinates to the DataFrame
df["R_gal"] = R
df["phi_gal"] = phi
df["Z_gal"] = z_gal

In [None]:
# Define the Sun's velocity with respect to the Galactic center
v_sun = CartesianDifferential([11.1, 245., 7.25] * u.km / u.s)

# Define the Galactocentric frame
gc_frame = Galactocentric(galcen_distance=8.1 * u.kpc, 
                          z_sun=25 * u.pc, 
                          galcen_v_sun=v_sun)

# Extract the columns as numpy arrays
ra = df['ra'].values * u.deg
dec = df['dec'].values * u.deg
distance = df['r_med_photogeo'].values * u.pc  
pmra = df['pmra'].values * u.mas / u.yr
pmdec = df['pmdec'].values * u.mas / u.yr
vlos = df['radial_velocity'].values * u.km / u.s

# Create a SkyCoord object for all sources at once
coords = ICRS(ra=ra, dec=dec, distance=distance, pm_ra_cosdec=pmra, pm_dec=pmdec, radial_velocity=vlos)

# Transform all coordinates to the Galactocentric frame
cg = coords.transform_to(gc_frame)
cg.representation= 'cylindrical'

# Ensure cylindrical position and velocity representations
cg_cyl = cg.represent_as(CylindricalRepresentation)  # Cylindrical position (rho, phi, z)
cg_cyl_vel = cg.represent_as(CylindricalRepresentation, CylindricalDifferential).differentials['s']  # Cylindrical velocity

# Convert d_phi from rad/yr to rad/s
d_phi_rad_s = cg_cyl_vel.d_phi.to(u.rad / u.s)

# Convert rho from pc to km
rho_km = cg_cyl.rho.to(u.km)

# Compute v_phi (linear azimuthal velocity in km/s)
v_phi_kms = -(d_phi_rad_s * rho_km)

df['v_phi'] = v_phi_kms.value

# Convert d_rho to km/s
v_r_kms = cg_cyl_vel.d_rho.to(u.km / u.s)

# Convert d_rho to km/s
df['v_R'] = cg_cyl_vel.d_rho.to(u.km / u.s).value

df['v_Z'] = cg_cyl_vel.d_z.to(u.km / u.s).value

df.head()


In [15]:
# Save as FITS
table_out = Table.from_pandas(df)
table_out.write('../data/vis_main.fits', format='fits', overwrite=True)
print(f"[✓] Saved file")


[✓] Saved file


In [4]:
from astropy.io import fits
import pandas as pd
import numpy as np

# Load the datasets
vis_main = fits.open('../data/vis_main.fits')[1].data
df_v_final = fits.open('../data/df_v_final.fits')[1].data


# Convert to DataFrames
df_vis_main = pd.DataFrame(np.array(vis_main))
df_v_final = pd.DataFrame(np.array(df_v_final))

# Convert source_id to integers if needed
df_vis_main['source_id'] = df_vis_main['source_id'].astype(str).astype(np.int64)
df_v_final['source_id'] = df_v_final['source_id'].astype(str).astype(np.int64)

# Find common source_ids
common_source_ids = np.intersect1d(df_vis_main['source_id'], df_v_final['source_id'])

# Output the result
print(f'Number of common source_id values: {len(common_source_ids)}')


Number of common source_id values: 2983932


In [5]:
df_vis_main.shape

(10060164, 56)

In [6]:
df_v_final.shape

(3404929, 51)