In [2]:
import sys
import os
sys.path.append("../") 

import pandas as pd
from astropy.table import Table
from src.data import (
    read_votable_to_dataframe,
    convert_byte_columns_to_str,
    filter_multiple_matches,
    get_most_probable_matches,
    get_second_most_probable_matches,
    get_last_probable_matches,
    prepare_final_dataset,
    include_classifications,
    read_gzipped_votable_to_dataframe
    )

%load_ext autoreload
%autoreload 2

In [5]:
# load Dong-Woo nway data: FITS table -> byte columns converted to str -> pandas
# NOTE(review): this file is read from '../data', while later cells read from
# '../../data/v3' — confirm both resolve correctly from the notebook directory.
nway_fits_path = '../data/nway_CSC21_GAIA3.fits'
t_nway = convert_byte_columns_to_str(Table.read(nway_fits_path, format='fits'))
df_nway_all = t_nway.to_pandas()

In [7]:
# correct chandra ids: underscores become spaces, then surrounding whitespace is trimmed
csc_ids = df_nway_all['CSC21_CSCID']
df_nway_all['CSC21_CSCID'] = csc_ids.str.replace('_', ' ').str.strip()

In [8]:
# generate dataframe of possible chandra matches:
# indexed by CSC21_CSCID, with the number of nway rows for that id in 'count'
nway_csc21_possible_matches_count = (
    df_nway_all[['CSC21_CSCID']]
    .value_counts()
    .rename('count')
    .to_frame()
)

In [5]:
# include them in df_nway_all
# The merge result is assigned back to df_nway_all, so running this cell twice
# used to yield 'count_x'/'count_y' columns. Dropping a 'count' column left by
# a previous run first makes the cell safe to re-run.
# NOTE(review): assumes the nway table has no native column named 'count'.
df_nway_all = (
    df_nway_all
    .drop(columns=['count'], errors='ignore')
    .merge(
        nway_csc21_possible_matches_count,
        left_on='CSC21_CSCID',
        right_on='CSC21_CSCID',
        how='left',
    )
)

In [6]:
# before this, download the whole CSC21 with properties from CSCView.
# The downloaded VOTable is read with the project helper from src.data;
# df_csc_all is used further down via its 'name' and 'min_theta_mean' columns.
df_csc_all = read_votable_to_dataframe('../../data/v3/csc_all_1.vot')

In [7]:
# filter nway crossmatches to only have those with >1 possible matches
# NOTE(review): filter_multiple_matches is defined in src.data; presumably it
# relies on the per-source 'count' column merged into df_nway_all — confirm.
df_filtered = filter_multiple_matches(df_nway_all, 'CSC21_CSCID')

In [8]:
# get the table of most probable matches based on p_i
# Runs on the full nway table (df_nway_all), not on df_filtered, unlike the
# second-most and last probable selections in the following cells.
df_most_probable = get_most_probable_matches(df_nway_all, 'CSC21_CSCID', 'p_i')

In [9]:
# get second most probable matches
# Input is df_filtered (sources with more than one candidate), keyed by
# 'CSC21_CSCID' and ranked by the 'p_i' column passed to the src.data helper.
df_second_most_probable = get_second_most_probable_matches(df_filtered, 'CSC21_CSCID', 'p_i')

In [10]:
# get last probable matches
# Same inputs as the second-most-probable selection: df_filtered, keyed by
# 'CSC21_CSCID', using the 'p_i' column.
df_last_probable = get_last_probable_matches(df_filtered, 'CSC21_CSCID', 'p_i')

In [11]:
# prepare final datasets: each match table is passed, together with the CSC
# master table, through the same src.data helper
prepared_most_probable, prepared_second_prob, prepared_last_prob = (
    prepare_final_dataset(match_table, df_csc_all)
    for match_table in (df_most_probable, df_second_most_probable, df_last_probable)
)

In [12]:
# read classification tables
# These three tables are joined to the match tables further down:
#   yangetal_gcs        -> matched on 'CSCv2_name', contributes 'Class'
#   yangetal_training   -> matched on 'name', contributes 'Class'
#   perezdiazetal_class -> matched on 'name', contributes 'agg_master_class'
# NOTE(review): the recorded output echoes the yangetal_gcs line, which looks
# like a pandas warning raised while reading that file — check its message.
yangetal_gcs = pd.read_csv('../../data/v3/yangetal_gcs.csv')
yangetal_training = pd.read_csv('../../data/v3/yangetal_training.csv')
perezdiazetal_class = pd.read_csv('../../data/v3/uniquely_classified.csv')

  yangetal_gcs = pd.read_csv('../../data/v3/yangetal_gcs.csv')


In [13]:
# include classifications for the most probable matches
# each entry: (classification table, its source-name column,
#              its class column, name of the class column in the output)
classification_sources = (
    (yangetal_gcs, 'CSCv2_name', 'Class', 'yangetal_gcs_class'),
    (yangetal_training, 'name', 'Class', 'yangetal_training_class'),
    (perezdiazetal_class, 'name', 'agg_master_class', 'perezdiazetal_class'),
)

# apply the sources one after another, feeding each result into the next call
p_most_probable_with_class = prepared_most_probable
for class_table, name_col, class_col, out_col in classification_sources:
    p_most_probable_with_class = include_classifications(
        p_most_probable_with_class,
        class_table,
        base_col='csc21_name',
        additional_col=name_col,
        additional_columns=[class_col],
        rename_columns={class_col: out_col},
    )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)


In [14]:
# include classifications for the second most probable matches
# each entry: (classification table, its source-name column,
#              its class column, name of the class column in the output)
classification_sources = (
    (yangetal_gcs, 'CSCv2_name', 'Class', 'yangetal_gcs_class'),
    (yangetal_training, 'name', 'Class', 'yangetal_training_class'),
    (perezdiazetal_class, 'name', 'agg_master_class', 'perezdiazetal_class'),
)

# apply the sources one after another, feeding each result into the next call
p_second_most_probable_with_class = prepared_second_prob
for class_table, name_col, class_col, out_col in classification_sources:
    p_second_most_probable_with_class = include_classifications(
        p_second_most_probable_with_class,
        class_table,
        base_col='csc21_name',
        additional_col=name_col,
        additional_columns=[class_col],
        rename_columns={class_col: out_col},
    )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)


In [15]:
# include classifications for the last probable matches
# each entry: (classification table, its source-name column,
#              its class column, name of the class column in the output)
classification_sources = (
    (yangetal_gcs, 'CSCv2_name', 'Class', 'yangetal_gcs_class'),
    (yangetal_training, 'name', 'Class', 'yangetal_training_class'),
    (perezdiazetal_class, 'name', 'agg_master_class', 'perezdiazetal_class'),
)

# apply the sources one after another, feeding each result into the next call
p_last_prob_with_class = prepared_last_prob
for class_table, name_col, class_col, out_col in classification_sources:
    p_last_prob_with_class = include_classifications(
        p_last_prob_with_class,
        class_table,
        base_col='csc21_name',
        additional_col=name_col,
        additional_columns=[class_col],
        rename_columns={class_col: out_col},
    )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)


In [16]:
# write the three classified match tables to CSV (row index omitted)
for classified_table, csv_path in (
    (p_most_probable_with_class, '../../out_data/v3/most_prob_class.csv'),
    (p_second_most_probable_with_class, '../../out_data/v3/second_most_prob_class.csv'),
    (p_last_prob_with_class, '../../out_data/v3/last_prob_class.csv'),
):
    classified_table.to_csv(csv_path, index=False)

# end

In [18]:
# this is for querying gaia archive, create a list of the gaia ids needed
# stack the 'gaia3_source_id' column of the three prepared tables
all_gaia3_ids = pd.concat([
    prepared['gaia3_source_id']
    for prepared in (prepared_most_probable, prepared_second_prob, prepared_last_prob)
])

# keep every gaia ID once and wrap the result in a single-column dataframe
unique_gaia3_ids = all_gaia3_ids.unique()
unique_gaia3_ids_df = pd.DataFrame({'gaia3_source_id': unique_gaia3_ids})

# export
unique_gaia3_ids_df.to_csv('../../data/v3/unique_gaia3_ids.csv', index=False)

In [None]:
# unique gaia IDs over the whole nway table (not only the selected matches)
# NOTE(review): this rebinds unique_gaia3_ids / unique_gaia3_ids_df from the
# previous cell, and writes under '../out_data' rather than '../../out_data/v3'
# used by the other exports — confirm both are intended.
unique_gaia3_ids = df_nway_all['GAIA3_source_id'].unique()

# dataframe with a single 'gaia3_source_id' column
unique_gaia3_ids_df = pd.DataFrame({'gaia3_source_id': unique_gaia3_ids})

# export
unique_gaia3_ids_df.to_csv('../out_data/unique_gaia3_ids_all.csv', index=False)

In [15]:
# display the nway table; the recorded output shows only the first and last
# rows and columns, with '...' standing in for the rest
df_nway_all

Unnamed: 0,ENS,CSC21_CSCID,CSC21_RA,CSC21_Dec,CSC21_Errmaj,CSC21_Errmin,CSC21_ErrPA,GAIA3_source_id,GAIA3_ra,GAIA3_dec,...,dist_bayesfactor,dist_post,p_single,p_any,p_i,match_flag,EPOS1,EPOS2,EPOS,SEP_EPOS
0,ens0605500_001,2CXO J201627.1-071025,304.113190,-7.173878,0.277780,0.151145,125.300003,4216298682145136000,304.115139,-7.175327,...,-138.031204,0.000000e+00,0.000000e+00,0.967833,0.000000e+00,0,0.204903,0.000293,0.204903,42.459329
1,ens0605500_001,2CXO J201627.1-071025,304.113190,-7.173878,0.277780,0.151145,125.300003,4216298686436921088,304.112147,-7.176046,...,-456.438293,0.000000e+00,0.000000e+00,0.967833,0.000000e+00,0,0.204903,0.000296,0.204903,42.202300
2,ens0605500_001,2CXO J201627.1-071025,304.113190,-7.173878,0.277780,0.151145,125.300003,4216298686440345088,304.113227,-7.173744,...,10.902540,9.678332e-01,9.678332e-01,0.967833,1.000000e+00,1,0.204903,0.000016,0.204903,2.435736
3,ens0605500_001,2CXO J201627.1-071025,304.113190,-7.173878,0.277780,0.151145,125.300003,4216310437470181248,304.110923,-7.171276,...,-304.620697,0.000000e+00,0.000000e+00,0.967833,0.000000e+00,0,0.204903,0.002944,0.204924,60.407944
4,ens0605500_001,2CXO J201633.9-071145,304.141296,-7.195972,0.118465,0.118465,0.000000,4216298411563165440,304.141333,-7.195977,...,12.596580,9.993281e-01,9.993281e-01,0.999328,1.000000e+00,1,0.118465,0.002207,0.118486,1.099742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2402335,ens0000100_002,2CXO J175954.8-273333,269.978699,-27.559330,1.870930,1.870930,0.000000,4062801400103154944,269.978350,-27.558183,...,9.255383,3.594411e-02,3.594411e-02,0.137177,2.345124e-01,2,1.870930,0.000376,1.870930,2.286497
2402336,ens0000100_002,2CXO J175954.8-273333,269.978699,-27.559330,1.870930,1.870930,0.000000,4062801400103156096,269.974710,-27.559871,...,0.140434,2.861378e-11,2.861378e-11,0.137177,1.799765e-10,0,1.870930,0.000200,1.870930,6.882566
2402337,ens0000100_002,2CXO J175954.8-273333,269.978699,-27.559330,1.870930,1.870930,0.000000,4062801400103156608,269.979370,-27.555323,...,-2.813277,3.183203e-14,3.183203e-14,0.137177,2.002187e-13,0,1.870930,0.000169,1.870930,7.795647
2402338,ens0000100_002,2CXO J175954.8-273333,269.978699,-27.559330,1.870930,1.870930,0.000000,4062801400103157760,269.975990,-27.557002,...,1.419599,5.441776e-10,5.441776e-10,0.137177,3.422796e-09,0,1.870930,0.002001,1.870931,6.436801


In [17]:
# read gaia additional properties
# The gzipped VOTable is read with the project helper from src.data; the result
# is later joined to the match tables on 'gaia3_source_id'.

gaia_add_props = read_gzipped_votable_to_dataframe('../../data/v3/additional_gaia_properties-result.vot.gz')

In [31]:
# change type of id: cast 'gaia3_source_id' to str in every table that takes
# part in the gaia-properties merge, so the join keys share one dtype
# NOTE(review): if any of these columns is float (e.g. because of missing
# values), astype(str) yields float-formatted ids — verify the dtypes.
for id_table in (
    p_last_prob_with_class,
    p_second_most_probable_with_class,
    p_most_probable_with_class,
    gaia_add_props,
):
    id_table['gaia3_source_id'] = id_table['gaia3_source_id'].astype(str)

In [37]:
# format names: trim surrounding whitespace from the string ids in every table
for id_table in (
    p_last_prob_with_class,
    p_second_most_probable_with_class,
    p_most_probable_with_class,
    gaia_add_props,
):
    id_table['gaia3_source_id'] = id_table['gaia3_source_id'].str.strip()

In [38]:
# Gaia columns to attach to the match tables; 'gaia3_source_id' is the join key
additional_columns_gaia = [
    'gaia3_source_id',
    'parallax_error',
    'pmra',
    'pmra_error',
    'pmdec',
    'pmdec_error',
    'phot_g_mean_flux',
    'phot_g_mean_flux_error',
    'phot_bp_mean_flux',
    'phot_bp_mean_flux_error',
    'phot_rp_mean_flux',
    'phot_rp_mean_flux_error',
    'radial_velocity',
    'radial_velocity_error',
    'vbroad',
    'vbroad_error',
    'phot_variable_flag',
    'classprob_dsc_combmod_quasar',
    'classprob_dsc_combmod_galaxy',
    'classprob_dsc_combmod_star',
    'distance_gspphot',
    'distance_gspphot_lower',
    'distance_gspphot_upper',
]

gaia_selected_props = gaia_add_props.loc[:, additional_columns_gaia]

# left-join the selected Gaia columns onto each classified match table, so
# every row of the match table is kept
prepared_most_probable_gaiaprops = p_most_probable_with_class.merge(
    gaia_selected_props, on='gaia3_source_id', how='left'
)
prepared_second_prob_gaiaprops = p_second_most_probable_with_class.merge(
    gaia_selected_props, on='gaia3_source_id', how='left'
)
prepared_last_prob_gaiaprops = p_last_prob_with_class.merge(
    gaia_selected_props, on='gaia3_source_id', how='left'
)


In [51]:
# save the merged datasets with the new gaia properties (row index omitted)
for merged_table, csv_path in (
    (prepared_most_probable_gaiaprops, '../../out_data/v3/most_prob_class_gaia_props.csv'),
    (prepared_second_prob_gaiaprops, '../../out_data/v3/second_most_prob_class_gaia_props.csv'),
    (prepared_last_prob_gaiaprops, '../../out_data/v3/last_prob_class_gaia_props.csv'),
):
    merged_table.to_csv(csv_path, index=False)

In [67]:
# all stack properties for each master source table
# Read with the project helper from src.data; the following cells group this
# table by 'name' and use its 'theta_mean' and 'flux_significance_*' columns.

all_stack_df = read_votable_to_dataframe('../../data/v3/all_stacks.vot')

In [None]:
# find the rows with the minimum theta_mean for each name
# step 1: per name, the index label of the row holding the smallest theta_mean
min_theta_idx = all_stack_df.groupby('name')['theta_mean'].idxmin()
# step 2: pull those rows out of the stack table
min_theta_rows = all_stack_df.loc[min_theta_idx]

In [75]:
# Reindex min_theta_rows to match df_csc_all: key the rows by 'name', reorder
# them to follow df_csc_all['name'], then return to a positional index
min_theta_by_name = min_theta_rows.set_index('name')
min_theta_rows_reindexed = min_theta_by_name.reindex(df_csc_all['name']).reset_index()

# Compare the theta_mean in reindexed min_theta_rows with min_theta_mean in df_csc_all
recomputed_theta = min_theta_rows_reindexed['theta_mean']
comparison_result = recomputed_theta.equals(df_csc_all['min_theta_mean'])

# Display the comparison result
print('Comparison Result:', comparison_result)

Comparison Result: True


In [108]:
# Create a new column for the maximum flux significance across all bands
flux_significance_cols = [
    'flux_significance_b', 'flux_significance_u', 'flux_significance_s',
    'flux_significance_m', 'flux_significance_h', 'flux_significance_w',
]
# row-wise maximum over the six per-band significance columns
all_stack_df['max_flux_significance'] = all_stack_df[flux_significance_cols].max(axis=1)

#idx_max_flux = all_stack_df.groupby('name')['max_flux_significance'].idxmax()

# Function to get the row with max flux significance or the only row if single
def get_max_flux_row(group):
    """Return the row of ``group`` with the largest 'max_flux_significance'.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one group (here: one 'name'). Groups with more than
        one row must contain a 'max_flux_significance' column.

    Returns
    -------
    pandas.Series
        The row holding the highest 'max_flux_significance'. If the group has
        a single row, or none of its rows has a non-missing significance, the
        first row is returned instead.
    """
    if len(group) == 1:
        return group.iloc[0]

    significance = group['max_flux_significance']
    # With every value missing, idxmax() has no valid label to return (it
    # gives NaN or raises, depending on the pandas version), and the .loc
    # lookup below would fail. Fall back to the first row, mirroring the
    # single-row case.
    if significance.isna().all():
        return group.iloc[0]

    return group.loc[significance.idxmax()]

# Group by 'name' and apply the function to get the desired rows
stacks_by_name = all_stack_df.groupby('name')
max_flux_rows = stacks_by_name.apply(get_max_flux_row).reset_index(drop=True)

# Finding: the CSC main table takes its significance property from the maximum
# flux significance in each band, considering all associated stack observations
# (not only the uniquely associated ones).