In [72]:
import sys
import os
sys.path.append("../../") 

import pandas as pd
from astropy.table import Table
from src.data import (
    read_votable_to_dataframe,
    convert_byte_columns_to_str,
    filter_multiple_matches,
    get_most_probable_matches,
    get_second_most_probable_matches,
    get_last_probable_matches,
    prepare_final_dataset,
    include_classifications
    )

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
# Load Dong-Woo's NWAY crossmatch table (CSC 2.1 x Gaia DR3) from FITS.
# The astropy table is passed through convert_byte_columns_to_str
# (presumably decoding FITS byte-string columns -- confirm in src.data)
# before being converted to a pandas DataFrame.
t_nway = convert_byte_columns_to_str(
    Table.read('../../data/v3/nway_CSC21_GAIA3.fits', format='fits')
)
df_nway_all = t_nway.to_pandas()

In [35]:
# Prerequisite: the full CSC 2.1 catalogue (with source properties) must first be
# downloaded from CSCView and saved as the VOTable read below.
df_csc_all = read_votable_to_dataframe(
    '../../data/v3/csc_all.vot'
)

In [38]:
# Restrict the NWAY crossmatches to sources that have more than one possible
# match, keyed on the CSC21_CSCID column.
df_filtered = filter_multiple_matches(
    df_nway_all,
    'CSC21_CSCID',
)

In [42]:
# Table of the most probable match per CSC21_CSCID, ranked by the p_i column.
df_most_probable = get_most_probable_matches(
    df_filtered,
    'CSC21_CSCID',
    'p_i',
)

In [47]:
# Table of the second most probable match per CSC21_CSCID (same key and p_i
# columns as the most-probable selection).
df_second_most_probable = get_second_most_probable_matches(
    df_filtered,
    'CSC21_CSCID',
    'p_i',
)

In [53]:
# Table of the last (presumably lowest-p_i -- confirm in src.data) match per
# CSC21_CSCID.
df_last_probable = get_last_probable_matches(
    df_filtered,
    'CSC21_CSCID',
    'p_i',
)

In [106]:
# Build the final dataset for each match ranking by passing the ranked match
# table together with the full CSC table to prepare_final_dataset.
# Calls run in the order: most probable, second most probable, last probable.
prepared_most_probable, prepared_second_prob, prepared_last_prob = (
    prepare_final_dataset(df_matches, df_csc_all)
    for df_matches in (
        df_most_probable,
        df_second_most_probable,
        df_last_probable,
    )
)

In [70]:
# Read the three external classification tables (CSV) that are later joined
# onto the prepared match tables.
# NOTE(review): the saved cell output echoes the yangetal_gcs read line, which
# looks like a pandas warning for that file -- confirm and address if needed.
yangetal_gcs, yangetal_training, perezdiazetal_class = map(
    pd.read_csv,
    (
        '../../data/v3/yangetal_gcs.csv',
        '../../data/v3/yangetal_training.csv',
        '../../data/v3/uniquely_classified.csv',
    ),
)

  yangetal_gcs = pd.read_csv('../../data/v3/yangetal_gcs.csv')


In [131]:
# Attach the three classification columns to the most-probable match table.
# (Nothing is written to disk here; the CSV export happens in a later cell.)
# Each spec is: (classification table, its source-name column,
#                class column to pull in, name of the resulting column).
p_most_probable_with_class = prepared_most_probable
for class_table, name_col, class_col, out_col in (
    (yangetal_gcs, 'CSCv2_name', 'Class', 'yangetal_gcs_class'),
    (yangetal_training, 'name', 'Class', 'yangetal_training_class'),
    (perezdiazetal_class, 'name', 'agg_master_class', 'perezdiazetal_class'),
):
    # Each pass feeds the previous result back in, so the columns accumulate.
    p_most_probable_with_class = include_classifications(
        p_most_probable_with_class,
        class_table,
        base_col='csc21_name',
        additional_col=name_col,
        additional_columns=[class_col],
        rename_columns={class_col: out_col},
    )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)


In [133]:
# Attach the three classification columns to the second-most-probable match table.
# (Nothing is written to disk here; the CSV export happens in a later cell.)
# Each spec is: (classification table, its source-name column,
#                class column to pull in, name of the resulting column).
p_second_most_probable_with_class = prepared_second_prob
for class_table, name_col, class_col, out_col in (
    (yangetal_gcs, 'CSCv2_name', 'Class', 'yangetal_gcs_class'),
    (yangetal_training, 'name', 'Class', 'yangetal_training_class'),
    (perezdiazetal_class, 'name', 'agg_master_class', 'perezdiazetal_class'),
):
    # Each pass feeds the previous result back in, so the columns accumulate.
    p_second_most_probable_with_class = include_classifications(
        p_second_most_probable_with_class,
        class_table,
        base_col='csc21_name',
        additional_col=name_col,
        additional_columns=[class_col],
        rename_columns={class_col: out_col},
    )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)


In [134]:
# Attach the three classification columns to the last-probable match table.
# (Nothing is written to disk here; the CSV export happens in a later cell.)
# Each spec is: (classification table, its source-name column,
#                class column to pull in, name of the resulting column).
p_last_prob_with_class = prepared_last_prob
for class_table, name_col, class_col, out_col in (
    (yangetal_gcs, 'CSCv2_name', 'Class', 'yangetal_gcs_class'),
    (yangetal_training, 'name', 'Class', 'yangetal_training_class'),
    (perezdiazetal_class, 'name', 'agg_master_class', 'perezdiazetal_class'),
):
    # Each pass feeds the previous result back in, so the columns accumulate.
    p_last_prob_with_class = include_classifications(
        p_last_prob_with_class,
        class_table,
        base_col='csc21_name',
        additional_col=name_col,
        additional_columns=[class_col],
        rename_columns={class_col: out_col},
    )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)


In [135]:
# Export the three classified match tables as CSV files.
# Create the output directory first: DataFrame.to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs('../../out_data/v3', exist_ok=True)

# index=False keeps the DataFrame index out of the written files.
p_most_probable_with_class.to_csv('../../out_data/v3/most_prob_class.csv', index=False)
p_second_most_probable_with_class.to_csv('../../out_data/v3/second_most_prob_class.csv', index=False)
p_last_prob_with_class.to_csv('../../out_data/v3/last_prob_class.csv', index=False)

# end

In [137]:
# Sanity check: display the column names of the most-probable table, which
# should end with the three appended classification columns.
p_most_probable_with_class.columns

Index(['csc21_name', 'csc21_ra', 'csc21_dec', 'gaia3_source_id', 'gaia3_ra',
       'gaia3_dec', 'phot_g_mean_mag', 'phot_bp_mean_mag', 'phot_rp_mean_mag',
       'bp_rp', 'bp_g', 'g_rp', 'parallax', 'parallax_over_error', 'hard_hs',
       'hard_hm', 'hard_ms', 'var_intra_prob_b', 'var_inter_prob_b',
       'separation', 'dist_bayesfactor', 'dist_post', 'p_single', 'p_any',
       'p_i', 'match_flag', 'yangetal_gcs_class', 'yangetal_training_class',
       'perezdiazetal_class'],
      dtype='object')