In [1]:
import sys
import os
sys.path.append("../../") 

import pandas as pd
from astropy.table import Table
from src.data import (
    read_votable_to_dataframe,
    convert_byte_columns_to_str,
    filter_multiple_matches,
    get_most_probable_matches,
    get_second_most_probable_matches,
    get_last_probable_matches,
    prepare_final_dataset,
    include_classifications,
    read_gzipped_votable_to_dataframe
    )

# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the project helpers imported above are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Load Dong-Woo's nway cross-match table (CSC21 x GAIA3) from FITS.
# The astropy table goes through the project helper convert_byte_columns_to_str
# (presumably decoding bytes columns to str -- confirm in src.data) before
# being converted to a pandas DataFrame.
t_nway = convert_byte_columns_to_str(
    Table.read('../../data/v3/nway_CSC21_GAIA3.fits', format='fits')
)
df_nway_all = t_nway.to_pandas()

In [3]:
# Normalise the source identifiers: underscores become spaces and surrounding
# whitespace is trimmed (presumably so they match the `name` format used by
# the other CSC tables -- confirm).
df_nway_all['CSC21_CSCID'] = (
    df_nway_all['CSC21_CSCID']
    .str.replace('_', ' ')
    .str.strip()
)

In [4]:
# One row per distinct source name present in the nway table.
df_nway_names = pd.DataFrame({'csc21_name': df_nway_all['CSC21_CSCID'].unique()})

In [5]:
# Read the three classification tables, in this order:
# Yang et al. GCS, Yang et al. training set, Perez-Diaz et al. unique classes.
yangetal_gcs, yangetal_training, perezdiazetal_class = map(
    pd.read_csv,
    (
        '../../data/v3/yangetal_gcs.csv',
        '../../data/v3/yangetal_training.csv',
        '../../data/v3/uniquely_classified.csv',
    ),
)

  yangetal_gcs = pd.read_csv('../../data/v3/yangetal_gcs.csv')


In [6]:
# Attach the class label from each classification table to the nway source
# names, one table at a time (nothing is exported in this cell).
# Each entry: (classification table, its name column, its class column,
#              name of the resulting class column).
df_nway_all_class = df_nway_names
for class_table, match_col, source_col, target_col in (
    (yangetal_gcs, 'CSCv2_name', 'Class', 'yangetal_gcs_class'),
    (yangetal_training, 'name', 'Class', 'yangetal_training_class'),
    (perezdiazetal_class, 'name', 'agg_master_class', 'perezdiazetal_class'),
):
    # Each pass feeds the previous result back in, so the class columns accumulate.
    df_nway_all_class = include_classifications(
        df_nway_all_class,
        class_table,
        base_col='csc21_name',
        additional_col=match_col,
        additional_columns=[source_col],
        rename_columns={source_col: target_col},
    )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)


In [7]:
# Load the whole CSC21 table with properties from a local VOTable file
# (per the original note, the file is downloaded from CSCView beforehand;
# this cell only reads it).
df_csc_all = read_votable_to_dataframe('../../data/v3/csc_all_1.vot')

In [8]:
# Names of CSC sources that do NOT appear in the nway table, kept as a
# single-column DataFrame. Rows and column are selected in one .loc call.
df_csc_nway_complement = df_csc_all.loc[
    ~df_csc_all['name'].isin(df_nway_names['csc21_name']),
    ['name'],
]

In [9]:
# For the sources that are NOT in Dong-Woo's nway table, look up their
# classifications in each classification table, one table at a time.
# Each entry: (classification table, its name column, its class column,
#              name of the resulting class column).
df_cscs_nway_comp_class = df_csc_nway_complement
for class_table, match_col, source_col, target_col in (
    (yangetal_gcs, 'CSCv2_name', 'Class', 'yangetal_gcs_class'),
    (yangetal_training, 'name', 'Class', 'yangetal_training_class'),
    (perezdiazetal_class, 'name', 'agg_master_class', 'perezdiazetal_class'),
):
    # Each pass feeds the previous result back in, so the class columns accumulate.
    df_cscs_nway_comp_class = include_classifications(
        df_cscs_nway_comp_class,
        class_table,
        base_col='name',
        additional_col=match_col,
        additional_columns=[source_col],
        rename_columns={source_col: target_col},
    )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_additional_selected.rename(columns=rename_columns, inplace=True)


In [10]:
# Perez-Diaz et al. class distribution for the sources missing from the nway
# table; dropna=False also counts the sources without a class (NaN).
df_cscs_nway_comp_class['perezdiazetal_class'].value_counts(dropna=False)

NaN        151775
AGN           668
XB            624
Seyfert       258
YSO           172
Name: perezdiazetal_class, dtype: int64

In [11]:
# Yang et al. GCS class distribution for the sources missing from the nway
# table; dropna=False also counts the sources without a class (NaN).
df_cscs_nway_comp_class['yangetal_gcs_class'].value_counts(dropna=False)

NaN        132593
AGN         16417
NS           2188
LMXB         1813
YSO           424
CV             45
LM-STAR         9
HM-STAR         8
Name: yangetal_gcs_class, dtype: int64

In [12]:
# (rows, columns) of the classification table for the sources missing from the nway table
df_cscs_nway_comp_class.shape

(153497, 4)

In [13]:
# Analyse how many master sources have multiple unique stacks.
count_stacks = read_votable_to_dataframe('../../data/v3/count_stacks_per_master_source.vot')

# Keep the rows whose `count_stacks` column exceeds 1; the printed value is
# the (rows, columns) shape of that subset.
multi_stack_sources = count_stacks[count_stacks['count_stacks'] > 1]
print('Number of sources with multiple stacks: ', multi_stack_sources.shape)

Number of sources with multiple stacks:  (49827, 2)


In [14]:
# Display the per-source stack counts (columns: name, count_stacks).
count_stacks

Unnamed: 0,name,count_stacks
0,2CXO J000000.0+004331,1
1,2CXO J000000.1+623124,1
2,2CXO J000000.2-501250,1
3,2CXO J000000.3+321702,1
4,2CXO J000000.5+321232,1
...,...,...
407801,2CXO J010236.6-720819,60
407802,2CXO J010325.2-720643,64
407803,2CXO J010206.5-714536,74
407804,2CXO J220840.7+454432,102


---

In [None]:
# count the training and test class proportions
from collections import Counter

def class_proportions(y):
    """Return the fraction of samples that carry each class label.

    Parameters
    ----------
    y : iterable of hashable
        Class labels, one per sample. Any iterable is accepted, including
        one-shot iterators and generators that do not support ``len()``.

    Returns
    -------
    dict
        Maps each distinct label to ``count / total``, in order of first
        appearance. Empty when ``y`` contains no elements.
    """
    counts = Counter(y)
    # Derive the total from the tallies instead of len(y): the value is the
    # same for sized inputs, and it also works for iterables without __len__.
    total = sum(counts.values())
    return {label: count / total for label, count in counts.items()}

# Class proportions of the training and test label sets.
# NOTE(review): Y_train and Y_test are not defined in the visible cells --
# confirm they are created earlier in the notebook.
train_proportions, test_proportions = map(class_proportions, (Y_train, Y_test))

print("Training proportions:", train_proportions)
print("Test proportions:", test_proportions)

In [None]:
# Percentage of NaN entries for each feature of X (axis 0 is treated as the
# sample axis).
# NOTE(review): np, X and feature_names are not defined in the visible cells --
# confirm they are created earlier in the notebook.
num_samples = X.shape[0]
nan_count_per_feature = np.isnan(X).sum(axis=0)

# Turn the per-feature NaN counts into percentages of the sample count.
nan_percentage_per_feature = nan_count_per_feature / num_samples * 100

for feature_idx, nan_pct in enumerate(nan_percentage_per_feature):
    print(f"Percentage of NaN values in feature {feature_names[feature_idx]}: {nan_pct:.2f}%")