In [86]:
import itertools
import math
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from astropy.io.votable import parse
from pylab import cm
from statsmodels.stats import inter_rater as irr

In [2]:
# Global plot styling: font family, base font size, and axes border width.
# (plt.rcParams and mpl.rcParams are the same object, so one update suffices.)
mpl.rcParams.update({
    'font.family': 'Avenir LT Std',
    'font.size': 14,
    'axes.linewidth': 2,
})

In [3]:
def votable_to_pandas(votable_file):
    '''
    Read a VOTable file and return its first table as a pandas DataFrame.

    The file is parsed, its first table is converted to a table object with
    ``use_names_over_ids=True``, and that table is converted with
    ``to_pandas()``.

    Parameters
    ----------
    votable_file
        Passed unchanged to ``parse``.

    Returns
    -------
    The result of ``to_pandas()`` on the converted first table.
    '''
    first_table = parse(votable_file).get_first_table()
    converted = first_table.to_table(use_names_over_ids=True)
    return converted.to_pandas()

In [4]:
# Load the VOTable at this relative path into a pandas DataFrame using
# votable_to_pandas. The path is relative to the notebook's working directory.
data = votable_to_pandas("../../data/cscresults.vot")

In [5]:
# Read the six CSV tables cl0.csv ... cl5.csv from ./class_data, using the
# first column of each file as the index.
cl_0, cl_1, cl_2, cl_3, cl_4, cl_5 = (
    pd.read_csv(f'./class_data/cl{table_id}.csv', index_col=0)
    for table_id in range(6)
)

In [6]:
# Stack the loaded tables row-wise, then attach the ra/dec columns of `data`
# by matching on (name, obsid). The left join keeps every stacked row, even
# when `data` has no matching entry.
# NOTE(review): cl_3 is loaded but is not part of this concat; confirm the
# omission is intentional.
cl_all = pd.concat([cl_0, cl_1, cl_2, cl_4, cl_5])
coord_columns = data[['name', 'obsid', 'ra', 'dec']]
cl_all_with_coords = cl_all.merge(
    coord_columns,
    how='left',
    on=['name', 'obsid'],
)

In [7]:
# Display the column labels of the merged table. The cells below slice the
# columns that sit between 'main_type' and the trailing 'ra'/'dec' pair.
cl_all_with_coords.columns

Index(['name', 'obsid', 'hard_hm', 'hard_hs', 'hard_ms', 'powlaw_gamma',
       'bb_kt', 'var_prob_b', 'var_ratio_b', 'var_prob_h', 'var_ratio_h',
       'var_prob_s', 'var_ratio_s', 'var_newq_b', 'main_type', 'Orion_V*',
       'PartofG', 'QSO', 'TTau*', 'YSO', 'AGN', 'HMXB', 'Candidate_XB*',
       'Seyfert_1', 'Seyfert_2', 'ra', 'dec'],
      dtype='object')

In [8]:
# Report how many distinct source names the merged table contains.
n_unique_sources = len(np.unique(cl_all_with_coords['name']))
print(f'There are {n_unique_sources} unique classified sources...')

There are 8159 unique classified sources...


In [9]:
# Number of rows per source name, restricted to names that occur more than once.
rows_per_source = cl_all_with_coords['name'].value_counts()
rows_per_source.reset_index(name="count").query("count > 1")

Unnamed: 0,index,count
0,2CXO J004231.1+411621,97
1,2CXO J004248.5+411521,93
2,2CXO J004254.9+411603,84
3,2CXO J004232.0+411314,80
4,2CXO J004213.1+411836,71
...,...,...
1833,2CXO J203341.8-472134,2
1834,2CXO J123513.0+620805,2
1835,2CXO J171220.8-382930,2
1836,2CXO J214402.3+193640,2


#### Validation

In [83]:
# Per-source summary of the class columns: mean and std over all rows of a
# source, the number of rows, and the class with the highest mean.

# The class columns are the ones located between 'main_type' and 'ra'.
class_start = cl_all_with_coords.columns.get_loc('main_type') + 1
class_stop = cl_all_with_coords.columns.get_loc('ra')
classes_names = cl_all_with_coords.columns[class_start:class_stop]

# A missing class value counts as 0 for the statistics. The fill is applied to
# a copy of the class columns only: filling the shared frame in place would
# also overwrite missing ra/dec/main_type values with 0 and remove the NaNs
# that plots_per_detection filters out, making results depend on cell order.
class_probas = cl_all_with_coords[classes_names].fillna(0)

summ_table = class_probas.groupby(cl_all_with_coords['name']).agg(['mean', 'std'])
summ_table['count'] = cl_all_with_coords.groupby('name').size()

# Column keys (class, 'mean') select the per-class means; idxmax returns the
# key of the largest mean in each row, whose first element is the class name.
class_mean_names = [(class_name, 'mean') for class_name in classes_names]
names_comp = summ_table[class_mean_names].idxmax(axis=1).to_list()
master_names = [name[0] for name in names_comp]
summ_table['master_names'] = master_names

In [22]:
# Pick one row at random and collect every row that shares its source name.
# The fixed random_state makes the pick, and every cell output derived from
# it, reproducible under Restart & Run All.
# To inspect a specific source instead, assign its name to `src` directly.
src = cl_all_with_coords.sample(1, random_state=0)['name'].iloc[0]
tgt_obs = cl_all_with_coords[cl_all_with_coords.name == src]

In [105]:
# Display the name of the selected source.
src

'2CXO J132528.3-430103'

In [152]:
# Single-row array (shape 1 x number of rows in tgt_obs) holding the
# main_type value of each row of the selected source.
dumm = tgt_obs[['main_type']].to_numpy().T
dumm

array([['YSO', 'YSO', 'Orion_V*', 'YSO', 'Orion_V*', 'YSO', 'YSO',
        'HMXB', 'HMXB', 'HMXB', 'HMXB', 'HMXB', 'HMXB', 'HMXB', 'HMXB',
        'HMXB', 'Candidate_XB*', 'HMXB', 'HMXB', 'HMXB', 'HMXB', 'HMXB',
        'HMXB', 'HMXB', 'HMXB', 'HMXB', 'HMXB', 'HMXB', 'HMXB', 'HMXB',
        'HMXB', 'HMXB', 'HMXB', 'HMXB', 'Candidate_XB*', 'HMXB', 'HMXB',
        'HMXB', 'HMXB', 'HMXB', 'HMXB', 'HMXB', 'HMXB', 'YSO',
        'Orion_V*', 'YSO', 'Orion_V*', 'Orion_V*', 'YSO', 'YSO', 'YSO',
        'Orion_V*', 'Orion_V*', 'QSO', 'Orion_V*', 'Orion_V*',
        'Orion_V*', 'YSO', 'YSO', 'YSO', 'YSO', 'Orion_V*', 'Orion_V*',
        'YSO', 'YSO', 'YSO', 'YSO', 'YSO', 'YSO', 'YSO', 'YSO']],
      dtype=object)

In [153]:
# Mean and standard deviation of every class column over the rows of the
# selected source, with missing values counted as 0.
first_class = tgt_obs.columns.get_loc('main_type') + 1
tgt_class_probas = tgt_obs.iloc[:, first_class:-2].fillna(0)
tgt_class_probas.agg(["mean", "std"])

Unnamed: 0,Orion_V*,PartofG,QSO,TTau*,YSO,AGN,HMXB,Candidate_XB*,Seyfert_1,Seyfert_2
mean,0.182072,0.000268,0.032304,0.041132,0.194534,0.056741,0.321112,0.162627,0.001809,0.007401
std,0.200769,0.001065,0.043154,0.073531,0.208976,0.066304,0.244883,0.169806,0.005531,0.017272


In [22]:
# Pass the main_type labels of the selected source to aggregate_raters as a
# single row. The result is a 2-tuple: agg[0] is a 2-D integer array and
# agg[1] an array of labels. The cells below take agg[0].argmax() as an index
# into agg[1].
# NOTE(review): presumably agg[0] holds the number of occurrences of each
# label in agg[1] order; confirm against the statsmodels documentation.
agg = irr.aggregate_raters(dumm.reshape(1, -1)) 
agg

(array([[1, 1]]), array(['AGN', 'Orion_V*'], dtype=object))

In [59]:
# Values, for every row of the selected source, of the column named after the
# label at position agg[0].argmax() in agg[1]; missing values become 0.
top_label = agg[1][agg[0].argmax()]
probas_max = tgt_obs[top_label].fillna(0).to_numpy()

In [60]:
# Largest class-column value in each row of the selected source.
first_class = tgt_obs.columns.get_loc('main_type') + 1
tgt_class_columns = tgt_obs.iloc[:, first_class:-2]
probas_total = tgt_class_columns.max(axis=1).to_numpy()

In [61]:
# Ratio of the summed top-label values to the summed per-row maxima.
# NOTE(review): the printed value is a plain ratio (not multiplied by 100)
# even though the message says "Percentage".
agreement_ratio = probas_max.sum() / probas_total.sum()
print(f'Percentage agreement for {agg[1][agg[0].argmax()]}: {agreement_ratio}')

Percentage agreement for HMXB: 0.6412723770184106


In [62]:
# Display the per-row values of the top-label column for the selected source.
probas_max

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.46809887, 0.52754473, 0.53956492,
       0.6379339 , 0.5228447 , 0.53030967, 0.55860279, 0.44780936,
       0.44972682, 0.27493971, 0.62065196, 0.60588266, 0.53813163,
       0.47844666, 0.66690838, 0.65515185, 0.68996855, 0.45060537,
       0.41995281, 0.61332726, 0.50471443, 0.65218447, 0.53617372,
       0.55924847, 0.50915305, 0.7209123 , 0.57745551, 0.21360719,
       0.47519842, 0.5222971 , 0.49770597, 0.5645759 , 0.56745597,
       0.68845456, 0.59200137, 0.64771991, 0.0848039 , 0.1254849 ,
       0.13214607, 0.04482221, 0.01343461, 0.14897557, 0.15224248,
       0.05972696, 0.06075616, 0.01518061, 0.23617327, 0.02956854,
       0.06254534, 0.06772062, 0.07980839, 0.06746078, 0.19306795,
       0.10014492, 0.07793334, 0.13762346, 0.22920104, 0.18613604,
       0.06727513, 0.19603791, 0.33486263, 0.13581414, 0.12993032,
       0.1048156 ])

In [178]:
# Agreement computed from the label counts: (largest count - smallest count)
# as fractions of the total. A source with a single label is reported as 1.
label_counts, labels = agg
majority_label = labels[label_counts.argmax()]
if len(labels) >= 2:
    count_agreement = label_counts.max()/label_counts.sum() - label_counts.min()/label_counts.sum()
else:
    count_agreement = 1
print(f'Percentage agreement for {majority_label}: {count_agreement}')

Percentage agreement for HMXB: 0.6625000000000001


#### Plots

In [None]:
def plots_per_detection(query, multi=True):
    '''
    Save one PDF per selected source, with one panel per row of that source.

    Sources are selected by applying `query` to a table with the columns
    'index' (source name) and 'count' (number of rows with that name in
    cl_all_with_coords). For every selected source a grid with three panels
    per row is drawn. Each panel plots the non-missing class columns (those
    between 'main_type' and 'ra') of one row and is titled with the row's
    name, obsid and main_type. The figure is written to
    plots_per_detection/<source name>.pdf and then closed, so it is not
    shown inline.

    Parameters
    ----------
    query : str
        DataFrame.query expression over 'index' and 'count', e.g.
        "count > 1" or "index == '<source name>'".
    multi : bool, optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    None
    '''
    out_dir = 'plots_per_detection'
    # savefig does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)

    # pyplot.get_cmap(name, lut) is used because matplotlib.cm.get_cmap is no
    # longer available in recent matplotlib releases.
    colors = plt.get_cmap('tab10', 4)

    # rename_axis pins the source-name column to 'index' regardless of the
    # pandas version (pandas >= 2 would otherwise label it 'name').
    target_sources = (
        cl_all_with_coords['name']
        .value_counts()
        .rename_axis('index')
        .reset_index(name='count')
        .query(query)
    )

    first_class = cl_all_with_coords.columns.get_loc('main_type') + 1
    last_class = cl_all_with_coords.columns.get_loc('ra')
    ncol = 3

    for src in target_sources['index'].to_list():
        tgt_obs = cl_all_with_coords[cl_all_with_coords.name == src]
        n_obs = tgt_obs.shape[0]

        nrow = math.ceil(n_obs / ncol)
        # squeeze=False always yields a 2-D array of axes.
        fig, axs = plt.subplots(nrows=nrow, ncols=ncol, figsize=(16, 3*nrow), squeeze=False)
        fig.tight_layout(h_pad=8, w_pad=2)

        for i, ax in enumerate(axs.reshape(-1)):
            # Hide the unused panels of the last grid row.
            if i >= n_obs:
                ax.set_axis_off()
                continue

            ax.yaxis.set_tick_params(which='major', size=6, width=0.5, direction='in')
            ax.yaxis.set_tick_params(which='minor', size=3, width=0.5, direction='in')
            ax.xaxis.set_tick_params(which='major', size=6, width=0.5, direction='out')
            ax.xaxis.set_tick_params(which='minor', size=3, width=0.5, direction='out')
            ax.yaxis.set_major_locator(mpl.ticker.MultipleLocator(0.25))
            ax.yaxis.set_minor_locator(mpl.ticker.MultipleLocator(0.125))

            row = tgt_obs.iloc[i]
            # Only classes with a value are plotted; missing ones are skipped.
            probas = row.iloc[first_class:last_class].astype(float).dropna()
            ax.plot(probas.index.to_list(), probas.to_numpy(),
                    marker='o', color=colors(0), lw=4, ms=10, label='CSC')

            # Full 0-1 range only when some class exceeds 0.5.
            ax.set_ylim(0, 1 if probas.max() > 0.5 else 0.5)

            ax.set_title(row['name']+', '+str(row['obsid']) + ', ' + str(row['main_type']), fontsize=12)

            plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)

        print('Printing detections for source {}...'.format(src))
        fig.savefig(os.path.join(out_dir, '{}.pdf'.format(src)), dpi=300, transparent=False, bbox_inches='tight')
        # Release the figure; otherwise one figure per source stays open.
        plt.close(fig)

In [None]:
# Produce the per-detection figure for a single source selected by name.
# The trailing semicolon suppresses the cell's output repr.
plots_per_detection("index == '2CXO J033827.6-352648'");