# Pipeline for High-z Radio Galaxies 12: Create lists (files) with radio AGN candidates

## Introduction

In this file, three models will be applied consecutively in order to predict  
the detection of Radio Galaxies (radio AGN) and their redshift.  

In principle, this pipeline should be applied to data in Stripe 82. But  
it can be used with any other suitable dataset.

In [1]:
%matplotlib inline
# Static plots
#%matplotlib ipympl
# Interactive plots
import numpy as np
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib import ticker
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patheffects as mpe
import matplotlib.patches as mpatches
from matplotlib.ticker import ScalarFormatter
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from astropy.visualization import LogStretch, PowerStretch
from astropy.visualization.mpl_normalize import ImageNormalize
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import ConfusionMatrixDisplay
import sklearn.pipeline
import colorcet as cc
from pycaret import classification as pyc
from pycaret import regression as pyr
from joblib import dump, load
import pandas as pd
import mpl_scatter_density
import global_variables as gv
import global_functions as gf

In [2]:
# mpl.rcdefaults()

#### Methods to predict values using individual models

Predict AGN/Galaxy classification

In [3]:
def predict_AGN_gal(catalog_df, 
                    AGN_gal_model, 
                    cal_AGN_gal_model, 
                    threshold, 
                    cal_threshold, 
                    raw_score=True):
    catalog_df = pyc.predict_model(AGN_gal_model, 
                                   data=catalog_df, 
                                   probability_threshold=threshold, 
                                   raw_score=raw_score, 
                                   round=10)
    catalog_df = catalog_df.drop(columns=['Score_0'])
    catalog_df = catalog_df.rename(columns={'Label': 'pred_class', 'Score_1': 'Score_AGN'})
    catalog_df.loc[:, 'Score_AGN'] = np.around(catalog_df.loc[:, 'Score_AGN'], decimals=8)
    pred_probs = cal_AGN_gal_model.predict(catalog_df.loc[:, 'Score_AGN'])
    cal_class  = np.array(pred_probs >= cal_threshold).astype(int)
    catalog_df['Prob_AGN']       = pred_probs
    catalog_df['pred_class_cal'] = cal_class
    return catalog_df

Predict radio detection for AGN

In [4]:
def predict_radio_det(catalog_df, 
                      radio_model, 
                      cal_radio_model, 
                      threshold, 
                      cal_threshold, 
                      raw_score=True):
    catalog_df = pyc.predict_model(radio_model, 
                                   data=catalog_df, 
                                   probability_threshold=threshold, 
                                   raw_score=raw_score, 
                                   round=10)
    catalog_df = catalog_df.drop(columns=['Score_0'])
    catalog_df = catalog_df.rename(columns={'Label': 'pred_radio', 'Score_1': 'Score_radio'})
    catalog_df.loc[:, 'Score_radio'] = np.around(catalog_df.loc[:, 'Score_radio'], decimals=8)
    pred_probs = cal_radio_model.predict(catalog_df.loc[:, 'Score_radio'])
    cal_class  = np.array(pred_probs >= cal_threshold).astype(int)
    catalog_df['Prob_radio']     = pred_probs
    catalog_df['pred_radio_cal'] = cal_class
    return catalog_df

Predict redshift for radio-detected AGN

In [5]:
def predict_z(catalog_df, 
              redshift_model):
    catalog_df = pyr.predict_model(redshift_model, 
                                   data=catalog_df, 
                                   round=10)
    catalog_df = catalog_df.rename(columns={'Label': 'pred_Z'})
    catalog_df.loc[:, 'pred_Z'] = np.around(catalog_df.loc[:, 'pred_Z'], decimals=4)
    return catalog_df

---

## Reading data

Flags.

In [6]:
save_plot_flag      = False
save_preds_flag     = False
load_models_flag    = True
predict_only_hi_z   = False
use_zeroth_model    = False
use_second_z_model  = False  # z >= 3.6 (with SMOGN), or, if needed, z >= 4.0

In [7]:
used_area           = 'S82'  # can be 'S82', 'HETDEX', 'COSMOS'

In [8]:
file_name_dict      = {'S82': gv.file_S82, 'HETDEX': gv.file_HETDEX, 'COSMOS': gv.file_COSMOS}
file_name           = file_name_dict[used_area]

In [9]:
feats_2_disc_S82    = ['objID', 'RA_MILLI', 'DEC_MILLI', 'W1mag', 'W2mag', 'num_imputed', 'radio_detect']
feats_2_disc_HETDEX = ['objID', 'RA_MILLI', 'DEC_MILLI', 'W1mag', 'W2mag', 'num_imputed', 'radio_detect']
feats_2_disc_COSMOS = ['objID', 'RA_MILLI', 'DEC_MILLI', 'W1mag', 'W2mag', 'num_imputed', 'radio_detect', ]

feats_2_disc        = {'S82': feats_2_disc_S82, 'HETDEX': feats_2_disc_HETDEX, 'COSMOS': feats_2_disc_COSMOS}
features_2_discard  = feats_2_disc[used_area]

In [10]:
full_catalog_df     = pd.read_hdf(gv.cat_path + file_name, key='df').drop(columns=features_2_discard)

In [11]:
if used_area == 'S82':
    full_catalog_df.loc[:, 'LOFAR_detect'] = full_catalog_df.loc[:, 'VLAS82_detect'].copy()
    full_catalog_df = full_catalog_df.drop(columns=['VLAS82_detect'])
if used_area == 'COSMOS':
    full_catalog_df.loc[:, 'LOFAR_detect'] = full_catalog_df.loc[:, 'COSMOSVLA3_detect'].copy()
    full_catalog_df = full_catalog_df.drop(columns=['COSMOSVLA3_detect'])

Create features with class and combined redshift.

In [12]:
full_catalog_df['class']            = full_catalog_df.loc[:, 'is_AGN'].copy()
filter_non_confirmed                = np.array(full_catalog_df.loc[:, 'is_AGN'] == 1) | np.array(full_catalog_df.loc[:, 'is_gal'] == 1)
full_catalog_df.loc[~filter_non_confirmed, 'class'] = np.nan
idx_non_Z                           = full_catalog_df.loc[:, 'Z'].where(full_catalog_df.loc[:, 'Z'] > 0).isna()
full_catalog_df.loc[idx_non_Z, 'Z'] = full_catalog_df.loc[:, 'Z'].mask(idx_non_Z, full_catalog_df.loc[idx_non_Z, 'zsp'])

Create column for detection as Radio AGN

In [13]:
full_catalog_df['radio_AGN']      = (np.array(full_catalog_df.loc[:, 'is_AGN'] == 1) &\
                                     np.array(full_catalog_df.loc[:, 'LOFAR_detect'] == 1)).astype(int)

Discard minor features.

In [14]:
# full_catalog_df                     = full_catalog_df.drop(columns=['is_AGN', 'is_SDSS_QSO', 'is_SDSS_gal', 'is_gal', 'zsp'])

#### As we want to predict, only use sources that have not previous spectroscopic classification

In [15]:
print(f'The used data set (in {used_area}) has {len(full_catalog_df):,} sources.')
print(f'And {np.sum(filter_non_confirmed):,} have previous spectroscopic classification.')

The used data set (in S82) has 3,590,306 sources.
And 21,828 have previous spectroscopic classification.


In [16]:
# full_catalog_df                     = full_catalog_df.loc[~filter_non_confirmed]

In [17]:
print(f'This pipeline will predict properties for {len(full_catalog_df):,} sources in {used_area}.')

This pipeline will predict properties for 3,590,306 sources in S82.


---

### Load models

In [18]:
if load_models_flag:
    if use_zeroth_model:
        star_clf          = pyc.load_model(gv.models_path + gv.star_model)  # star/no-star model
        cal_star_clf      = load(gv.models_path + gv.cal_str_model)  # calibrated model
    AGN_gal_clf           = pyc.load_model(gv.models_path + gv.AGN_gal_model)  #
    cal_AGN_gal_clf       = load(gv.models_path + gv.cal_AGN_gal_model)  # calibrated model
    radio_det_AGN_clf     = pyc.load_model(gv.models_path + gv.radio_model)  # without predicted AGN
    cal_radio_det_AGN_clf = load(gv.models_path + gv.cal_radio_model)  # calibrated model
    redshift_reg_rAGN     = pyr.load_model(gv.models_path + gv.full_z_model)  # to use on full sample
    if use_second_z_model:
        redshift_reg_2    = pyr.load_model(gv.models_path + gv.high_z_model)  # sources with predicted z >= 3.6

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


Run predictions

In [19]:
full_catalog_df = predict_AGN_gal(full_catalog_df, 
                                  AGN_gal_clf, 
                                  cal_AGN_gal_clf, 
                                  gv.AGN_thresh, 
                                  gv.cal_AGN_thresh)

In [20]:
full_catalog_df = predict_radio_det(full_catalog_df, 
                                    radio_det_AGN_clf, 
                                    cal_radio_det_AGN_clf, 
                                    gv.radio_thresh, 
                                    gv.cal_radio_thresh)

In [21]:
full_catalog_df = full_catalog_df.rename(columns={'Score_radio': 'Score_radio_AGN', 
                                                            'pred_radio': 'pred_radio_AGN',
                                                            'Prob_radio': 'Prob_radio_AGN', 
                                                            'pred_radio_cal': 'pred_radio_cal_AGN'})

In [22]:
full_catalog_df = predict_z(full_catalog_df, redshift_reg_rAGN)

In [23]:
full_catalog_df = full_catalog_df.rename(columns={'pred_Z': 'pred_Z_rAGN'})

In [24]:
np.array(full_catalog_df.columns)

array(['RA_ICRS', 'DE_ICRS', 'Name', 'TYPE', 'Z', 'zsp', 'spCl',
       'band_num', 'Fint_VLAS82_non_imp', 'Fint_VLAS82_AB_non_imp',
       'is_str', 'is_SDSS_QSO', 'is_AGN', 'is_SDSS_gal', 'is_gal',
       'W1mproPM', 'W2mproPM', 'gmag', 'rmag', 'imag', 'zmag', 'ymag',
       'W3mag', 'W4mag', 'Jmag', 'Hmag', 'Kmag', 'Fint_VLAS82_AB',
       'Fint_VLAS82', 'g_r', 'g_i', 'g_z', 'g_y', 'g_J', 'g_H', 'g_K',
       'g_W1', 'g_W2', 'g_W3', 'g_W4', 'r_i', 'r_z', 'r_y', 'r_J', 'r_H',
       'r_K', 'r_W1', 'r_W2', 'r_W3', 'r_W4', 'i_z', 'i_y', 'i_J', 'i_H',
       'i_K', 'i_W1', 'i_W2', 'i_W3', 'i_W4', 'z_y', 'z_J', 'z_H', 'z_K',
       'z_W1', 'z_W2', 'z_W3', 'z_W4', 'y_J', 'y_H', 'y_K', 'y_W1',
       'y_W2', 'y_W3', 'y_W4', 'J_H', 'J_K', 'J_W1', 'J_W2', 'J_W3',
       'J_W4', 'H_K', 'H_W1', 'H_W2', 'H_W3', 'H_W4', 'K_W1', 'K_W2',
       'K_W3', 'K_W4', 'W1_W2', 'W1_W3', 'W1_W4', 'W2_W3', 'W2_W4',
       'W3_W4', 'LOFAR_detect', 'class', 'radio_AGN', 'pred_class',
       'Score_AGN', 'Prob_

In [25]:
# full_catalog_df = pyc.predict_model(AGN_gal_clf, data=full_catalog_df, probability_threshold=gv.AGN_thresh, raw_score=True, round=10)
# full_catalog_df = full_catalog_df.drop(columns=['Score_0'])
# full_catalog_df = full_catalog_df.rename(columns={'Label': 'pred_class', 'Score_1': 'Score_AGN'})
# full_catalog_df['Score_AGN'] = np.around(full_catalog_df.loc[:, 'Score_AGN'], decimals=7)
# pred_probs_AGN  = cal_AGN_gal_clf.predict(full_catalog_df.loc[:, 'Score_AGN'])
# full_catalog_df['Prob_AGN'] = pred_probs_AGN

In [26]:
# full_catalog_df = pyc.predict_model(radio_det_clf, data=full_catalog_df, probability_threshold=gv.radio_thresh, raw_score=True, round=10)
# full_catalog_df = full_catalog_df.drop(columns=['Score_0'])
# full_catalog_df = full_catalog_df.rename(columns={'Label': 'pred_radio', 'Score_1': 'Score_radio'})
# full_catalog_df['Score_radio'] = np.around(full_catalog_df.loc[:, 'Score_radio'], decimals=7)
# pred_probs_rad  = cal_radio_det_clf.predict(full_catalog_df.loc[:, 'Score_radio'])
# full_catalog_df['Prob_radio'] = pred_probs_rad

In [27]:
# full_catalog_df = pyr.predict_model(redshift_reg, data=full_catalog_df, round=8)
# full_catalog_df = full_catalog_df.rename(columns={'Label': 'pred_Z'})
# full_catalog_df['pred_Z'] = np.around(full_catalog_df.loc[:, 'pred_Z'], decimals=3)

In [28]:
# redshift_tol    = 0.0
# if use_second_z_model:
#     full_catalog_df = pyr.predict_model(redshift_reg_2, data=full_catalog_df, round=6)
#     filter_pred_z   = full_catalog_df.loc[:, 'pred_Z'] >= (gv.high_z_limit + redshift_tol)
#     full_catalog_df.loc[:, 'pred_Z'] = full_catalog_df.loc[:, 'pred_Z'].mask(filter_pred_z, full_catalog_df.loc[filter_pred_z, 'Label'])
#     full_catalog_df = full_catalog_df.drop(columns=['Label'])
#     full_catalog_df.loc[:, 'pred_Z'] = np.around(full_catalog_df.loc[:, 'pred_Z'], decimals=3)

In [29]:
# full_catalog_df['pred_prob_class']    = (full_catalog_df.loc[:, 'Prob_AGN']   >= gv.cal_AGN_thresh).astype(int)
# full_catalog_df['pred_prob_radio']    = (full_catalog_df.loc[:, 'Prob_radio'] >= gv.cal_radio_thresh).astype(int)

In [30]:
full_catalog_df['pred_radio_AGN']     = (np.array(full_catalog_df.loc[:, 'pred_class'] == 1) & np.array(full_catalog_df.loc[:, 'pred_radio_AGN'] == 1)).astype(int)
full_catalog_df['Score_rAGN']         = full_catalog_df.loc[:, 'Score_AGN'] * full_catalog_df.loc[:, 'Score_radio_AGN']
full_catalog_df['pred_prob_rAGN']     = (np.array(full_catalog_df.loc[:, 'pred_class_cal'] == 1) & np.array(full_catalog_df.loc[:, 'pred_radio_cal_AGN'] == 1)).astype(int)
full_catalog_df['Prob_rAGN']          = full_catalog_df.loc[:, 'Prob_AGN'] * full_catalog_df.loc[:, 'Prob_radio_AGN']

# rad_score_scaler                      = MinMaxScaler()
# full_catalog_df['scaled_score_radio'] = rad_score_scaler.fit_transform(full_catalog_df.loc[:, 'Score_radio'].values.reshape(-1, 1))
# full_catalog_df['scaled_score_rAGN']  = full_catalog_df.loc[:, 'Score_AGN'] * full_catalog_df.loc[:, 'scaled_score_radio']

In [31]:
full_catalog_df.loc[:, ['class', 'pred_class_cal', 'LOFAR_detect', 'pred_radio_cal_AGN', 'Z', 'pred_Z_rAGN']].describe()

Unnamed: 0,class,pred_class_cal,LOFAR_detect,pred_radio_cal_AGN,Z,pred_Z_rAGN
count,21828.0,3590306.0,3590306.0,3590306.0,23130.0,3590306.0
mean,0.812855,0.5825788,0.002436283,0.1362499,1.305753,1.976987
std,0.390037,0.4931337,0.04929856,0.3430538,0.833161,0.6022475
min,0.0,0.0,0.0,0.0,-0.00768,0.0166
25%,1.0,0.0,0.0,0.0,0.701,1.7775
50%,1.0,1.0,0.0,0.0,1.203,2.1288
75%,1.0,1.0,0.0,0.0,1.827,2.1288
max,1.0,1.0,1.0,1.0,7.01124,4.7394


Obtain intermediate metrics

In [32]:
filter_radio_AGN_t      = np.array(full_catalog_df.loc[:, 'class'] == 1) & np.array(full_catalog_df.loc[:, 'LOFAR_detect'] == 1)
filter_gal_AGN_t        = np.array(full_catalog_df.loc[:, 'class'] == 0) | np.array(full_catalog_df.loc[:, 'class'] == 1)
total_size              = len(full_catalog_df)
filter_AGN_t            = np.array(full_catalog_df.loc[:, 'class'] == 1)
num_AGN_t               = np.sum(filter_AGN_t)
num_gal_t               = np.sum(np.array(full_catalog_df.loc[:, 'class'] == 0))
num_radio_t             = np.sum(np.array(full_catalog_df.loc[:, 'LOFAR_detect'] == 1))
num_radio_AGN_t         = np.sum(filter_radio_AGN_t)

In [33]:
# filter_radio_AGN_p      = np.array(full_catalog_df.loc[:, 'Score_AGN']          >= threshold_AGN) &\
#                             np.array(full_catalog_df.loc[:, 'Score_radio']      >= threshold_radio)
filter_radio_AGN_p      = np.array(full_catalog_df.loc[:, 'pred_class_cal']      == 1) &\
                            np.array(full_catalog_df.loc[:, 'pred_radio_cal_AGN']    == 1)
filt_hiz_rAGN_p         = filter_radio_AGN_p * np.array(full_catalog_df.loc[:, 'pred_Z_rAGN'] >= gv.high_z_limit)
filter_AGN_p            = np.array(full_catalog_df.loc[:, 'pred_class_cal']      == 1)
filter_radio_p          = np.array(full_catalog_df.loc[:, 'pred_radio_cal_AGN']      == 1)
num_AGN_p               = np.sum(filter_AGN_p)
num_gal_p               = np.sum(np.array(full_catalog_df.loc[:, 'pred_class_cal'] == 0))
num_radio_p             = np.sum(filter_radio_p)
num_radio_AGN_p         = np.sum(filter_radio_AGN_p)

### Select sources predicted to be Radio AGN (optional)

In [34]:
# full_catalog_df         = full_catalog_df.loc[filter_radio_AGN_p]

Add individual metrics for redshift

In [35]:
full_catalog_df['Delta_z_N'] = np.around((full_catalog_df.loc[:, 'pred_Z_rAGN'] - full_catalog_df.loc[:, 'Z']) /\
                            (1 + full_catalog_df.loc[:, 'Z']), decimals=3)

full_catalog_df['sigma_NMAD'] = np.around(1.48 * np.abs(full_catalog_df.loc[:, 'pred_Z_rAGN'] - full_catalog_df.loc[:, 'Z']) /\
                            (1 + full_catalog_df.loc[:, 'Z']), decimals=3)

Numerical summary.

In [36]:
str_0_t   = f'Out of {total_size:,} initial sources in {used_area},\n'
str_1_t   = f'{num_gal_t:,} are confirmed to be galaxies. On the other side,\n'
str_2_t   = f'{num_AGN_t:,} are confirmed to be AGN. And, from the AGN,\n'
str_3_t   = f'{num_radio_AGN_t:,} are detected in radio.'

str_0_p   = f'Out of {num_radio_AGN_t:,} initial radio-detected AGN in {used_area},\n'
str_1_p   = f'{num_gal_p:,} are predicted to be galaxies. On the other side,\n'
str_2_p   = f'{num_AGN_p:,} are predicted to be AGN. And, from the predicted AGN,\n'
str_3_p   = f'{num_radio_AGN_p:,} are predicted to be detected in radio.'

In [37]:
print('-' * 60)
print(str_0_t + str_1_t + str_2_t + str_3_t)
print('-' * 60)
print(str_0_p + str_1_p + str_2_p + str_3_p)
print('-' * 60)

------------------------------------------------------------
Out of 3,590,306 initial sources in S82,
4,085 are confirmed to be galaxies. On the other side,
17,743 are confirmed to be AGN. And, from the AGN,
815 are detected in radio.
------------------------------------------------------------
Out of 815 initial radio-detected AGN in S82,
1,498,670 are predicted to be galaxies. On the other side,
2,091,636 are predicted to be AGN. And, from the predicted AGN,
25,854 are predicted to be detected in radio.
------------------------------------------------------------


In [38]:
cols_4_table = show_columns = ['Name', 'RA_ICRS', 'DE_ICRS', 'TYPE', 'band_num', 'class', 'pred_class',
                               'pred_class_cal', 'Score_AGN', 'Prob_AGN', 'LOFAR_detect', 'pred_radio_AGN',
                               'pred_radio_cal_AGN', 'Score_radio_AGN', 'Prob_radio_AGN', 'radio_AGN', 'pred_radio_AGN',
                               'pred_prob_rAGN', 'Score_rAGN', 'Prob_rAGN', 'Z', 'pred_Z_rAGN']

In [39]:
cols_4_export_S82    = ['Fint_VLAS82']
cols_4_export_HETDEX = ['Sint_LOFAR']
cols_4_export_COSMOS = ['Flux_COSMOSVLA3']

cols_4_exp_all       = {'S82': cols_4_export_S82, 'HETDEX': cols_4_export_HETDEX, 'COSMOS': cols_4_export_COSMOS}

cols_photo           = ['W1mproPM', 'W2mproPM', 'gmag', 'rmag', 'imag', 'zmag', 
                        'ymag', 'Jmag', 'Hmag', 'Kmag', 'W3mag', 'W4mag']

cols_4_export        = cols_4_table + cols_4_exp_all[used_area] + cols_photo

In [40]:
filter_new_rAGN = full_catalog_df.loc[:, 'radio_AGN'] == 0
if predict_only_hi_z:
    filter_high_z   = full_catalog_df.loc[:, 'pred_Z_rAGN']    >= gv.high_z_limit
elif not predict_only_hi_z:
    filter_high_z   = np.ones_like(full_catalog_df.loc[:, 'pred_Z_rAGN']).astype(bool)

In [41]:
full_catalog_df.loc[:, 'TYPE'] = full_catalog_df.loc[:, 'TYPE'].mask(full_catalog_df.loc[:, 'TYPE'] == '    ', 'CCCC')

In [42]:
display(full_catalog_df.loc[filter_high_z, cols_4_export].sort_values(by=['pred_Z_rAGN'], ascending=False).head(15))

Unnamed: 0,Name,RA_ICRS,DE_ICRS,TYPE,band_num,class,pred_class,pred_class_cal,Score_AGN,Prob_AGN,LOFAR_detect,pred_radio_AGN,pred_radio_cal_AGN,Score_radio_AGN,Prob_radio_AGN,radio_AGN,pred_radio_AGN.1,pred_prob_rAGN,Score_rAGN,Prob_rAGN,Z,pred_Z_rAGN,Fint_VLAS82,W1mproPM,W2mproPM,gmag,rmag,imag,zmag,ymag,Jmag,Hmag,Kmag,W3mag,W4mag
281082,J005421.41-010921.3,13.589242,-1.15594,Q,6,1.0,0,0,0.49995,0.148285,0,0,0,0.056025,0.07064,0,0,0,0.02801,0.010475,5.022,4.7394,0.260615,19.458,19.662001,23.299999,21.498501,20.056499,19.7404,19.5984,17.450001,17.24,16.59,16.67,14.62
1497075,J222509.18-001406.7,336.2883,-0.235215,Q,9,1.0,1,1,0.500049,0.861232,0,0,0,0.081257,0.099143,0,0,0,0.040632,0.085385,4.85,4.6378,0.260615,18.577,18.726999,21.8883,20.4571,19.103001,18.820101,18.7813,17.450001,17.24,16.59,16.67,14.62
2876093,J231615.98+004914.4,349.06662,0.820679,,7,,0,0,0.499947,0.134204,0,0,1,0.207238,0.224558,0,0,0,0.103608,0.030137,,4.5946,0.260615,19.477999,19.809999,23.299999,20.851801,20.6376,20.3792,21.4,17.450001,17.24,16.59,16.67,14.62
1418308,J014331.64-001745.6,25.881863,-0.296041,,5,,0,0,0.499939,0.102828,0,0,1,0.258198,0.269367,0,0,0,0.129083,0.027698,,4.5774,0.260615,19.563999,19.809999,23.299999,21.165701,20.8876,20.606001,21.4,17.450001,17.24,16.59,16.67,14.62
2667473,J013556.69+003942.6,23.986246,0.661852,,7,,0,0,0.499973,0.282964,0,0,0,0.163937,0.184103,0,0,0,0.081964,0.052095,,4.5773,0.260615,19.505999,19.809999,23.299999,21.2749,20.9897,20.736099,21.4,17.450001,17.24,16.59,16.67,14.62
3311969,J013502.21+010925.3,23.759235,1.157051,,7,,0,0,0.499896,0.023826,0,0,0,0.130683,0.151287,0,0,0,0.065328,0.003605,,4.5701,0.260615,19.417,19.809999,23.299999,21.094101,20.891399,20.6464,21.4,17.450001,17.24,16.59,16.67,14.62
73383,J014921.68-012024.6,27.340359,-1.340193,,7,,0,0,0.499979,0.330922,0,0,0,0.084418,0.102615,0,0,0,0.042207,0.033958,,4.5683,0.260615,19.549999,19.809999,23.299999,21.260401,20.9844,20.7451,21.4,17.450001,17.24,16.59,16.67,14.62
1870141,J015313.97+000308.5,28.30822,0.052361,,7,,0,0,0.499893,0.02093,0,0,1,0.188143,0.207012,0,0,0,0.094051,0.004333,,4.5681,0.260615,19.425999,19.809999,23.299999,21.152,20.950199,20.7162,21.4,17.450001,17.24,16.59,16.67,14.62
2023799,J220618.89+001010.8,331.578735,0.169662,,5,,0,0,0.499884,0.015045,0,0,0,0.162584,0.182799,0,0,0,0.081273,0.00275,,4.5602,0.260615,19.404999,19.780001,23.299999,20.8347,20.6057,20.379999,21.4,17.450001,17.24,16.59,16.67,14.62
248624,J014331.42-011052.3,25.880953,-1.181239,,5,,0,0,0.499938,0.100698,0,0,1,0.374364,0.362668,0,0,0,0.187159,0.03652,,4.5597,0.260615,19.557999,19.809999,23.299999,21.221399,20.933599,20.6341,21.4,17.450001,17.24,16.59,16.67,14.62


In [43]:
if save_preds_flag:
    full_catalog_df.loc[filter_high_z, cols_4_export].sort_values(by=['pred_Z_rAGN'],
     ascending=False).to_csv(gv.preds_path + f'predicted_rAGN_{used_area}.csv', index_label='ID')

---

In [44]:
temp_flag = False

In [45]:
if temp_flag:
    cols_AGN   = ['g_r', 'r_i', 'r_J', 'i_z', 'i_y', 
                  'z_y', 'z_W2', 'y_J', 'y_W1', 'y_W2', 'J_H', 'H_K', 
                  'H_W3', 'W1_W2', 'W1_W3', 'W3_W4']  # Only colours (no 'band_num', 'W4mag')
    cols_radio = ['g_r', 'g_i', 'r_i', 'r_z', 'i_z', 
                  'z_y', 'z_W1', 'y_J', 'y_W1', 'J_H', 'H_K', 'K_W3', 
                  'K_W4', 'W1_W2', 'W2_W3']  # Only colours (no 'band_num', 'W4mag')
    cols_z     = ['g_r', 'g_W3', 'r_i', 'r_z', 'i_z', 
                  'i_y', 'z_y', 'y_J', 'y_W1', 'J_H', 'H_K', 'K_W3', 
                  'K_W4', 'W1_W2', 'W2_W3']  # Only colours (no 'band_num', 'W4mag')
    
    cols_cols = list(np.unique(cols_AGN + cols_radio + cols_z))

In [46]:
if temp_flag:
    
    add_columns  = ['band_num', 'class', 'pred_class_cal', 'Score_AGN', 'Prob_AGN', 
                    'LOFAR_detect', 'pred_radio_cal_AGN', 'Score_radio_AGN', 'Prob_radio_AGN', 
                    'radio_AGN', 'pred_prob_rAGN', 'Score_rAGN', 'Prob_rAGN', 'Z', 'pred_Z_rAGN']
    used_colours = cols_cols

In [47]:
if temp_flag:
    saving_data = full_catalog_df.loc[:, add_columns + cols_photo + used_colours]
    saving_data['ID'] = saving_data.index
    file_name         = gv.preds_path + f'{used_area}_for_prediction.h5'
    file_name_pqt     = gv.preds_path + f'{used_area}_for_prediction.parquet'
    saving_data.to_hdf(file_name, key='df')
    saving_data.to_parquet(file_name_pqt, index=True, engine='fastparquet')
    print(f'Data from {used_area} saved in files {file_name} and {file_name_pqt}')

In [48]:
save_full_flag = True

In [49]:
np.array(full_catalog_df.columns)

array(['RA_ICRS', 'DE_ICRS', 'Name', 'TYPE', 'Z', 'zsp', 'spCl',
       'band_num', 'Fint_VLAS82_non_imp', 'Fint_VLAS82_AB_non_imp',
       'is_str', 'is_SDSS_QSO', 'is_AGN', 'is_SDSS_gal', 'is_gal',
       'W1mproPM', 'W2mproPM', 'gmag', 'rmag', 'imag', 'zmag', 'ymag',
       'W3mag', 'W4mag', 'Jmag', 'Hmag', 'Kmag', 'Fint_VLAS82_AB',
       'Fint_VLAS82', 'g_r', 'g_i', 'g_z', 'g_y', 'g_J', 'g_H', 'g_K',
       'g_W1', 'g_W2', 'g_W3', 'g_W4', 'r_i', 'r_z', 'r_y', 'r_J', 'r_H',
       'r_K', 'r_W1', 'r_W2', 'r_W3', 'r_W4', 'i_z', 'i_y', 'i_J', 'i_H',
       'i_K', 'i_W1', 'i_W2', 'i_W3', 'i_W4', 'z_y', 'z_J', 'z_H', 'z_K',
       'z_W1', 'z_W2', 'z_W3', 'z_W4', 'y_J', 'y_H', 'y_K', 'y_W1',
       'y_W2', 'y_W3', 'y_W4', 'J_H', 'J_K', 'J_W1', 'J_W2', 'J_W3',
       'J_W4', 'H_K', 'H_W1', 'H_W2', 'H_W3', 'H_W4', 'K_W1', 'K_W2',
       'K_W3', 'K_W4', 'W1_W2', 'W1_W3', 'W1_W4', 'W2_W3', 'W2_W4',
       'W3_W4', 'LOFAR_detect', 'class', 'radio_AGN', 'pred_class',
       'Score_AGN', 'Prob_

In [52]:
if save_full_flag:
    if used_area == 'S82':
        cols_2_save = ['RA_ICRS', 'DE_ICRS', 'Name', 'TYPE', 'Z', 'band_num', 'Fint_VLAS82', 
                       'Fint_VLAS82_AB', 'Fint_VLAS82_non_imp', 'is_str', 'W1mproPM', 
                       'W2mproPM', 'gmag', 'rmag', 'imag', 'zmag', 'ymag', 'W3mag', 'W4mag', 'Jmag', 
                       'Hmag', 'Kmag', 'g_r', 'g_i', 'g_z', 'g_y', 'g_J', 'g_H', 'g_K', 'g_W1', 
                       'g_W2', 'g_W3', 'g_W4', 'r_i', 'r_z', 'r_y', 'r_J', 'r_H', 'r_K', 'r_W1', 
                       'r_W2', 'r_W3', 'r_W4', 'i_z', 'i_y', 'i_J', 'i_H', 'i_K', 'i_W1', 'i_W2', 
                       'i_W3', 'i_W4', 'z_y', 'z_J', 'z_H', 'z_K', 'z_W1', 'z_W2', 'z_W3', 'z_W4', 
                       'y_J', 'y_H', 'y_K', 'y_W1', 'y_W2', 'y_W3', 'y_W4', 'J_H', 'J_K', 'J_W1', 
                       'J_W2', 'J_W3', 'J_W4', 'H_K', 'H_W1', 'H_W2', 'H_W3', 'H_W4', 'K_W1', 
                       'K_W2', 'K_W3', 'K_W4', 'W1_W2', 'W1_W3', 'W1_W4', 'W2_W3', 'W2_W4', 'W3_W4', 
                       'LOFAR_detect', 'class', 'radio_AGN', 'Prob_AGN', 'Prob_radio_AGN', 
                       'pred_Z_rAGN', 'pred_class_cal', 'pred_radio_cal_AGN', 'pred_prob_rAGN', 'Prob_rAGN']  # 'rms_VLAS82'
    if used_area == 'HETDEX':
        cols_2_save = ['RA_ICRS', 'DE_ICRS', 'Name', 'TYPE', 'Z', 'band_num', 'Sint_LOFAR', 
                       'Sint_LOFAR_AB', 'Sint_LOFAR_non_imp', 'rms_LOFAR', 'Speak_LOFAR', 
                       'Speak_LOFAR_non_imp', 'is_str', 'W1mproPM', 'W2mproPM', 'gmag', 'rmag', 
                       'imag', 'zmag', 'ymag', 'W3mag', 'W4mag', 'Jmag', 'Hmag', 'Kmag', 'g_r', 
                       'g_i', 'g_z', 'g_y', 'g_J', 'g_H', 'g_K', 'g_W1', 'g_W2', 'g_W3', 'g_W4', 
                       'r_i', 'r_z', 'r_y', 'r_J', 'r_H', 'r_K', 'r_W1', 'r_W2', 'r_W3', 'r_W4', 
                       'i_z','i_y', 'i_J', 'i_H', 'i_K', 'i_W1', 'i_W2', 'i_W3', 'i_W4', 'z_y', 
                       'z_J', 'z_H', 'z_K', 'z_W1', 'z_W2', 'z_W3', 'z_W4', 'y_J', 'y_H', 'y_K', 
                       'y_W1', 'y_W2', 'y_W3', 'y_W4', 'J_H', 'J_K', 'J_W1', 'J_W2', 'J_W3', 'J_W4', 
                       'H_K', 'H_W1', 'H_W2', 'H_W3', 'H_W4', 'K_W1', 'K_W2', 'K_W3', 'K_W4', 
                       'W1_W2', 'W1_W3', 'W1_W4', 'W2_W3', 'W2_W4', 'W3_W4', 'LOFAR_detect', 
                       'class', 'radio_AGN', 'Prob_AGN', 'Prob_radio_AGN', 'pred_Z_rAGN', 
                       'pred_class_cal', 'pred_radio_cal_AGN', 'pred_prob_rAGN', 'Prob_rAGN']

In [53]:
if save_full_flag:
    saving_data_full       = full_catalog_df.loc[:, cols_2_save]
    saving_data_full['ID'] = saving_data_full.index
    try:
        saving_data_full.to_hdf(gv.preds_path + f'{used_area}_full_prediction.h5', key='df')
        print(f'File {gv.preds_path}{used_area}_full_prediction.h5 saved')
    except:
        print(f'File {gv.preds_path}{used_area}_full_prediction.h5 was not saved')
    try:
        saving_data_full.to_parquet(gv.preds_path + f'{used_area}_full_prediction.parquet', key='df')
        print(f'File {gv.preds_path}{used_area}_full_prediction.parquet saved')
    except:
        print(f'File {gv.preds_path}{used_area}_full_prediction.parquet was not saved')

File pred_rAGN/S82_full_prediction.h5 saved
File pred_rAGN/S82_full_prediction.parquet was not saved
