# Pipeline for High-z Radio Galaxies 12: Create lists (files) with radio AGN candidates

## Introduction

In this file, three models will be applied consecutively in order to predict  
the detection of Radio Galaxies (radio AGN) and their redshift.  

In principle, this pipeline should be applied to data in Stripe 82. But  
it can be used with any other suitable dataset.

In [1]:
%matplotlib inline
# Static plots
#%matplotlib ipympl
# Interactive plots
import numpy as np
import matplotlib as mpl
import matplotlib.cm as cm
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import ConfusionMatrixDisplay
import sklearn.pipeline
from pycaret import classification as pyc
from pycaret import regression as pyr
from joblib import dump, load
import pandas as pd
import global_variables as gv
import global_functions as gf

---

## Reading data

Flags.

In [2]:
save_plot_flag      = False
save_preds_flag     = False
load_models_flag    = True
predict_only_hi_z   = False
use_zeroth_model    = False
use_second_z_model  = False  # z >= 3.6 (with SMOGN), or, if needed, z >= 4.0

In [3]:
used_area           = 'HETDEX'  # can be 'S82', 'HETDEX', 'COSMOS'

In [4]:
file_name_dict      = {'S82': gv.file_S82, 'HETDEX': gv.file_HETDEX, 'COSMOS': gv.file_COSMOS}
file_name           = file_name_dict[used_area]

In [5]:
feats_2_disc_S82    = ['objID', 'RA_MILLI', 'DEC_MILLI', 'W1mag', 'W2mag', 'num_imputed']
feats_2_disc_HETDEX = ['objID', 'RA_MILLI', 'DEC_MILLI', 'W1mag', 'W2mag', 'num_imputed']
feats_2_disc_COSMOS = ['objID', 'RA_MILLI', 'DEC_MILLI', 'W1mag', 'W2mag', 'num_imputed']

feats_2_disc        = {'S82': feats_2_disc_S82, 'HETDEX': feats_2_disc_HETDEX, 'COSMOS': feats_2_disc_COSMOS}
features_2_discard  = feats_2_disc[used_area]

In [6]:
full_catalog_df     = pd.read_hdf(gv.cat_path + file_name, key='df').drop(columns=features_2_discard)

In [7]:
if used_area == 'S82':
    full_catalog_df.loc[:, 'LOFAR_detect'] = full_catalog_df.loc[:, 'VLAS82_detect'].copy()
    full_catalog_df = full_catalog_df.drop(columns=['VLAS82_detect'])
if used_area == 'COSMOS':
    full_catalog_df.loc[:, 'LOFAR_detect'] = full_catalog_df.loc[:, 'COSMOSVLA3_detect'].copy()
    full_catalog_df = full_catalog_df.drop(columns=['COSMOSVLA3_detect'])

Create features with class and combined redshift.

In [8]:
full_catalog_df['class']            = full_catalog_df.loc[:, 'is_AGN'].copy()
filter_non_confirmed                = np.array(full_catalog_df.loc[:, 'is_AGN'] == 1) | np.array(full_catalog_df.loc[:, 'is_gal'] == 1)
full_catalog_df.loc[~filter_non_confirmed, 'class'] = np.nan
idx_non_Z                           = full_catalog_df.loc[:, 'Z'].where(full_catalog_df.loc[:, 'Z'] > 0).isna()
full_catalog_df.loc[idx_non_Z, 'Z'] = full_catalog_df.loc[:, 'Z'].mask(idx_non_Z, full_catalog_df.loc[idx_non_Z, 'zsp'])

Create column for detection as Radio AGN

In [9]:
full_catalog_df['radio_AGN']      = (np.array(full_catalog_df.loc[:, 'is_AGN'] == 1) &\
                                     np.array(full_catalog_df.loc[:, 'LOFAR_detect'] == 1)).astype(int)

Discard minor features.

In [10]:
# full_catalog_df                     = full_catalog_df.drop(columns=['is_AGN', 'is_SDSS_QSO', 'is_SDSS_gal', 'is_gal', 'zsp'])

#### As we want to predict, only use sources that have not previous spectroscopic classification

In [11]:
print(f'The used data set (in {used_area}) has {len(full_catalog_df):,} sources.')
print(f'And {np.sum(filter_non_confirmed):,} have previous spectroscopic classification.')

The used data set (in HETDEX) has 15,136,878 sources.
And 118,734 have previous spectroscopic classification.


In [12]:
# full_catalog_df                     = full_catalog_df.loc[~filter_non_confirmed]

In [13]:
print(f'This pipeline will predict properties for {len(full_catalog_df):,} sources in {used_area}.')

This pipeline will predict properties for 15,136,878 sources in HETDEX.


---

### Load models

In [14]:
if load_models_flag:
    if use_zeroth_model:
        star_clf          = pyc.load_model(gv.models_path + gv.star_model)  # star/no-star model
        cal_star_clf      = load(gv.models_path + gv.cal_str_model)  # calibrated model
    AGN_SFG_clf           = pyc.load_model(gv.models_path + gv.AGN_gal_model)  #
    cal_AGN_SFG_clf       = load(gv.models_path + gv.cal_AGN_gal_model)  # calibrated model
    
    radio_det_AGN_clf     = pyc.load_model(gv.models_path + gv.radio_model)  # without predicted AGN
    radio_det_SFG_clf     = pyc.load_model(gv.models_path + gv.radio_galaxies_model)  # Radio detection for galaxies
    cal_radio_det_AGN_clf = load(gv.models_path + gv.cal_radio_model)  # calibrated model
    cal_radio_det_SFG_clf = load(gv.models_path + gv.cal_radio_gals_model)  # calibrated model radio detection for galaxies
    
    redshift_reg_rAGN     = pyr.load_model(gv.models_path + gv.full_z_model)  # to use on full sample
    redshift_reg_rSFG     = pyr.load_model(gv.models_path + gv.z_radio_galaxies_model)  # Redshift prediction for radio-galaxies

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


Run predictions

In [15]:
full_cols = list(full_catalog_df.columns.values)

In [16]:
model_cols = ['band_num', 'W4mag', 'g_r', 'r_i', 'r_J', 'i_z', 'i_y', 'z_y', 'z_W2', 'y_J',
              'y_W1', 'y_W2', 'J_H', 'H_K', 'H_W3', 'W1_W2', 'W1_W3', 'W3_W4', 'g_i', 'r_z',
              'z_W1', 'K_W3', 'K_W4', 'W2_W3', 'Kmag', 'g_W2', 'r_y', 'g_W1']

In [17]:
non_model_cols = [col_name for col_name in full_cols if col_name not in model_cols]

In [18]:
preds_cat_df = full_catalog_df.loc[:, model_cols]

In [19]:
preds_cat_df = gf.predict_AGN_gal(preds_cat_df, AGN_SFG_clf,  cal_AGN_SFG_clf,
                                      gv.AGN_thresh, gv.cal_AGN_thresh,
                                     raw_score=True, cols_out=['pred_class', 'Score_AGN', 'Prob_AGN', 'pred_class_cal'])

In [20]:
preds_cat_df = gf.predict_radio_det(preds_cat_df, radio_det_AGN_clf, cal_radio_det_AGN_clf,
                                       gv.radio_thresh, gv.cal_radio_thresh, raw_score=True,
                                       cols_out=['pred_radio_AGN', 'Score_radio_AGN', 'Prob_radio_AGN', 'pred_radio_AGN_cal'])

In [21]:
preds_cat_df = gf.predict_radio_det(preds_cat_df, radio_det_SFG_clf, cal_radio_det_SFG_clf,
                                        gv.radio_gals_thresh, gv.cal_radio_gals_thresh, raw_score=True,
                                       cols_out=['pred_radio_SFG', 'Score_radio_SFG', 'Prob_radio_SFG', 'pred_radio_SFG_cal'])

In [22]:
preds_cat_df = gf.predict_z(preds_cat_df, redshift_reg_rAGN, cols_out=['pred_Z_rAGN'])

In [23]:
preds_cat_df = gf.predict_z(preds_cat_df, redshift_reg_rSFG, cols_out=['pred_Z_rSFG'])

In [24]:
pred_cols = ['Prob_AGN', 'pred_class_cal', 'Prob_radio_AGN', 'pred_radio_AGN_cal',
             'Prob_radio_SFG', 'pred_radio_SFG_cal', 'pred_Z_rAGN', 'pred_Z_rSFG']

In [25]:
full_catalog_df[pred_cols] = preds_cat_df.loc[:, pred_cols]

In [27]:
np.array(full_catalog_df.columns)

array(['RA_ICRS', 'DE_ICRS', 'Name', 'TYPE', 'Z', 'zsp', 'spCl',
       'band_num', 'radio_detect', 'LOFAR_detect', 'Sint_LOFAR',
       'Sint_LOFAR_AB', 'Speak_LOFAR', 'rms_LOFAR', 'Sint_LOFAR_non_imp',
       'Sint_LOFAR_AB_non_imp', 'Speak_LOFAR_non_imp', 'is_str',
       'is_SDSS_QSO', 'is_AGN', 'is_SDSS_gal', 'is_gal', 'W1mproPM',
       'W2mproPM', 'gmag', 'rmag', 'imag', 'zmag', 'ymag', 'W3mag',
       'W4mag', 'Jmag', 'Hmag', 'Kmag', 'g_r', 'g_i', 'g_z', 'g_y', 'g_J',
       'g_H', 'g_K', 'g_W1', 'g_W2', 'g_W3', 'g_W4', 'r_i', 'r_z', 'r_y',
       'r_J', 'r_H', 'r_K', 'r_W1', 'r_W2', 'r_W3', 'r_W4', 'i_z', 'i_y',
       'i_J', 'i_H', 'i_K', 'i_W1', 'i_W2', 'i_W3', 'i_W4', 'z_y', 'z_J',
       'z_H', 'z_K', 'z_W1', 'z_W2', 'z_W3', 'z_W4', 'y_J', 'y_H', 'y_K',
       'y_W1', 'y_W2', 'y_W3', 'y_W4', 'J_H', 'J_K', 'J_W1', 'J_W2',
       'J_W3', 'J_W4', 'H_K', 'H_W1', 'H_W2', 'H_W3', 'H_W4', 'K_W1',
       'K_W2', 'K_W3', 'K_W4', 'W1_W2', 'W1_W3', 'W1_W4', 'W2_W3',
       'W2_W4', '

In [29]:
full_catalog_df.loc[:, ['Prob_AGN', 'pred_class_cal']].describe()

Unnamed: 0,Prob_AGN,pred_class_cal
count,15136880.0,15136880.0
mean,0.4486523,0.6624084
std,0.2746302,0.4728885
min,0.01133026,0.0
25%,0.1181614,0.0
50%,0.6181595,1.0
75%,0.6181595,1.0
max,0.989372,1.0


In [None]:
full_catalog_df['pred_radio_AGN']     = (np.array(full_catalog_df.loc[:, 'pred_class'] == 1) & np.array(full_catalog_df.loc[:, 'pred_radio_AGN'] == 1)).astype(int)
full_catalog_df['Score_rAGN']         = full_catalog_df.loc[:, 'Score_AGN'] * full_catalog_df.loc[:, 'Score_radio_AGN']
full_catalog_df['pred_prob_rAGN']     = (np.array(full_catalog_df.loc[:, 'pred_class_cal'] == 1) & np.array(full_catalog_df.loc[:, 'pred_radio_cal_AGN'] == 1)).astype(int)
full_catalog_df['Prob_rAGN']          = full_catalog_df.loc[:, 'Prob_AGN'] * full_catalog_df.loc[:, 'Prob_radio_AGN']

full_catalog_df['pred_radio_gal']     = (np.array(full_catalog_df.loc[:, 'pred_class'] == 0) & np.array(full_catalog_df.loc[:, 'pred_radio_gal'] == 1)).astype(int)
full_catalog_df['Score_rGal']         = (1 - full_catalog_df.loc[:, 'Score_AGN']) * full_catalog_df.loc[:, 'Score_radio_gal']
full_catalog_df['pred_prob_rGal']     = (np.array(full_catalog_df.loc[:, 'pred_class_cal'] == 0) & np.array(full_catalog_df.loc[:, 'pred_radio_cal_gal'] == 1)).astype(int)
full_catalog_df['Prob_rGal']          = (1 - full_catalog_df.loc[:, 'Prob_AGN']) * full_catalog_df.loc[:, 'Prob_radio_gal']

In [None]:
# rad_score_scaler                      = MinMaxScaler()
# full_catalog_df['scaled_score_radio'] = rad_score_scaler.fit_transform(full_catalog_df.loc[:, 'Score_radio'].values.reshape(-1, 1))
# full_catalog_df['scaled_score_rAGN']  = full_catalog_df.loc[:, 'Score_AGN'] * full_catalog_df.loc[:, 'scaled_score_radio']

In [31]:
full_catalog_df.loc[:, ['class', 'pred_class_cal', 'LOFAR_detect', 'pred_radio_AGN_cal', 'pred_radio_SFG_cal', 'Z', 'pred_Z_rAGN', 'pred_Z_rSFG']].describe()

Unnamed: 0,class,pred_class_cal,LOFAR_detect,pred_radio_AGN_cal,pred_radio_SFG_cal,Z,pred_Z_rAGN,pred_Z_rSFG
count,118734.0,15136880.0,15136880.0,15136880.0,15136880.0,134234.0,15136880.0,15136880.0
mean,0.425641,0.6624084,0.02526479,0.1046465,0.01816009,0.876576,2.077934,0.3836021
std,0.494442,0.4728885,0.1569283,0.3060974,0.1335301,0.77938,0.6822612,0.268731
min,0.0,0.0,0.0,0.0,0.0,-0.00449,0.0174,0.0079
25%,0.0,0.0,0.0,0.0,0.0,0.381018,1.922,0.1916
50%,0.0,1.0,0.0,0.0,0.0,0.6235,2.1864,0.1916
75%,1.0,1.0,0.0,0.0,0.0,1.282,2.1864,0.6263
max,1.0,1.0,1.0,1.0,1.0,7.02833,4.6988,1.5022


---

In [None]:
temp_flag = False

In [None]:
if temp_flag:
    cols_AGN   = ['g_r', 'r_i', 'r_J', 'i_z', 'i_y', 
                  'z_y', 'z_W2', 'y_J', 'y_W1', 'y_W2', 'J_H', 'H_K', 
                  'H_W3', 'W1_W2', 'W1_W3', 'W3_W4']  # Only colours (no 'band_num', 'W4mag')
    cols_radio = ['g_r', 'g_i', 'r_i', 'r_z', 'i_z', 
                  'z_y', 'z_W1', 'y_J', 'y_W1', 'J_H', 'H_K', 'K_W3', 
                  'K_W4', 'W1_W2', 'W2_W3']  # Only colours (no 'band_num', 'W4mag')
    cols_z     = ['g_r', 'g_W3', 'r_i', 'r_z', 'i_z', 
                  'i_y', 'z_y', 'y_J', 'y_W1', 'J_H', 'H_K', 'K_W3', 
                  'K_W4', 'W1_W2', 'W2_W3']  # Only colours (no 'band_num', 'W4mag')
    
    cols_cols = list(np.unique(cols_AGN + cols_radio + cols_z))

In [32]:
save_full_flag = True

In [33]:
np.array(full_catalog_df.columns)

array(['RA_ICRS', 'DE_ICRS', 'Name', 'TYPE', 'Z', 'zsp', 'spCl',
       'band_num', 'radio_detect', 'LOFAR_detect', 'Sint_LOFAR',
       'Sint_LOFAR_AB', 'Speak_LOFAR', 'rms_LOFAR', 'Sint_LOFAR_non_imp',
       'Sint_LOFAR_AB_non_imp', 'Speak_LOFAR_non_imp', 'is_str',
       'is_SDSS_QSO', 'is_AGN', 'is_SDSS_gal', 'is_gal', 'W1mproPM',
       'W2mproPM', 'gmag', 'rmag', 'imag', 'zmag', 'ymag', 'W3mag',
       'W4mag', 'Jmag', 'Hmag', 'Kmag', 'g_r', 'g_i', 'g_z', 'g_y', 'g_J',
       'g_H', 'g_K', 'g_W1', 'g_W2', 'g_W3', 'g_W4', 'r_i', 'r_z', 'r_y',
       'r_J', 'r_H', 'r_K', 'r_W1', 'r_W2', 'r_W3', 'r_W4', 'i_z', 'i_y',
       'i_J', 'i_H', 'i_K', 'i_W1', 'i_W2', 'i_W3', 'i_W4', 'z_y', 'z_J',
       'z_H', 'z_K', 'z_W1', 'z_W2', 'z_W3', 'z_W4', 'y_J', 'y_H', 'y_K',
       'y_W1', 'y_W2', 'y_W3', 'y_W4', 'J_H', 'J_K', 'J_W1', 'J_W2',
       'J_W3', 'J_W4', 'H_K', 'H_W1', 'H_W2', 'H_W3', 'H_W4', 'K_W1',
       'K_W2', 'K_W3', 'K_W4', 'W1_W2', 'W1_W3', 'W1_W4', 'W2_W3',
       'W2_W4', '

In [34]:
if save_full_flag:
    if used_area == 'S82':
        cols_2_save = ['RA_ICRS', 'DE_ICRS', 'Name', 'TYPE', 'Z', 'band_num', 'Fint_VLAS82', 
                       'Fint_VLAS82_AB', 'Fint_VLAS82_non_imp', 'rms_VLAS82', 'is_str', 'W1mproPM', 
                       'W2mproPM', 'gmag', 'rmag', 'imag', 'zmag', 'ymag', 'W3mag', 'W4mag', 'Jmag', 
                       'Hmag', 'Kmag', 'g_r', 'g_i', 'g_z', 'g_y', 'g_J', 'g_H', 'g_K', 'g_W1', 
                       'g_W2', 'g_W3', 'g_W4', 'r_i', 'r_z', 'r_y', 'r_J', 'r_H', 'r_K', 'r_W1', 
                       'r_W2', 'r_W3', 'r_W4', 'i_z', 'i_y', 'i_J', 'i_H', 'i_K', 'i_W1', 'i_W2', 
                       'i_W3', 'i_W4', 'z_y', 'z_J', 'z_H', 'z_K', 'z_W1', 'z_W2', 'z_W3', 'z_W4', 
                       'y_J', 'y_H', 'y_K', 'y_W1', 'y_W2', 'y_W3', 'y_W4', 'J_H', 'J_K', 'J_W1', 
                       'J_W2', 'J_W3', 'J_W4', 'H_K', 'H_W1', 'H_W2', 'H_W3', 'H_W4', 'K_W1', 
                       'K_W2', 'K_W3', 'K_W4', 'W1_W2', 'W1_W3', 'W1_W4', 'W2_W3', 'W2_W4', 
                       'W3_W4', 'radio_detect', 'LOFAR_detect', 'class', 'radio_AGN', 'Prob_AGN', 
                       'Prob_radio_AGN', 'Prob_radio_gal', 'pred_Z_rAGN', 'pred_Z_rGal', 
                       'pred_class_cal', 'pred_radio_cal_AGN', 'pred_radio_cal_gal', 
                       'pred_prob_rAGN', 'Prob_rAGN', 'pred_prob_rGal', 'Prob_rGal']
    if used_area == 'HETDEX':
        cols_2_save = ['RA_ICRS', 'DE_ICRS', 'Name', 'TYPE', 'Z', 'band_num', 'Sint_LOFAR',
                       'Sint_LOFAR_non_imp', 'rms_LOFAR', 'Speak_LOFAR',
                       'Speak_LOFAR_non_imp', 'W1mproPM', 'W2mproPM', 'gmag', 'rmag', 'imag',
                       'zmag', 'ymag', 'W3mag', 'W4mag', 'Jmag', 'Hmag', 'Kmag', 'g_r', 'g_i',
                       'g_z', 'g_y', 'g_J', 'g_H', 'g_K', 'g_W1', 'g_W2', 'g_W3', 'g_W4',
                       'r_i', 'r_z', 'r_y', 'r_J', 'r_H', 'r_K', 'r_W1', 'r_W2', 'r_W3', 'r_W4',
                       'i_z','i_y', 'i_J', 'i_H', 'i_K', 'i_W1', 'i_W2', 'i_W3', 'i_W4', 'z_y',
                       'z_J', 'z_H', 'z_K', 'z_W1', 'z_W2', 'z_W3', 'z_W4', 'y_J', 'y_H', 'y_K',
                       'y_W1', 'y_W2', 'y_W3', 'y_W4', 'J_H', 'J_K', 'J_W1', 'J_W2', 'J_W3',
                       'J_W4', 'H_K', 'H_W1', 'H_W2', 'H_W3', 'H_W4', 'K_W1', 'K_W2', 'K_W3',
                       'K_W4', 'W1_W2', 'W1_W3', 'W1_W4', 'W2_W3', 'W2_W4', 'W3_W4',
                       'radio_detect', 'LOFAR_detect', 'class', 'Prob_AGN', 'pred_class_cal',
                       'Prob_radio_AGN', 'pred_radio_AGN_cal', 'Prob_radio_SFG',
                       'pred_radio_SFG_cal', 'pred_Z_rAGN', 'pred_Z_rSFG']

In [None]:
# if save_full_flag:
#     saving_data_full       = full_catalog_df.loc[:, cols_2_save]
#     saving_data_full['ID'] = saving_data_full.index
#     saving_data_full.to_hdf(gv.preds_path + f'{used_area}_full_prediction.h5', key='df')
#     print(f'File {gv.preds_path}{used_area}_full_prediction.h5 saved')

In [35]:
if save_full_flag:
    saving_data_full       = full_catalog_df.loc[:, cols_2_save]
    saving_data_full['ID'] = saving_data_full.index
    try:
        saving_data_full.to_hdf(gv.preds_path + f'{used_area}_full_prediction.h5', key='df')
        print(f'File {gv.preds_path}{used_area}_full_prediction.h5 saved')
    except:
        print(f'File {gv.preds_path}{used_area}_full_prediction.h5 was not saved')
    try:
        saving_data_full.to_parquet(gv.preds_path + f'{used_area}_full_prediction.parquet', index=True, engine='fastparquet')
        print(f'File {gv.preds_path}{used_area}_full_prediction.parquet saved')
    except:
        print(f'File {gv.preds_path}{used_area}_full_prediction.parquet was not saved')

File pred_rAGN/HETDEX_full_prediction.h5 saved
File pred_rAGN/HETDEX_full_prediction.parquet saved


In [42]:
if used_area == 'HETDEX':
    saving_data_full['is_AGN'] = full_catalog_df.loc[:, 'is_AGN']
    saving_data_full['is_gal'] = full_catalog_df.loc[:, 'is_gal']
    filter_known_spec = (saving_data_full.loc[:, 'is_AGN'] == 1) | (saving_data_full.loc[:, 'is_gal'] == 1)
    unknown_cat_df    = saving_data_full.loc[~filter_known_spec]
    known_catalog_df  = saving_data_full.loc[filter_known_spec]
    train_val_df, train_df, validation_df, calibration_df, test_df = gf.split_set(known_catalog_df,
                                                                                  [0.2, 0.2, 0.5],
                                                                                  'is_AGN', use_calibration=True)
    for sample in [saving_data_full, unknown_cat_df, known_catalog_df, train_val_df, train_df, validation_df, calibration_df, test_df]:
        sample = sample.drop(columns=['is_AGN', 'is_gal'])
    
    print('Shape of used data in HETDEX')
    print('-' * 65)
    print(f'Full confirmed dataset size                           : {saving_data_full.shape}')
    print(f'Data for Modeling (Train, Validation, and Calibration): {train_val_df.shape}')
    print(f'Training data                                         : {train_df.shape}')
    print(f'Validation data                                       : {validation_df.shape}')
    print(f'Calibration data                                      : {calibration_df.shape}')
    print(f'Testing data                                          : {test_df.shape}')
    print('-' * 65)
    print()
    print(f'Using data from HETDEX')
    selected_dataset = {'Training': train_df, 'Test': test_df, 'Test_Val': train_val_df,
                        'Validation': validation_df, 'Calibration': calibration_df,
                        'Known': known_catalog_df, 'Unknown': unknown_cat_df}  # fix naming
    print('Printing all subsets within HETDEX')
    for key in selected_dataset:
        print(f'Saving {key} subset...')
        try:
            selected_dataset[key].to_hdf(gv.preds_path + f'{used_area}_{key}_prediction.h5', key='df')
            print(f'File {gv.preds_path}{used_area}_{key}_prediction.h5 saved')
        except:
            print(f'File {gv.preds_path}{used_area}_{key}_prediction.h5 was not saved')
        try:
            selected_dataset[key].to_parquet(gv.preds_path + f'{used_area}_{key}_prediction.parquet', index=True, engine='fastparquet')
            print(f'File {gv.preds_path}{used_area}_{key}_prediction.parquet saved')
        except:
            print(f'File {gv.preds_path}{used_area}_{key}_prediction.parquet was not saved')

Shape of used data in HETDEX
-----------------------------------------------------------------
Full confirmed dataset size                           : (15136878, 103)
Data for Modeling (Train, Validation, and Calibration): (94987, 103)
Training data                                         : (75989, 103)
Validation data                                       : (9499, 103)
Calibration data                                      : (9499, 103)
Testing data                                          : (23747, 103)
-----------------------------------------------------------------

Using data from HETDEX
Printing all subsets within HETDEX
Saving Training subset...
File pred_rAGN/HETDEX_Training_prediction.h5 saved
File pred_rAGN/HETDEX_Training_prediction.parquet saved
Saving Test subset...
File pred_rAGN/HETDEX_Test_prediction.h5 saved
File pred_rAGN/HETDEX_Test_prediction.parquet saved
Saving Test_Val subset...
File pred_rAGN/HETDEX_Test_Val_prediction.h5 saved
File pred_rAGN/HETDEX_Test_Val_pre