# Filtering and repackaging Paula's Data

In [1]:
from dask.dataframe import read_parquet as dd_read_parquet
from pandas import read_csv, concat
from numpy import unique

In [2]:
# Root directory of the challenge data. Every path below is derived from it,
# so relocating the data only requires editing this one line.
data_dir = '/home/shoaib/PSChallenge/'
original_data_dir = data_dir + 'Original_Data/'

# Input files (must already exist).
paula_original_features_gband_csv = original_data_dir + "features_QSO_AGN_Blazar_ZTF_DR6_lcs_gband_all.csv"
paula_original_features_rband_csv = original_data_dir + "features_QSO_AGN_Blazar_ZTF_DR6_lcs_rband_all.csv"

paula_original_lightcurves_gband = original_data_dir + "QSO_AGN_Blazar_ZTF_DR6_lcs_gband.parquet"
paula_original_lightcurves_rband = original_data_dir + "QSO_AGN_Blazar_ZTF_DR6_lcs_rband.parquet"

panstarrs_pointsources = original_data_dir + "QSO_AGN_Blazar_sample_milliquas_roma_ps_score_pointsources.csv"

# These files do not exist yet; they are the save paths used later in the notebook.
original_features_by_oid = data_dir + "original_features_by_oid.csv"
filtered_lightcurves_gband = data_dir + "filtered_lightcurves_gband.parquet"
filtered_lightcurves_rband = data_dir + "filtered_lightcurves_rband.parquet"
filtered_lightcurves = data_dir + "filtered_lightcurves.parquet"

In [3]:
# Load the per-band feature tables, using the first CSV column as the index.
features_g, features_r = (
    read_csv(features_csv, index_col=0)
    for features_csv in (paula_original_features_gband_csv, paula_original_features_rband_csv)
)

In [4]:
# Columns that are not carried forward into the repackaged feature table.
features_junk_columns = ['meanra', 'meandec', 'timespan_good', 'T2020_sigma2']

features_g = features_g.drop(columns=features_junk_columns)
features_r = features_r.drop(columns=features_junk_columns)

In [5]:
# Row counts before any filtering; later cells report how many rows each filter removes.
num_unfiltered_features_g, num_unfiltered_features_r = len(features_g), len(features_r)

print(f'There are {num_unfiltered_features_g:,} OIDs in g and {num_unfiltered_features_r:,} OIDs in r')

There are 53,300 OIDs in g and 60,596 OIDs in r


In [6]:
# Open the per-band lightcurve parquet files as dask DataFrames indexed by 'ID'.
ztf_g, ztf_r = (
    dd_read_parquet(lightcurve_path).set_index('ID')
    for lightcurve_path in (paula_original_lightcurves_gband, paula_original_lightcurves_rband)
)

In [7]:
# Keep only the lightcurve columns used downstream, and expose 'filtercode' as 'band'.
lc_relevant_columns = ['name', 'oid_alerce', 'mjd', 'mag', 'magerr', 'catflags', 'filtercode']

ztf_g = ztf_g[lc_relevant_columns].rename(columns={'filtercode': 'band'})
ztf_r = ztf_r[lc_relevant_columns].rename(columns={'filtercode': 'band'})

In [8]:
# Observation counts before the quality cut.
len_ztf_g_prefilter, len_ztf_r_prefilter = len(ztf_g), len(ztf_r)

# Keep only observations with catflags equal to 0 and magerr below 1.
ztf_g = ztf_g[ztf_g['catflags'].eq(0) & ztf_g['magerr'].lt(1)]
ztf_r = ztf_r[ztf_r['catflags'].eq(0) & ztf_r['magerr'].lt(1)]

# Observation counts after the quality cut.
len_ztf_g_postfilter, len_ztf_r_postfilter = len(ztf_g), len(ztf_r)

print(f'Number of individual observations filtered by catflags, magerr in g:\t\t{len_ztf_g_prefilter - len_ztf_g_postfilter:,}')
print(f'Number of individual observations filtered by catflags, magerr in r:\t\t{len_ztf_r_prefilter - len_ztf_r_postfilter:,}')

Number of individual observations filtered by catflags, magerr in g:		26,531
Number of individual observations filtered by catflags, magerr in r:		34,207


In [9]:
# 'catflags' was only needed for the quality cut above; drop it from both bands.
ztf_g, ztf_r = (lightcurves.drop(columns='catflags') for lightcurves in (ztf_g, ztf_r))

In [10]:
# Overwrite 'band' with a single-letter label, then materialise the dask
# DataFrames as pandas DataFrames.
ztf_g = ztf_g.assign(band='g').compute()
ztf_r = ztf_r.assign(band='r').compute()

In [11]:
# OIDs that still have at least one observation after the lightcurve quality cut.
ztf_g_oid_alerce = unique(ztf_g['oid_alerce'].to_numpy())
ztf_r_oid_alerce = unique(ztf_r['oid_alerce'].to_numpy())

# Keep only the feature rows whose OID survived that cut.
catflags_magerr_mask_g = features_g['oid_alerce'].isin(ztf_g_oid_alerce)
features_g = features_g.loc[catflags_magerr_mask_g]

catflags_magerr_mask_r = features_r['oid_alerce'].isin(ztf_r_oid_alerce)
features_r = features_r.loc[catflags_magerr_mask_r]

num_catflags_magerr_g, num_catflags_magerr_r = len(features_g), len(features_r)

print(f'Number of OIDs filtered by catflags, magerr in g:\t\t{num_unfiltered_features_g - num_catflags_magerr_g:,}')
print(f'Number of OIDs filtered by catflags, magerr in r:\t\t{num_unfiltered_features_r - num_catflags_magerr_r:,}')

Number of OIDs filtered by catflags, magerr in g:		49
Number of OIDs filtered by catflags, magerr in r:		55


In [12]:
# Load the PanSTARRS table, keep rows passing the point-source cut
# (ps_score > 0.5), and retain only the 'name' and 'type' columns.
panstarrs = (
    read_csv(panstarrs_pointsources)
    .loc[:, ['name', 'ps_score', 'type']]
    .loc[lambda table: table['ps_score'] > 0.5]
    .drop(columns=['ps_score'])
)

In [13]:
# Sanity check before de-duplicating by name: a name that appears more than
# once must carry the same 'type' on every row.
panstarrs_duplicates = panstarrs.groupby(['name'])

# Count the distinct 'type' values per name. (Counting distinct 'name' values
# inside a group that is already keyed by name is always 1, so that check
# could never report a conflict.)
types_per_name = panstarrs_duplicates['type'].nunique()
conflicting_names = types_per_name[types_per_name > 1].index

different_types_found = len(conflicting_names) > 0

for name in conflicting_names:
    print('Caution!', name, 'has different reported types in the PanSTARRS DF!')

if not different_types_found:
    print('Duplicates in the PanSTARRS DF have the same types. All good!')

Duplicates in the PanSTARRS DF have the same types. All good!


In [14]:
# One row per name (the first occurrence wins), then collect the accepted names.
panstarrs = panstarrs.drop_duplicates('name')
panstarrs_names = panstarrs['name'].to_numpy()

# Keep only the feature rows whose name passed the point-source cut.
ps_mask_g = features_g['name'].isin(panstarrs_names)
features_g = features_g.loc[ps_mask_g]

ps_mask_r = features_r['name'].isin(panstarrs_names)
features_r = features_r.loc[ps_mask_r]

num_ps_g, num_ps_r = len(features_g), len(features_r)

print(f'Number of OIDs filtered by ps_score in g:\t{num_catflags_magerr_g - num_ps_g:,}')
print(f'Number of OIDs filtered by ps_score in r:\t{num_catflags_magerr_r - num_ps_r:,}')

Number of OIDs filtered by ps_score in g:	0
Number of OIDs filtered by ps_score in r:	0


In [15]:
# Attach the PanSTARRS 'type' to every remaining feature row. The features
# were already restricted to names present in `panstarrs`, so a left merge
# leaves no row unmatched here.
features_g = features_g.merge(panstarrs, on='name', how='left')
features_r = features_r.merge(panstarrs, on='name', how='left')

# The lightcurves have not been restricted by name yet. An inner merge both
# attaches 'type' and drops observations of objects that failed the
# point-source cut; a left merge would keep them with a missing 'type'.
ztf_g = ztf_g.merge(panstarrs, on='name', how='inner')
ztf_r = ztf_r.merge(panstarrs, on='name', how='inner')

# We can now delete these
del panstarrs
del panstarrs_duplicates

In [16]:
# Label each band, stack the two feature tables, and order the rows by
# type, then name, then band.
features_combined = concat(
    [features_g.assign(band='g'), features_r.assign(band='r')],
    ignore_index=True,
).sort_values(by=['type', 'name', 'band'])

# The per-band feature tables are no longer needed.
del features_g, features_r

features_combined.to_csv(original_features_by_oid, index=False)
print(f'Features in g and r combined, filtered, and saved to {original_features_by_oid}')
del features_combined

Features in g and r combined, filtered, and saved to /home/shoaib/PSChallenge/original_features_by_oid.csv


In [17]:
# Order each band's observations by type, then name, then time, before saving.
ztf_g, ztf_r = (
    lightcurves.sort_values(by=['type', 'name', 'mjd'])
    for lightcurves in (ztf_g, ztf_r)
)

# Write one parquet file per band, without the DataFrame index.
ztf_g.to_parquet(filtered_lightcurves_gband, index=False)
print(f'Filtered gband lightcurves saved to {filtered_lightcurves_gband}')

ztf_r.to_parquet(filtered_lightcurves_rband, index=False)
print(f'Filtered rband lightcurves saved to {filtered_lightcurves_rband}')

Filtered gband lightcurves saved to /home/shoaib/PSChallenge/filtered_lightcurves_gband.parquet
Filtered rband lightcurves saved to /home/shoaib/PSChallenge/filtered_lightcurves_rband.parquet


In [18]:
# Stack both bands into a single lightcurve table.
all_lcs = concat([ztf_g, ztf_r], ignore_index=True)

# The per-band frames are no longer needed once combined.
del ztf_g, ztf_r

# Order by type, name, band and time, then write the combined file without the index.
all_lcs = all_lcs.sort_values(by=['type', 'name', 'band', 'mjd'])
all_lcs.to_parquet(filtered_lightcurves, index=False)

print(f'Combined, filtered lightcurves saved to {filtered_lightcurves}')

Combined, filtered lightcurves saved to /home/shoaib/PSChallenge/filtered_lightcurves.parquet
