In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import glob
from tqdm.auto import tqdm

In [2]:
from lc_classifier.features.composites.elasticc import ElasticcFeatureExtractor
from lc_classifier.features.core.base import AstroObject

In [3]:
def get_detections(chosen_df, snid):
    """Build a time-ordered photometry table for a single object.

    Parameters
    ----------
    chosen_df : mapping-like (e.g. one row as a ``pandas.Series``)
        Must provide the keys ``MJD``, ``BAND``, ``FLUXCAL``, ``FLUXCALERR``
        and ``PHOTFLAG``, each holding a sequence with one entry per
        observation.
    snid : object
        Identifier repeated in every row of the ``SNID`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``SNID``, ``MJD``, ``BAND``, ``FLUXCAL``, ``FLUXCALERR`` and
        ``PHOTFLAG``, sorted by ascending ``MJD`` with a fresh 0..n-1 index.
        No dtype coercion is applied; columns keep whatever dtype pandas
        infers from the inputs.
    """
    light_curve_cols = ("MJD", "BAND", "FLUXCAL", "FLUXCALERR", "PHOTFLAG")
    n_obs = len(chosen_df["MJD"])

    # SNID goes first so it is the leading column of the resulting frame.
    table = {"SNID": [snid] * n_obs}
    for col in light_curve_cols:
        table[col] = chosen_df[col]

    unsorted = pd.DataFrame(table)
    ordered = unsorted.sort_values(by="MJD")
    return ordered.reset_index(drop=True)


def get_meta(chosen_df, snid):
    """Return an object's metadata as a long-format ``name``/``value`` table.

    Parameters
    ----------
    chosen_df : pandas.Series
        One row whose index contains every metadata field listed below.
    snid : object
        Accepted for signature symmetry with ``get_detections``; not used.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``name`` and ``value``: one row per metadata field in
        the order listed below, followed by a final ``aid`` row whose value
        is NaN.
    """
    metacols = [
        "SNID",
        "RA",
        "DEC",
        "SNTYPE",
        "NOBS",
        "PTROBS_MIN",
        "PTROBS_MAX",
        "MWEBV",
        "MWEBV_ERR",
        "REDSHIFT_HELIO",
        "REDSHIFT_HELIO_ERR",
        "REDSHIFT_FINAL",
        "REDSHIFT_FINAL_ERR",
        "VPEC",
        "VPEC_ERR",
    ]

    # Select the metadata entries once and reuse them for labels and values.
    selected = chosen_df.loc[metacols]
    field_names = np.append(selected.index.to_numpy(), "aid")
    field_values = np.append(selected.to_numpy(), np.nan)

    # Pair each name with its value: one (name, value) row per field.
    pairs = np.column_stack((field_names, field_values))
    return pd.DataFrame(data=pairs, columns=["name", "value"])


def get_sniddata(df_parquet, snid):
    """Fetch the photometry and metadata tables for one SNID.

    Parameters
    ----------
    df_parquet : pandas.DataFrame
        Table with an ``SNID`` column and one row per object.
    snid : object
        Identifier to look up; the first row whose ``SNID`` equals it is used.

    Returns
    -------
    tuple
        ``(detections, metadata)`` as produced by ``get_detections`` and
        ``get_meta`` for the selected row.

    Raises
    ------
    IndexError
        If no row matches ``snid``.
    """
    matching_rows = df_parquet.loc[df_parquet["SNID"] == snid]
    row = matching_rows.iloc[0]
    return get_detections(row, snid), get_meta(row, snid)

In [4]:
def alerce_lcobj(detections, metadata):
    """Convert one object's photometry and metadata into an ``AstroObject``.

    The input ``detections`` frame is never modified: renaming, column
    additions and the ``PHOTFLAG`` drop all happen on a derived frame, so the
    function can be re-run on the same input (e.g. when a notebook cell is
    executed twice) without raising or producing different results.

    Parameters
    ----------
    detections : pandas.DataFrame
        Photometry table. It must contain a ``PHOTFLAG`` column; the columns
        ``MJD``, ``BAND``, ``FLUXCAL`` and ``FLUXCALERR`` are renamed to
        ``mjd``, ``fid``, ``brightness`` and ``e_brightness``.
    metadata : pandas.DataFrame
        Long-format table with ``name`` and ``value`` columns. It must have
        rows named ``RA`` and ``DEC`` whose values are convertible to float.

    Returns
    -------
    AstroObject
        Built with the rows where ``PHOTFLAG > 0`` as ``detections``, all
        remaining rows as ``forced_photometry``, and ``metadata`` passed
        through unchanged.

    Raises
    ------
    KeyError
        If ``detections`` has no ``PHOTFLAG`` column.
    IndexError
        If ``metadata`` has no ``RA`` or ``DEC`` row.
    """

    def _meta_float(field):
        # First value stored under `field` in the name/value table.
        return float(metadata.loc[metadata["name"] == field, "value"].values[0])

    # Evaluate the mask on the untouched input; it shares the input's index
    # with the derived frame below, so boolean indexing stays aligned.
    is_detected = detections["PHOTFLAG"] > 0

    # rename/drop/assign each return a new frame, leaving the caller's
    # `detections` as it was passed in.
    observations = (
        detections.rename(
            columns={
                "MJD": "mjd",
                "BAND": "fid",
                "FLUXCAL": "brightness",
                "FLUXCALERR": "e_brightness",
            }
        )
        .drop(columns=["PHOTFLAG"])
        .assign(
            candid=None,
            tid="elasticc_telescope",
            sid="elasticc_survey",
            pid="elasticc_program",
            ra=_meta_float("RA"),
            dec=_meta_float("DEC"),
            unit="diff_flux",
        )
    )

    # Explicit copies so downstream code can modify either table without
    # touching the other or triggering chained-assignment warnings.
    forced_photometry = observations[~is_detected].copy()
    detected = observations[is_detected].copy()

    return AstroObject(
        detections=detected, forced_photometry=forced_photometry, metadata=metadata
    )


def alerce_feature_listobjs(astro_objects, feature_extractor=None, progress_bar=False):
    """Run a feature extractor over a list of astro objects.

    Parameters
    ----------
    astro_objects : list
        Objects to process, e.g. ``[astro_object1, astro_object2, ...]``.
        The whole list is handed to the extractor in a single batch call.
    feature_extractor : object, optional
        Anything exposing ``compute_features_batch(objects, progress_bar=...)``.
        When omitted, a new ``ElasticcFeatureExtractor`` is created for this
        call, so no extractor instance is shared between calls.
    progress_bar : bool, default False
        Forwarded to ``compute_features_batch``.

    Returns
    -------
    list
        The same ``astro_objects`` list that was passed in. The extractor is
        expected to attach its results to the objects themselves
        (NOTE(review): confirm against the lc_classifier API).
    """
    # A default written as `ElasticcFeatureExtractor()` in the signature would
    # be built once at definition time and reused by every call; build lazily.
    if feature_extractor is None:
        feature_extractor = ElasticcFeatureExtractor()

    # Process the objects given by the caller, not a notebook-level global.
    feature_extractor.compute_features_batch(astro_objects, progress_bar=progress_bar)
    return astro_objects

In [5]:
# Parquet files to process; each entry is later read as "<name>.parquet".
# The trailing comments keep the array shape noted for each file
# (presumably the number of SNIDs it contains — TODO confirm).
pq_filenames = [
    "Cepheid",  # (1662,)
    "d-Sct",  # (8245,)
    "EB",  # (98473,)
    "RRL",  # (45096,)
]

In [6]:
# Locations and layout constants for the per-file feature export below.
INPUT_DIR = "/kaggle/input/elasticc2-parquet"
OUTPUT_DIR = "../data"
# Number of trailing feature rows whose "fid" is overwritten with "nonfilter"
# (presumably features that are not tied to any band — TODO confirm).
N_NONFILTER_FEATURES = 4

# Create the output directory up front so a missing folder cannot make the
# final write fail after the expensive feature computation has finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for pq_filename in tqdm(pq_filenames, desc="Parquet file #"):
    df = pd.read_parquet(f"{INPUT_DIR}/{pq_filename}.parquet")
    snids = df["SNID"].to_numpy()

    # Convert every row of the parquet file into an astro object.
    elasticc_objects = []
    for snid in tqdm(snids, leave=False, desc="Converting to alerce object"):
        detections, metadata = get_sniddata(df, snid)
        elasticc_object = alerce_lcobj(detections, metadata)
        elasticc_objects.append(elasticc_object)

    # Without objects there is no feature table to describe; skip the file
    # instead of reusing a stale `curfeatures_df` from a previous iteration.
    if not elasticc_objects:
        tqdm.write(f"{pq_filename}: no objects found, skipping")
        continue

    feature_extractor = ElasticcFeatureExtractor()
    feature_extractor.compute_features_batch(elasticc_objects, progress_bar=True)

    # The last object's feature table serves as the reference for feature
    # names and count, rather than hard-coding the number of features.
    curfeatures_df = elasticc_objects[-1].features.reset_index(drop=True)
    n_features = len(curfeatures_df)

    # One row per object, one column per feature. The row assignment raises
    # ValueError if an object yields a different number of features.
    featvals = np.zeros(shape=(len(elasticc_objects), n_features))
    for it, elasticc_object in enumerate(
        tqdm(elasticc_objects, leave=False, desc="Calculating features")
    ):
        featvals[it, :] = elasticc_object.features["value"].to_numpy()

    # Label the trailing rows so the name construction below has a string
    # to concatenate for them.
    first_nonfilter = max(n_features - N_NONFILTER_FEATURES, 0)
    curfeatures_df.loc[first_nonfilter:, "fid"] = "nonfilter"

    # Column name = feature name + "_" + bands (commas removed); the suffix
    # is dropped for "ugrizY" and for the "nonfilter" rows.
    curfeatures_df["name"] = (
        (
            curfeatures_df["name"]
            + "_"
            + curfeatures_df["fid"].str.replace(",", "", regex=False)
        )
        .str.replace("_ugrizY", "", regex=False)
        .str.replace("_nonfilter", "", regex=False)
    )

    featnames = curfeatures_df["name"].to_numpy()

    features_df = pd.DataFrame(featvals, index=snids, columns=featnames)

    features_df.to_parquet(f"{OUTPUT_DIR}/{pq_filename}_features.parquet")
    curfeatures_df.loc[:, ["name", "fid", "sid", "version"]].to_csv(
        f"{OUTPUT_DIR}/{pq_filename}_features_info.csv", index=True
    )