In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import glob
from tqdm.auto import tqdm
from typing import List

In [None]:
from lc_classifier.features.composites.elasticc import ElasticcFeatureExtractor
from lc_classifier.features.core.base import AstroObject, FeatureExtractorComposite, FeatureExtractor
from lc_classifier.features.extractors.color_feature_extractor import ColorFeatureExtractor
from lc_classifier.features.extractors.period_extractor import PeriodExtractor
from lc_classifier.features.extractors.folded_kim_extractor import FoldedKimExtractor
from lc_classifier.features.extractors.harmonics_extractor import HarmonicsExtractor
from lc_classifier.features.extractors.mhps_extractor import MHPSExtractor
from lc_classifier.features.extractors.turbofats_extractor import TurboFatsExtractor
from lc_classifier.features.extractors.spm_extractor import SPMExtractor
from lc_classifier.features.extractors.sn_extractor import SNExtractor
from lc_classifier.features.extractors.timespan_extractor import TimespanExtractor
from lc_classifier.features.extractors.coordinate_extractor import CoordinateExtractor

# Object-table columns that are copied into each object's metadata frame.
metacols = ["diaObjectId", "ra", "dec"]

In [None]:
# Per-object table; the later cells read diaObjectId, ra and dec from it.
meta_df = pd.read_csv("/kaggle/input/lsst-dp1-anomaly-hunt/dia_objects_table.csv")

# Light-curve table, ordered chronologically by exposure midpoint.
lc_df = (
    pd.read_csv("/kaggle/input/lsst-dp1-anomaly-hunt/light_curves_table.csv")
    .sort_values("expMidptMJD")
)

In [None]:
# All object ids from the object table, in ascending order.
objids = np.sort(meta_df["diaObjectId"].to_numpy())

In [None]:
# Ids that are excluded from the rest of the notebook.
bad_objids = [579577936487646820, 579577936487646823]

# Ascending list of object ids with the excluded ones removed.
sorted_ids = meta_df["diaObjectId"].sort_values(ascending=True).to_numpy()
objids = np.array([objid for objid in sorted_ids if objid not in bad_objids])

# Restrict both tables to the surviving ids and renumber their rows from zero.
keep_lc_rows = lc_df["diaObjectId"].isin(objids)
lc_df = lc_df[keep_lc_rows].reset_index(drop=True)
keep_meta_rows = meta_df["diaObjectId"].isin(objids)
meta_df = meta_df[keep_meta_rows].reset_index(drop=True)

In [None]:
# Store the retained object ids on disk; their order matches the rows of the
# feature matrix assembled further down.
np.save("objids.npy", arr=objids)

In [None]:
# Look up each object's row positions once, instead of re-scanning both tables
# with a boolean mask on every iteration (O(n_objects * n_rows) work).
# `indices` maps each diaObjectId to the positional row numbers of its rows,
# in their original order.
_no_rows = np.empty(0, dtype=np.intp)
lc_positions = lc_df.groupby("diaObjectId", sort=False).indices
meta_positions = meta_df.groupby("diaObjectId", sort=False).indices

alerce_objects = []
for objid in tqdm(objids,leave=False,desc="Converting to alerce object"):

    # An id without rows yields an empty frame, exactly as the mask would.
    chosen_df = lc_df.iloc[lc_positions.get(objid, _no_rows)]
    meta_row = meta_df.iloc[meta_positions.get(objid, _no_rows)]

    # detections

    mjd = chosen_df["expMidptMJD"]
    band = chosen_df["band"]
    fluxcal = chosen_df["psfDiffFlux"]
    fluxcalerr = chosen_df["psfDiffFluxErr"]
    tot_pts = len(mjd)

    detections_data = {
        "diaObjectId": [objid] * tot_pts,
        "mjd": mjd,
        "fid": band,
        "brightness": fluxcal,
        "e_brightness": fluxcalerr,
    }

    detections = pd.DataFrame(detections_data)
    detections["unit"] = "diff_flux"

    # Object coordinates are repeated on every detection row.
    detections["ra"] = float(meta_row["ra"].values[0])
    detections["dec"] = float(meta_row["dec"].values[0])

    # Constant placeholder identifiers for the remaining detection columns.
    detections["candid"] = None
    detections["tid"] = "LSSTDP1_telescope"
    detections["sid"] = "LSSTDP1_survey"
    detections["pid"] = "LSSTDP1_program"

    # metadata

    # Copy so that adding the "aid" column does not write into a slice of meta_df.
    metadata = meta_row.loc[:,metacols].copy()
    metadata["aid"] = f"aid_{objid}"
    # Reshape the single wide row into a long (name, value) table.
    metadata = (metadata.T).reset_index(drop=False)
    metadata.columns = ["name", "value"]

    # make alerce object
    alerceobj = AstroObject(detections=detections,metadata=metadata)

    # save
    alerce_objects.append(alerceobj)

In [None]:
class DP1FeatureExtractor(FeatureExtractorComposite):
    """Composite feature extractor configured for the six bands u, g, r, i, z, y
    with brightness expressed in the "diff_flux" unit."""

    def _instantiate_extractors(self) -> List[FeatureExtractor]:
        """Build the individual extractors that make up this composite.

        Returns:
            The ten extractor instances, in the order they are listed below.
            Presumably this order fixes the row order of the resulting feature
            table -- confirm against FeatureExtractorComposite.
        """
        bands = ["u", "g", "r", "i", "z", "y"]
        unit = "diff_flux"

        # Extractors are constructed one by one, in their final list order.
        color = ColorFeatureExtractor(bands, just_flux=True)
        mhps = MHPSExtractor(bands, unit)
        period = PeriodExtractor(
            bands,
            unit,
            smallest_period=0.045,
            largest_period=50.0,
            trim_lightcurve_to_n_days=500.0,
            min_length=15,
            use_forced_photo=True,
            return_power_rates=True,
        )
        folded_kim = FoldedKimExtractor(bands, unit)
        harmonics = HarmonicsExtractor(bands, unit, use_forced_photo=False)
        turbofats = TurboFatsExtractor(bands, unit)
        spm = SPMExtractor(bands, unit)
        supernova = SNExtractor(bands, unit, use_forced_photo=False)
        timespan = TimespanExtractor()
        coordinates = CoordinateExtractor()

        return [
            color,
            mhps,
            period,
            folded_kim,
            harmonics,
            turbofats,
            spm,
            supernova,
            timespan,
            coordinates,
        ]

In [None]:
# Run the composite extractor over the whole list of objects. The following
# cells read the results from each object's `features` attribute.
feature_extractor = DP1FeatureExtractor()
feature_extractor.compute_features_batch(
    alerce_objects,
    progress_bar=True,
)

In [None]:
# One row per object, one column per feature (419 features per object).
n_objects = len(alerce_objects)
featvals = np.zeros(shape=(n_objects, 419))

progress = tqdm(alerce_objects, leave=False, desc="Calculating features")
for row_idx, alerce_object in enumerate(progress):
    # `curfeatures_df` is kept after the loop: the next cell takes the
    # feature names from the last object's table.
    curfeatures_df = alerce_object.features.reset_index(drop=True)
    featvals[row_idx] = curfeatures_df["value"].to_numpy()

In [None]:
# Rows labelled 415 and above (four rows) get a placeholder band tag --
# presumably these are the band-independent features; confirm.
curfeatures_df.loc[415:,"fid"] = ["nonfilter"] * 4

# Column name = feature name + "_" + band tag with commas removed; the
# "_ugrizY" and "_nonfilter" suffixes are stripped afterwards.
band_tag = curfeatures_df["fid"].str.replace(",","")
full_name = curfeatures_df["name"] + "_" + band_tag
full_name = full_name.str.replace("_ugrizY","")
curfeatures_df["name"] = full_name.str.replace("_nonfilter","")
featnames = curfeatures_df["name"].to_numpy()

# Feature matrix: rows indexed by object id, columns labelled by feature name.
features_df = pd.DataFrame(
    featvals,
    index=pd.Index(objids, name="diaObjectId"),
    columns=featnames,
)

In [None]:
# Write the feature matrix (indexed by diaObjectId) to parquet.
features_df.to_parquet("dp1_features_sc.parquet")

# Write the per-feature description table next to it. The file name carries no
# object range: all objects are processed in a single pass, and no `start` /
# `end` variables exist in this notebook to interpolate.
curfeatures_df.loc[:,["name","fid", "sid", "version"]].to_csv("dp1_features_info_sc.csv",index=False)