In [14]:
import pandas as pd
import numpy as np

In [17]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from dataclasses import dataclass
from scipy.interpolate import CubicSpline, UnivariateSpline, Akima1DInterpolator


@dataclass
class FeaturesGeneratorByInterpolate:
    """Turn variable-length light curves into fixed-length feature rows.

    Each row of the input parquet file must provide the columns ``mjd``,
    ``mag`` and ``magerr``. The code calls ``.min()``, ``.max()``,
    ``.mean()`` and ``.std()`` on those values, so they are presumably
    NumPy arrays -- TODO confirm against the data files.

    For every row the magnitudes are shifted by their minimum, interpolated
    over ``n_grid`` evenly spaced points between the first and last time
    value, and complemented with the mean and standard deviation of the
    magnitude errors.
    """

    # NOTE(review): DEBUG is a plain class attribute (no annotation, so it is
    # not a dataclass field). While it is True, get_data() only processes the
    # first DEBUG_N_SAMPLES rows of the file; set it to False for a full run.
    DEBUG = True
    DEBUG_N_SAMPLES = 1_00

    # path to the parquet file with data (read with pd.read_parquet)
    path_to_file: str
    # label of class; selects which columns form the observation id
    positive_class: bool
    # number of grid points
    n_grid: int = 100
    # name of Interpolator from scipy.interpolate
    interpolator: str = "Akima1DInterpolator"

    def fit_interpolator(self, x, y):
        """Fit the interpolator selected by ``self.interpolator``.

        Args:
            x: sample positions passed as the first argument to the
                scipy interpolator.
            y: sample values, same length as ``x``.

        Returns:
            A callable scipy interpolator object fitted to ``(x, y)``.

        Raises:
            ValueError: if ``self.interpolator`` is not one of
                "CubicSpline", "Akima1DInterpolator", "UnivariateSpline".
        """
        if self.interpolator == "CubicSpline":
            return CubicSpline(x, y)
        if self.interpolator == "Akima1DInterpolator":
            return Akima1DInterpolator(x, y)
        if self.interpolator == "UnivariateSpline":
            return UnivariateSpline(x, y, s=0.01)
        raise ValueError("Undefined interpolator name")

    def _observation_id(self, item):
        """Build the ``id_obs`` string for one input row."""
        if self.positive_class:
            return f'{item["TIC"]}_{item["cadence_oid"]}'
        return f'{item["oid"]}_{item["mjd"][0]}'

    def _build_row(self, item, verbose=0):
        """Build one output row, or return None when the row must be skipped."""
        obs_id = self._observation_id(item)
        x = item['mjd']
        y = item['mag']

        # A row with a different number of time and magnitude points cannot
        # be interpolated. It is skipped regardless of ``verbose``; the flag
        # only controls whether the skip is reported.
        if len(x) != len(y):
            if verbose == 1:
                print(
                    f'len time points={len(x)} not equl len mag points={len(y)}'
                    f' in data for obs_id={obs_id} object'
                )
            return None

        # subtract the minimum magnitude to normalize the curve
        y_min = y.min()
        fitted = self.fit_interpolator(x, y - y_min)
        grid = np.linspace(x.min(), x.max(), self.n_grid)

        # magnitude errors are summarised, not interpolated
        errors = item['magerr']
        return [obs_id, y_min] + list(fitted(grid)) + [errors.mean(), errors.std()]

    def get_data(self, verbose=0) -> pd.DataFrame:
        """Return a dataframe with ``n_grid`` interpolated magnitude points
        per observation (Akima1DInterpolator by default).

        Args:
            verbose: when equal to 1, print a message for every row that is
                skipped because its ``mjd`` and ``mag`` lengths differ.

        Returns:
            DataFrame with the columns ``id_obs``, ``mag_min``,
            ``mag_0`` ... ``mag_{n_grid - 1}``, ``magerr_mean`` and
            ``magerr_std``; one row per successfully processed input row.
        """
        columns = (
            ['id_obs', 'mag_min']
            + [f'mag_{i}' for i in range(self.n_grid)]
            + ['magerr_mean', 'magerr_std']
        )

        initial_data = pd.read_parquet(self.path_to_file)
        if self.DEBUG:
            initial_data = initial_data.iloc[:self.DEBUG_N_SAMPLES]

        interpolated_rows = []
        for _, item in tqdm(
            initial_data.iterrows(),
            total=initial_data.shape[0],
            desc='Interpolate progress',
        ):
            row = self._build_row(item, verbose)
            if row is not None:
                interpolated_rows.append(row)

        return pd.DataFrame(data=interpolated_rows, columns=columns)

In [18]:
# Generate interpolated features for the negative class
# (positive_class=False) and store the result next to the input data.
FE = FeaturesGeneratorByInterpolate(
    path_to_file="../data/light-curves/lc_1M.parquet",
    positive_class=False,
    n_grid=100,
)
data = FE.get_data()
# quick visual sanity check of the first rows
print(data.head())
data.to_parquet('../data/light-curves/negative_class_iterpolator.parquet')

Interpolate progress: 100%|██████████| 100/100 [00:00<00:00, 491.26it/s]


                        id_obs    mag_min     mag_0     mag_1     mag_2  \
0  606207400005592_58784.42586  18.012810  0.135761  0.147022  0.156183   
1  333205400002686_58314.25815  17.734650  0.124270  0.083282  0.163471   
2   641209100026465_58439.1402  17.458036  0.138110  0.129160  0.123720   
3  282214400186519_58308.25935  16.032854  0.203590  0.178506  0.337038   
4  384201200148025_58316.28859  18.691303  0.547222  0.933766  1.166766   

      mag_3     mag_4     mag_5     mag_6     mag_7  ...    mag_92    mag_93  \
0  0.163484  0.169161  0.173454  0.176541  0.178418  ...  0.181165  0.189475   
1  0.264737  0.315049  0.336934  0.331111  0.290368  ...  0.164272  0.283769   
2  0.121624  0.122706  0.126799  0.133736  0.145443  ...  0.077148  0.117845   
3  0.106763  0.179111  0.324061  0.443431  0.441464  ...  0.255546  0.385760   
4  1.250209  1.264061  1.127910  0.345899  0.870617  ...  0.471589  0.350938   

     mag_94    mag_95    mag_96    mag_97    mag_98    mag_99  mager

In [9]:
# Generate interpolated features for the positive class
# (positive_class=True) and store the result next to the input data.
FE = FeaturesGeneratorByInterpolate(
    path_to_file="../data/light-curves/generated_1m_cleaned.parquet",
    positive_class=True,
    n_grid=100,
)
data = FE.get_data()
# quick visual sanity check of the first rows
print(data.head())
data.to_parquet('../data/light-curves/positive_class_iterpolator.parquet')

Interpolate progress: 100%|██████████| 100/100 [00:00<00:00, 1616.30it/s]


                      id_obs    mag_min     mag_0     mag_1     mag_2  \
0   25081629_279205200027182  17.298798  0.145091  0.085717  0.086209   
1  149216532_437211300030064  16.811097  0.458643  0.518353  0.538753   
2  300037002_383206200105601  19.803655  0.511906  0.712178  0.701132   
3  166739932_330209400077069  17.345351  0.834028  0.891835  0.900227   
4  471015484_334207300000612  16.780750  0.087576  0.091065  0.036723   

      mag_3     mag_4     mag_5     mag_6     mag_7  ...  magerr_90  \
0  0.087488  0.089417  0.091857  0.094669  0.097716  ...   0.020489   
1  0.556065  0.570378  0.581775  0.590343  0.596168  ...   0.019079   
2  0.643646  0.596030  0.613568  0.694103  0.775169  ...   0.139413   
3  0.905895  0.891594  0.855325  0.864469  0.862536  ...   0.025904   
4 -0.000290  0.006019  0.023536  0.043364  0.061827  ...   0.024394   

   magerr_91  magerr_92  magerr_93  magerr_94  magerr_95  magerr_96  \
0   0.021162   0.023474   0.024302   0.020759   0.019061   0.02