In [1]:
import os
from time import process_time

import numpy as np
import pandas as pd

### Step 1: Read the dataframes and merge them

In [2]:
# Folder that holds the raw PLAsTiCC CSV files.
# Set the PLASTICC_DATA_DIR environment variable to point the notebook at another
# copy of the data; when it is unset the original local path is used unchanged.
data_location = os.environ.get("PLASTICC_DATA_DIR", r"D:\Siddharth Data\PLAsTiCC\Raw Data")

In [3]:
# Load the observations and the per-object metadata.
# os.path.join inserts the platform's own separator, so the paths stay valid
# when data_location points somewhere that does not use backslashes.
df = pd.read_csv(os.path.join(data_location, "training_set.csv"))
df_metadata = pd.read_csv(os.path.join(data_location, "training_set_metadata.csv"))

In [4]:
# Attach each object's metadata to its observations.
# `on` spells out the join key instead of relying on "every shared column name",
# and `validate` makes pandas raise MergeError if an object_id occurs more than
# once in the metadata, which would otherwise silently duplicate observation rows.
df = df.merge(df_metadata, on="object_id", validate="many_to_one")

In [5]:
# Preview the merged frame; the notebook repr abbreviates it to the first and last rows.
df

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,59750.4229,2,-544.810303,3.622952,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,615,59750.4306,1,-816.434326,5.553370,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92
2,615,59750.4383,3,-471.385529,3.801213,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92
3,615,59750.4450,4,-388.984985,11.395031,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92
4,615,59752.4070,2,-681.858887,4.041204,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1421700,130779836,60555.9838,4,-39.881969,46.477093,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6
1421701,130779836,60560.0459,1,14.894439,18.947685,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6
1421702,130779836,60571.0225,5,30.593130,50.695290,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6
1421703,130779836,60585.9974,4,-23.471439,44.819859,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6


In [6]:
# The metadata has been merged into df, so release the separate frame.
del df_metadata

### Step 2: Compute the (cumulative) time difference between consecutive observations of each object

In [7]:
# Difference in mjd between each row and the row before it.
# The very first row has no predecessor, so its NaN is replaced by 0.
df["mjd_diff"] = df["mjd"].diff().fillna(0)

In [8]:
# Find the row positions where a new object starts and reset mjd_diff there to 0,
# so that no time difference is carried over from the previous object.
object_ids = df["object_id"].to_numpy()
obj_change_index = np.flatnonzero(object_ids[:-1] != object_ids[1:]) + 1
# obj_change_index holds row *positions*, so write through .iloc. With .loc the
# values would be read as index labels, which only matches the intended rows
# while df carries the default 0..n-1 index.
df.iloc[obj_change_index, df.columns.get_loc("mjd_diff")] = 0

In [9]:
# Running total of mjd_diff, restarted for every object_id via groupby.
mjd_diff_by_object = df.groupby("object_id")["mjd_diff"]
df["cumulative_mjd_diff"] = mjd_diff_by_object.cumsum()

### Step 3: Convert given targets to standard form (0-14) 

In [10]:
# Translate the raw target labels (6, 15, ...) into consecutive class indices (0, 1, ...).
target_dict = {
    6: 0,
    15: 1,
    16: 2,
    42: 3,
    52: 4,
    53: 5,
    62: 6,
    64: 7,
    65: 8,
    67: 9,
    88: 10,
    90: 11,
    92: 12,
    95: 13,
    99: 14,
}
# Series.map looks every label up in the dictionary exactly once, which is the
# intended one-to-one translation even though some numbers (e.g. 6) are both a
# raw label and a class index. Labels missing from the dictionary come back as NaN.
target_class = df["target"].map(target_dict)
if target_class.isna().any():
    # Fail loudly rather than keep raw labels that could be mistaken for class indices.
    unknown_targets = sorted(df.loc[target_class.isna(), "target"].unique().tolist())
    raise ValueError(f"targets without a class mapping: {unknown_targets}")
df["target_class"] = target_class.astype("int64")

### Step 4: Group MJDs within 1 night of each other

In [11]:
# Inspect the frame after adding mjd_diff, cumulative_mjd_diff and target_class.
df

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,mjd_diff,cumulative_mjd_diff,target_class
0,615,59750.4229,2,-544.810303,3.622952,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92,0.0000,0.0000,12
1,615,59750.4306,1,-816.434326,5.553370,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92,0.0077,0.0077,12
2,615,59750.4383,3,-471.385529,3.801213,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92,0.0077,0.0154,12
3,615,59750.4450,4,-388.984985,11.395031,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92,0.0067,0.0221,12
4,615,59752.4070,2,-681.858887,4.041204,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92,1.9620,1.9841,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1421700,130779836,60555.9838,4,-39.881969,46.477093,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6,7.0049,845.5708,0
1421701,130779836,60560.0459,1,14.894439,18.947685,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6,4.0621,849.6329,0
1421702,130779836,60571.0225,5,30.593130,50.695290,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6,10.9766,860.6095,0
1421703,130779836,60585.9974,4,-23.471439,44.819859,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6,14.9749,875.5844,0


In [12]:
# Pull the two input columns out as NumPy arrays and allocate a zero-filled
# output array with the same shape and dtype as the mjd values.
mjd_arr = df["mjd"].to_numpy()
time_diff_arr = df["mjd_diff"].to_numpy()
grouped_mjd_arr = np.zeros(mjd_arr.shape, dtype=mjd_arr.dtype)

In [13]:
# Walk the observations in order and label each one with the mjd of the
# observation that opened its group (kept in prev_time).
prev_time = 0
for i, (current_time, time_diff) in enumerate(zip(mjd_arr, time_diff_arr)):
    # A new group opens when mjd_diff is 0 or when the observation lies more
    # than 0.33 after the current group's opening time.
    opens_new_group = time_diff == 0 or current_time - prev_time > 0.33
    if opens_new_group:
        prev_time = current_time
    # prev_time now equals current_time for a group opener, and the opener's
    # time for every later member of the group.
    grouped_mjd_arr[i] = prev_time

In [14]:
# Store grouped_mjd_arr (filled by the grouping loop) as the grouped_mjd column.
df["grouped_mjd"]=grouped_mjd_arr

In [15]:
# Inspect the frame with the new grouped_mjd column appended.
df

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,ra,decl,gal_l,gal_b,...,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,mjd_diff,cumulative_mjd_diff,target_class,grouped_mjd
0,615,59750.4229,2,-544.810303,3.622952,1,349.046051,-61.943836,320.796530,-51.753706,...,0.0,0.0,0.0,,0.017,92,0.0000,0.0000,12,59750.4229
1,615,59750.4306,1,-816.434326,5.553370,1,349.046051,-61.943836,320.796530,-51.753706,...,0.0,0.0,0.0,,0.017,92,0.0077,0.0077,12,59750.4229
2,615,59750.4383,3,-471.385529,3.801213,1,349.046051,-61.943836,320.796530,-51.753706,...,0.0,0.0,0.0,,0.017,92,0.0077,0.0154,12,59750.4229
3,615,59750.4450,4,-388.984985,11.395031,1,349.046051,-61.943836,320.796530,-51.753706,...,0.0,0.0,0.0,,0.017,92,0.0067,0.0221,12,59750.4229
4,615,59752.4070,2,-681.858887,4.041204,1,349.046051,-61.943836,320.796530,-51.753706,...,0.0,0.0,0.0,,0.017,92,1.9620,1.9841,12,59752.4070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1421700,130779836,60555.9838,4,-39.881969,46.477093,0,301.992188,-17.426323,25.102988,-24.511101,...,0.0,0.0,0.0,,0.091,6,7.0049,845.5708,0,60555.9838
1421701,130779836,60560.0459,1,14.894439,18.947685,0,301.992188,-17.426323,25.102988,-24.511101,...,0.0,0.0,0.0,,0.091,6,4.0621,849.6329,0,60560.0459
1421702,130779836,60571.0225,5,30.593130,50.695290,0,301.992188,-17.426323,25.102988,-24.511101,...,0.0,0.0,0.0,,0.091,6,10.9766,860.6095,0,60571.0225
1421703,130779836,60585.9974,4,-23.471439,44.819859,0,301.992188,-17.426323,25.102988,-24.511101,...,0.0,0.0,0.0,,0.091,6,14.9749,875.5844,0,60585.9974


In [16]:
# Release the temporary arrays that were only needed for the grouping step.
del grouped_mjd_arr, time_diff_arr, mjd_arr

### Step 5: For observations of the same passband within 1 night of each other, choose the one with the least flux_err