In [1]:
import pandas as pd
import numpy as np
from time import process_time 

### Step 1: Read dataframe and merge them

In [2]:
data_location=r"D:\Siddharth Data\PLAsTiCC\Raw Data"

In [3]:
df=pd.read_csv(fr"{data_location}\training_set.csv")
df_metadata=pd.read_csv(fr"{data_location}\training_set_metadata.csv")

In [4]:
df=df.merge(df_metadata)

In [5]:
df

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,59750.4229,2,-544.810303,3.622952,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,615,59750.4306,1,-816.434326,5.553370,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92
2,615,59750.4383,3,-471.385529,3.801213,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92
3,615,59750.4450,4,-388.984985,11.395031,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92
4,615,59752.4070,2,-681.858887,4.041204,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1421700,130779836,60555.9838,4,-39.881969,46.477093,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6
1421701,130779836,60560.0459,1,14.894439,18.947685,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6
1421702,130779836,60571.0225,5,30.593130,50.695290,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6
1421703,130779836,60585.9974,4,-23.471439,44.819859,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6


In [6]:
del(df_metadata)

### Step 2: Compute (cumulative) time difference between each observation for each object

In [7]:
#Calculate time diff between all observations in mjd
df["mjd_diff"]=df['mjd'].diff()
df["mjd_diff"]=df["mjd_diff"].fillna(0)

In [8]:
#Find indexes where new objects appear, and set the mjd_diff for this to 0
obj_change_index=np.where(df["object_id"].values[:-1] != df["object_id"].values[1:])[0] + 1
df.loc[obj_change_index, ['mjd_diff']]=0

In [9]:
# Use groupby method to find seperate cumsums for all objects
df["cumulative_mjd_diff"]=df.loc[:,["object_id", "mjd_diff"]].groupby("object_id").cumsum()

### Step 3: Convert given targets to standard form (0-14) 

In [10]:
#Use dictionary to create new column, replacing targets(6,15,...) by class(0,1,...)
target_dict ={
    6: 0,
    15: 1,
    16: 2,
    42: 3,
    52: 4,
    53: 5,
    62: 6,
    64: 7,
    65: 8,
    67: 9,
    88: 10,
    90: 11,
    92: 12,
    95: 13,
    99: 14
}
df["target_class"]=df.loc[:,["target"]].replace(target_dict)

### Step 4: Group MJDs within 1 night of each other

In [11]:
df

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,mjd_diff,cumulative_mjd_diff,target_class
0,615,59750.4229,2,-544.810303,3.622952,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92,0.0000,0.0000,12
1,615,59750.4306,1,-816.434326,5.553370,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92,0.0077,0.0077,12
2,615,59750.4383,3,-471.385529,3.801213,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92,0.0077,0.0154,12
3,615,59750.4450,4,-388.984985,11.395031,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92,0.0067,0.0221,12
4,615,59752.4070,2,-681.858887,4.041204,1,349.046051,-61.943836,320.796530,-51.753706,1,0.0,0.0,0.0,,0.017,92,1.9620,1.9841,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1421700,130779836,60555.9838,4,-39.881969,46.477093,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6,7.0049,845.5708,0
1421701,130779836,60560.0459,1,14.894439,18.947685,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6,4.0621,849.6329,0
1421702,130779836,60571.0225,5,30.593130,50.695290,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6,10.9766,860.6095,0
1421703,130779836,60585.9974,4,-23.471439,44.819859,0,301.992188,-17.426323,25.102988,-24.511101,0,0.0,0.0,0.0,,0.091,6,14.9749,875.5844,0


In [12]:
mjd_arr=df["mjd"].values
time_diff_arr=df["mjd_diff"].values
grouped_mjd_arr=np.zeros_like(mjd_arr)

In [13]:
prev_time=0
for i in range(len(mjd_arr)):
    current_time=mjd_arr[i]
    time_diff=time_diff_arr[i]
    if time_diff==0 or current_time-prev_time>0.33:
        grouped_mjd_arr[i]=current_time
        prev_time=current_time
    else:
        grouped_mjd_arr[i]=prev_time

In [14]:
df["grouped_mjd"]=grouped_mjd_arr

In [15]:
df

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,ra,decl,gal_l,gal_b,...,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,mjd_diff,cumulative_mjd_diff,target_class,grouped_mjd
0,615,59750.4229,2,-544.810303,3.622952,1,349.046051,-61.943836,320.796530,-51.753706,...,0.0,0.0,0.0,,0.017,92,0.0000,0.0000,12,59750.4229
1,615,59750.4306,1,-816.434326,5.553370,1,349.046051,-61.943836,320.796530,-51.753706,...,0.0,0.0,0.0,,0.017,92,0.0077,0.0077,12,59750.4229
2,615,59750.4383,3,-471.385529,3.801213,1,349.046051,-61.943836,320.796530,-51.753706,...,0.0,0.0,0.0,,0.017,92,0.0077,0.0154,12,59750.4229
3,615,59750.4450,4,-388.984985,11.395031,1,349.046051,-61.943836,320.796530,-51.753706,...,0.0,0.0,0.0,,0.017,92,0.0067,0.0221,12,59750.4229
4,615,59752.4070,2,-681.858887,4.041204,1,349.046051,-61.943836,320.796530,-51.753706,...,0.0,0.0,0.0,,0.017,92,1.9620,1.9841,12,59752.4070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1421700,130779836,60555.9838,4,-39.881969,46.477093,0,301.992188,-17.426323,25.102988,-24.511101,...,0.0,0.0,0.0,,0.091,6,7.0049,845.5708,0,60555.9838
1421701,130779836,60560.0459,1,14.894439,18.947685,0,301.992188,-17.426323,25.102988,-24.511101,...,0.0,0.0,0.0,,0.091,6,4.0621,849.6329,0,60560.0459
1421702,130779836,60571.0225,5,30.593130,50.695290,0,301.992188,-17.426323,25.102988,-24.511101,...,0.0,0.0,0.0,,0.091,6,10.9766,860.6095,0,60571.0225
1421703,130779836,60585.9974,4,-23.471439,44.819859,0,301.992188,-17.426323,25.102988,-24.511101,...,0.0,0.0,0.0,,0.091,6,14.9749,875.5844,0,60585.9974


In [16]:
del(grouped_mjd_arr)
del(time_diff_arr)
del(mjd_arr)

### Step 5: For observations of the same passband within 1 night of each other, choose the one with least flux_err

In [17]:
df = df.sort_values("flux_err").groupby(["object_id", "grouped_mjd", "passband"]).first()
df = df.reset_index()

In [18]:
df

Unnamed: 0,object_id,grouped_mjd,passband,mjd,flux,flux_err,detected,ra,decl,gal_l,...,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,mjd_diff,cumulative_mjd_diff,target_class
0,615,59750.4229,1,59750.4306,-816.434326,5.553370,1,349.046051,-61.943836,320.796530,...,1,0.0,0.0,0.0,,0.017,92,0.0077,0.0077,12
1,615,59750.4229,2,59750.4229,-544.810303,3.622952,1,349.046051,-61.943836,320.796530,...,1,0.0,0.0,0.0,,0.017,92,0.0000,0.0000,12
2,615,59750.4229,3,59750.4383,-471.385529,3.801213,1,349.046051,-61.943836,320.796530,...,1,0.0,0.0,0.0,,0.017,92,0.0077,0.0154,12
3,615,59750.4229,4,59750.4450,-388.984985,11.395031,1,349.046051,-61.943836,320.796530,...,1,0.0,0.0,0.0,,0.017,92,0.0067,0.0221,12
4,615,59752.4070,1,59752.4147,-1061.457031,6.472994,1,349.046051,-61.943836,320.796530,...,1,0.0,0.0,0.0,,0.017,92,0.0077,1.9918,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1421046,130779836,60555.9838,4,60555.9838,-39.881969,46.477093,0,301.992188,-17.426323,25.102988,...,0,0.0,0.0,0.0,,0.091,6,7.0049,845.5708,0
1421047,130779836,60560.0459,1,60560.0459,14.894439,18.947685,0,301.992188,-17.426323,25.102988,...,0,0.0,0.0,0.0,,0.091,6,4.0621,849.6329,0
1421048,130779836,60571.0225,5,60571.0225,30.593130,50.695290,0,301.992188,-17.426323,25.102988,...,0,0.0,0.0,0.0,,0.091,6,10.9766,860.6095,0
1421049,130779836,60585.9974,4,60585.9974,-23.471439,44.819859,0,301.992188,-17.426323,25.102988,...,0,0.0,0.0,0.0,,0.091,6,14.9749,875.5844,0


### Step 6: Create passband column

In [19]:
#Drop all unnecessary columns. Note : mjd_diff and cumulative_mjd_diff are dropped as cause problems when pivoting. Will recalculate later
df = df.drop(
    [
        "mjd",
        "detected",
        "ra",
        "decl",
        "gal_b",
        "gal_l",
        "mjd_diff",
        "cumulative_mjd_diff",
#         "ddf",                      #Experiment with these last 3
#         "distmod",
#         "mwebv"
    ],
    axis=1,
)

In [20]:
fixed_features=["ddf","hostgal_specz","hostgal_photoz","hostgal_photoz_err","distmod","mwebv"]
mini_df=df[["object_id"] + fixed_features ].groupby("object_id").first()

df=df.drop(mini_df,axis=1)
df = pd.pivot_table(df, index=["object_id","grouped_mjd","target","target_class"], columns=["passband"])
df.columns= [f"{tup[0]}_passband_{tup[1]}" for tup in df.columns.values]
df=df.reset_index(["grouped_mjd","target","target_class"])
df=df.join(mini_df,how="left")

del(mini_df)

In [None]:
df

In [21]:
df=df.rename(columns={"grouped_mjd": "mjd"})
df=df.reset_index()


#Calculate time diff between all observations in mjd
df["mjd_diff"]=df['mjd'].diff()
df["mjd_diff"]=df["mjd_diff"].fillna(0)
#Find indexes where new objects appear, and set the mjd_diff for this to 0
obj_change_index=np.where(df["object_id"].values[:-1] != df["object_id"].values[1:])[0] + 1
df.loc[obj_change_index, ['mjd_diff']]=0

In [22]:
#Try making the time series uniform everywhere by inputting 0 everyday when data wasn't measured

### Step 7: Convert dataframe into list of form [(nparray,target_class,obj_id)]

In [23]:
df=df.set_index(["object_id"])

In [24]:
df=df.drop(
    [
        "mjd",
        "target",
    ],
    axis=1,
)
df = df.fillna(0)
#Try filling -1 and see if performance improves

In [25]:
df

Unnamed: 0_level_0,target_class,flux_passband_0,flux_passband_1,flux_passband_2,flux_passband_3,flux_passband_4,flux_passband_5,flux_err_passband_0,flux_err_passband_1,flux_err_passband_2,flux_err_passband_3,flux_err_passband_4,flux_err_passband_5,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,mjd_diff
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
615,12,0.000000,-816.434326,-544.810303,-471.385529,-388.984985,0.000000,0.000000,5.553370,3.622952,3.801213,11.395031,0.000000,1,0.0,0.0,0.0,0.0,0.017,0.0000
615,12,0.000000,-1061.457031,-681.858887,-524.954590,-393.480225,-355.886780,0.000000,6.472994,4.041204,3.552751,3.599346,10.421921,1,0.0,0.0,0.0,0.0,0.017,1.9841
615,12,0.000000,-815.188599,-548.013550,-475.516052,-405.663818,-421.199066,0.000000,5.293019,3.462291,3.340643,3.496113,6.377517,1,0.0,0.0,0.0,0.0,0.017,14.8898
615,12,0.000000,-820.042786,-554.903198,-477.004730,-400.270386,-415.286896,0.000000,5.875329,3.927843,3.736262,3.834955,7.435979,1,0.0,0.0,0.0,0.0,0.017,2.9211
615,12,0.000000,-921.002502,-630.523682,-518.533997,-422.184509,-422.815094,0.000000,6.306800,4.333287,3.915225,4.089213,8.124096,1,0.0,0.0,0.0,0.0,0.017,9.1009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130779836,0,0.000000,0.000000,0.000000,0.000000,-39.881969,0.000000,0.000000,0.000000,0.000000,0.000000,46.477093,0.000000,0,0.0,0.0,0.0,0.0,0.091,7.0049
130779836,0,0.000000,14.894439,0.000000,0.000000,0.000000,0.000000,0.000000,18.947685,0.000000,0.000000,0.000000,0.000000,0,0.0,0.0,0.0,0.0,0.091,4.0621
130779836,0,0.000000,0.000000,0.000000,0.000000,0.000000,30.593130,0.000000,0.000000,0.000000,0.000000,0.000000,50.695290,0,0.0,0.0,0.0,0.0,0.091,10.9766
130779836,0,0.000000,0.000000,0.000000,0.000000,-23.471439,0.000000,0.000000,0.000000,0.000000,0.000000,44.819859,0.000000,0,0.0,0.0,0.0,0.0,0.091,14.9749


In [26]:
#Save some memory by converting float64 to float32 as 32 enough
for col in df.columns:
    if df[col].dtype == np.float64:
        df[col] = df[col].astype(np.float32)

In [27]:
#Recalculate time diff between all rows and set new objs to zero as before
all_obj_ids=np.unique(df.index.get_level_values(0).values)
dfarray=df.reset_index().to_numpy()
all_obj_ids_long=dfarray[0:,0]
all_labels_long=dfarray[0:,1]
obj_change_index=np.where(all_obj_ids_long[:-1] != all_obj_ids_long[1:])[0] + 1

In [28]:
tuplist=list(zip(np.insert(obj_change_index,0,0),obj_change_index))
list_of_data_arrays=[]
for tup in tuplist:
    list_of_data_arrays.append((dfarray[tup[0]:tup[1],2:],int(dfarray[tup[0],1]),int(dfarray[tup[0],0])))

In [29]:
len(list_of_data_arrays)

7847

In [30]:
list_of_data_arrays[0][0].shape

(121, 19)

In [31]:
list_of_data_arrays[0][2]

615

In [32]:
list_of_data_arrays              #REQUIRED LIST OF ARRAYS!!! Now, normalization time!

[(array([[ 0.00000000e+00, -8.16434326e+02, -5.44810303e+02, ...,
           0.00000000e+00,  1.70000009e-02,  0.00000000e+00],
         [ 0.00000000e+00, -1.06145703e+03, -6.81858887e+02, ...,
           0.00000000e+00,  1.70000009e-02,  1.98409998e+00],
         [ 0.00000000e+00, -8.15188599e+02, -5.48013550e+02, ...,
           0.00000000e+00,  1.70000009e-02,  1.48898001e+01],
         ...,
         [ 1.21411896e+02,  0.00000000e+00,  0.00000000e+00, ...,
           0.00000000e+00,  1.70000009e-02,  1.00049996e+00],
         [ 0.00000000e+00, -6.07040771e+02,  5.65598183e+01, ...,
           0.00000000e+00,  1.70000009e-02,  4.14389992e+00],
         [ 0.00000000e+00,  6.07047668e+02,  5.52150269e+02, ...,
           0.00000000e+00,  1.70000009e-02,  3.00259995e+00]]),
  12,
  615),
 (array([[ 0.00000000e+00,  7.61504221e+00,  9.11014748e+00, ...,
           4.54062996e+01,  7.00000022e-03,  0.00000000e+00],
         [ 0.00000000e+00,  3.13102794e+00,  4.95306492e+00, ...,
        

In [36]:
# import random

In [41]:
# list2=random.sample(list_of_data_arrays, len(list_of_data_arrays))

In [42]:
# list2[0]

(array([[ 4.27923822e+00,  0.00000000e+00,  0.00000000e+00, ...,
          4.06263008e+01,  1.89999994e-02,  0.00000000e+00],
        [-1.30481195e+00,  0.00000000e+00,  0.00000000e+00, ...,
          4.06263008e+01,  1.89999994e-02,  9.12699997e-01],
        [-3.11944199e+00,  0.00000000e+00,  0.00000000e+00, ...,
          4.06263008e+01,  1.89999994e-02,  1.00230002e+00],
        ...,
        [ 0.00000000e+00,  1.10979497e+00, -2.78696001e-01, ...,
          4.06263008e+01,  1.89999994e-02,  2.99830008e+00],
        [ 1.68533599e+00,  0.00000000e+00,  0.00000000e+00, ...,
          4.06263008e+01,  1.89999994e-02,  2.00040007e+00],
        [-4.53329277e+00,  0.00000000e+00,  0.00000000e+00, ...,
          4.06263008e+01,  1.89999994e-02,  9.98300016e-01]]),
 9,
 34437)

In [44]:
# trainlist=list2[:7066]
# testlist=list2[7066:]

In [43]:
# import pickle

In [45]:
# with open(r"C:\Users\Sid\Desktop\apnatrainpickle", "wb") as fp:   #Pickling
#     pickle.dump(trainlist, fp)
    
# with open(r"C:\Users\Sid\Desktop\apnavalpickle", "wb") as fp:   #Pickling
#     pickle.dump(testlist, fp)