### 1. test plasticc data structure

The question is how to define the length of the time sequence when feeding in the training data.

Take the 'simple' format as an example. Do we need to consider time dilation? Assume that we only use the luminosity variability data, without knowledge of the redshift. We have to give our training and test light curves an identical length. For quasar classification, the point is not where the light curve starts, but how to extract the most useful part of the original data.

Given the PLAsTiCC data structure, each mjd usually has data for only one band, while the other bands are blank. How should these missing values be handled?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

def diff_mjd(x):
    '''Return the spread of x: its largest value minus its smallest value.'''
    latest, earliest = x.max(), x.min()
    return latest - earliest

def early_mjd(x):
    '''Return the smallest value held in x.'''
    earliest = x.min()
    return earliest

def late_mjd(x):
    '''Return the largest value held in x.'''
    latest = x.max()
    return latest

# Load the converted training set; only the observation time (mjd) and the
# object id are needed for the statistics below.
data = pd.read_csv('../input/plasticc-converted-datasets/training_set_converted.csv')
data = data[['mjd','id']]
combine = data.groupby('id')

# Per-object statistics of mjd: covered span, first value and last value.
diff = combine.agg(diff_mjd)
early = combine.agg(early_mjd)
late = combine.agg(late_mjd)

# Series.max()/min() are vectorised and skip NaN; the builtin max()/min()
# iterate element by element and give order-dependent results with NaN.
mjds = data['mjd']
print('latest mjd: ', mjds.max())
print('earliest mjd: ', mjds.min())

# The three histograms differ only in the plotted statistic and the x label.
n_bins = 5
for stat, label in ((diff, 'mjd difference'), (early, 'mjd early'), (late, 'mjd late')):
    plt.figure(figsize=(12, 8), dpi=100)
    plt.hist(stat['mjd'], bins=n_bins)
    plt.xlabel(label, fontsize=16)
    plt.ylabel('number', fontsize=16)
    plt.show()

print('max length of the light curve: ', diff['mjd'].max(), '\n min length: ', diff['mjd'].min())
print('training objects number: ', len(combine))


### 2. light curve analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import random


# Load one converted test batch and collect the ids of the objects it contains.
data = pd.read_csv('../input/plasticc-converted-datasets/test_set_converted_batch2_2.csv')
combine = data.groupby('id')
id_list = [_id for _id, _group in combine]

# A fixed object is inspected; use the commented line to pick one at random instead.
# ram_id = random.choice(id_list)
ram_id = 2744261
print(ram_id)
obj_data = data.loc[data['id'] == ram_id]

time = obj_data['mjd']

plt.figure(figsize=(20, 10))

# One error-bar series per band column, each drawn in its own colour.
# The order below fixes the order of the legend entries.
band_colours = [
    ('g', 'green'),
    ('u', 'blue'),
    ('r', 'red'),
    ('i', 'orange'),
    ('z', 'black'),
    ('y', 'purple'),
]
for band, colour in band_colours:
    plt.errorbar(
        time,
        obj_data[band],
        yerr=obj_data[band + '_err'],
        alpha=0.6,
        fmt='o',
        ecolor=colour,
        color=colour,
        elinewidth=2,
        capsize=3,
        ms=5,
        label=band,
    )

plt.legend(loc="upper left", fontsize=24)
plt.xlabel("Time (days)", fontsize=24)
plt.ylabel("Flux", fontsize=24)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()
# plt.gca().invert_yaxis()

In [None]:
# Display the rows of the selected object as the cell output.
obj_data

It is easy to see that all light curves have two or three big gaps, which are caused by the telescope switching its observing area. Given these gaps, we can still build the 'group' format, but with more flexible boundary dates.
The next step is to calculate a suitable gap length. By the way, the boundary limits in our original code should also be improved.

In [None]:
# Cross-check: show the same object as it appears in the original, unconverted file.
ram_id = 2744261
data = pd.read_csv('../input/PLAsTiCC-2018/test_set_batch2.csv')
obj_data = data.loc[data['object_id'] == ram_id]
obj_data
# Finding: the converted file has a problem. STATA has a method that turns the
# values of a column into header names; an equivalent reshape would be the
# quickest way to convert the files without errors.
# NOTE(review): pandas `DataFrame.pivot_table` looks like the equivalent - confirm.

### 3. mjd gaps analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import random


data = pd.read_csv('../input/plasticc-converted-datasets/training_set_converted.csv')
combine = data.groupby('id')
id_list = [_id for _id, _group in combine]

# Walk the groups once instead of re-filtering the whole frame for every id
# (that costs objects x rows). Rows keep their file order inside each group.
delta_time = []        # differences between consecutive mjd, all objects pooled
observation_time = []  # last mjd minus first mjd, one entry per object
for _id, obj_mjd in combine['mjd']:
    mjd_exit_list = obj_mjd.tolist()
    delta_time.extend(
        later - earlier
        for earlier, later in zip(mjd_exit_list, mjd_exit_list[1:])
    )
    observation_time.append(mjd_exit_list[-1] - mjd_exit_list[0])


# plot

plt.figure(figsize = (8,8))
# Logarithmically spaced bin edges, 10**0 up to 10**2.9, to match the log x axis.
plt.hist(x=delta_time, bins=10**np.arange(0,3,0.1))
plt.xlabel(r'$\Delta$ t (days)', fontsize=20)
plt.ylabel('Number', fontsize=20)
plt.xscale('log')
plt.savefig('delta_time.png')

From the plot we can see that a delta t larger than 10^2 days is likely to be the gap between two observing seasons. In this way, we can estimate a suitable gap length for each light curve.

In [None]:
# Histogram of the number of rows (observations) recorded for each object.
rows_per_object = combine['id'].count()
obj_list = rows_per_object.tolist()

plt.figure(figsize=(8, 8))
plt.hist(x=obj_list)
plt.xlabel('data length', fontsize=20)
plt.ylabel('Number', fontsize=20)
plt.savefig('data_length.png')

### 4. find suitable gap length

In [None]:
def remove_alone_mjd(id_list = list, data = list, check_delta = 300, min_size = 3):
    '''
    Drop short, isolated runs of observations from each object's light curve.

    A "gap" is a pair of neighbouring rows of one object whose mjd differ by
    more than check_delta. Gaps split the rows of an object into segments, and
    rows are dropped as follows:
    - the segment before the first gap, when that gap follows the row at
      position <= min_size within the object (at most min_size + 1 rows);
    - every segment enclosed by two gaps that has at most min_size rows;
    - the first row of the object, when the second row follows it by more than
      260 (a fixed threshold, independent of check_delta).
    The segment after the last gap is never dropped.

    Segments are measured by row position within the object, not by index
    label, so the result does not depend on the labels being consecutive and
    rows of other objects are never touched.

    Input:
    - id_list: the ids of the objects to process; ids absent from data are skipped
    - data: DataFrame with 'object_id' and 'mjd' columns; the rows of each
      object are assumed to be in time order (TODO confirm against the caller)
    - check_delta: mjd difference above which two neighbouring rows form a gap
    - min_size: size limit of the segments that are dropped (see above)
    Returns:
    - new_data: a copy of data without the dropped rows, with a fresh index
    '''
    # Split the frame once; re-filtering it for every id costs objects x rows.
    mjd_by_object = dict(iter(data.groupby('object_id', sort=False)['mjd']))
    # NOTE(review): hard-coded in the original; its relation to check_delta is unclear.
    first_gap_limit = 260

    record_row = []
    for i in id_list:
        obj_mjd = mjd_by_object.get(i)
        if obj_mjd is None:
            continue
        ind_list = obj_mjd.index.tolist()
        mjd_exit_list = obj_mjd.tolist()
        deltas = [
            later - earlier
            for earlier, later in zip(mjd_exit_list, mjd_exit_list[1:])
        ]

        # A first observation far away from the second one is dropped on its own.
        if deltas and deltas[0] > first_gap_limit:
            record_row.append(ind_list[0])

        # Position n means: there is a gap between row n and row n + 1.
        gap_pos = [n for n, delta in enumerate(deltas) if delta > check_delta]
        if not gap_pos:
            continue

        # Leading segment: rows 0 .. first gap.
        if gap_pos[0] <= min_size:
            record_row += ind_list[:gap_pos[0] + 1]

        # Inner segments: rows after one gap up to and including the next gap row.
        for prev_gap, next_gap in zip(gap_pos, gap_pos[1:]):
            if next_gap - prev_gap <= min_size:
                record_row += ind_list[prev_gap + 1:next_gap + 1]

    if record_row: print('yes there are alone mjd!')

    new_data = data.drop(index=list(set(record_row))).reset_index(drop=True)

    return new_data




def combine_narrow_mjd(id_list = list, data=list, check_delta = 0.65):
    '''
    Thin out observations of one object that lie very close together in time,
    for example 2 data points in one mjd.

    Whenever two neighbouring rows of an object are less than check_delta apart
    in mjd, the EARLIER row of the pair is dropped; the rows are not merged.
    Every such pair is printed for inspection.

    Input:
    - id_list: the ids of the objects to process; ids absent from data are skipped
    - data: DataFrame with 'object_id' and 'mjd' columns
    - check_delta: if the difference between two neighbouring data points' mjd
      is smaller than this value, the earlier one is removed
    Returns:
    - new_data: a copy of data without the dropped rows, with a fresh index
    '''
    # Split the frame once; re-filtering it for every id costs objects x rows.
    mjd_by_object = dict(iter(data.groupby('object_id', sort=False)['mjd']))

    record_row = []
    for i in id_list:
        obj_mjd = mjd_by_object.get(i)
        if obj_mjd is None:
            continue
        ind_list = obj_mjd.index.tolist()
        mjd_exit_list = obj_mjd.tolist()
        for n in range(len(mjd_exit_list) - 1):
            delta = mjd_exit_list[n+1] - mjd_exit_list[n]
            if delta < check_delta:
                print('combine: ', delta)
                print('id: ', i, ' mjd: ', mjd_exit_list, 'mjd_n+1: ', mjd_exit_list[n+1], 'mjd_n: ', mjd_exit_list[n])
                record_row.append(ind_list[n])
    new_data = data.drop(index=list(set(record_row))).reset_index(drop=True)
    return new_data


In [None]:
def suitable_gap_length(data = list):
    '''
    Print a running estimate of a gap length shared by all light curves.
    This function is used for light curves with non-identical gap boundaries.

    The data are first cleaned with remove_alone_mjd(check_delta=100,
    min_size=3). A single threshold then starts at 300 and is lowered by 1,
    object by object, until the current object has at least two differences
    between neighbouring mjd that are >= the threshold (or the threshold
    reaches 0). The threshold is never raised again, and its current value is
    printed after every object. Nothing is returned.

    Note: an object with fewer than two such differences available (including
    an object whose rows were all removed by the cleaning step) drives the
    threshold down to 0.

    Input:
    - data: DataFrame with 'object_id' and 'mjd' columns
    '''
    id_list = [_id for _id, _group in data.groupby(data['object_id'])]

    data = remove_alone_mjd(id_list, data, check_delta = 100, min_size = 3)
#     data = combine_narrow_mjd(id_list, data, check_delta = 0.65)

    # Split the cleaned frame once; re-filtering it for every id costs objects x rows.
    mjd_by_object = dict(iter(data.groupby('object_id', sort=False)['mjd']))

    # assume the initial max gap length is 300
    max_gap = 300

    for i in id_list:
        obj_mjd = mjd_by_object.get(i)
        # Objects removed entirely by the cleaning step have no rows left.
        mjd_exit_list = [] if obj_mjd is None else obj_mjd.tolist()
        obj_delta_time = [
            later - earlier
            for earlier, later in zip(mjd_exit_list, mjd_exit_list[1:])
        ]

        # Lower the shared threshold until this object has two gaps that reach it.
        while len([x for x in obj_delta_time if x >= max_gap]) < 2 and max_gap > 0:
            max_gap -= 1
        print(max_gap)

# Finding noted from a run of this function: about 90, 88.
            
            
            
    
    
    

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import random


# Keep only the objects whose metadata has ddf_bool < 1, then restrict the
# converted test batch to those objects.
meta = pd.read_csv('../input/plasticcunblindeddatasets/plasticc_test_metadata.csv')
wdf_obj = meta.loc[meta['ddf_bool'] < 1, 'object_id'].tolist()
del meta  # the metadata table is not needed any further
data = pd.read_csv('../input/plasticc-converted-datasets/converted_test_batch2.csv')
data = data.loc[data['object_id'].isin(wdf_obj)]



In [None]:
# Ids of all objects left in data, and one tenth of their number (rounded down).
combine = data.groupby(data['object_id'])
id_list = [_id for _id, group in combine]
sub_num = len(id_list) // 10

In [None]:
# Display the size of the subsample (one tenth of the object count) as the cell output.
sub_num

In [None]:
# Run the gap search on the first sub_num objects only.
sub_data = data.loc[data['object_id'].isin(id_list[:sub_num])]
suitable_gap_length(sub_data)
