### Build Damped Random Walk Kernel

In [None]:
# GPR kernel 
try: 
    import celerite
except:
    !pip install celerite
import autograd.numpy as np
import celerite
from celerite import terms
from celerite import GP
from scipy.optimize import minimize
import matplotlib.pyplot as plt 
import matplotlib

class DRW_kernel(terms.Term):
    """Damped Random Walk (DRW) kernel written as one real celerite term.

    The two free parameters are ``log_a`` and ``log_c``.  The coefficients
    returned to celerite are ``exp(log_a)**2`` and ``1 / exp(log_c)``, i.e.
    ``exp(log_a)`` acts as the DRW standard deviation (its square is the
    variance) and ``exp(log_c)`` as the characteristic timescale.
    """

    parameter_names = ("log_a", "log_c")

    def get_real_coefficients(self, params):
        """Map ``(log_a, log_c)`` to the real-term coefficient pair."""
        log_a, log_c = params
        amplitude = np.exp(log_a) ** 2
        decay_rate = 1.0 / np.exp(log_c)
        return amplitude, decay_rate

def neg_log_like(params, y, gp):
    """Return the negative log-likelihood of ``y`` under ``gp``.

    ``gp`` is updated in place: its parameter vector is set to ``params``
    before the likelihood is evaluated, which makes this usable as an
    objective function for a minimiser.
    """
    gp.set_parameter_vector(params)
    log_like = gp.log_likelihood(y)
    return -log_like

def grad_neg_log_like(params, y, gp):
    """Return the negated element 1 of ``gp.grad_log_likelihood(y)``.

    ``gp`` is updated in place with ``params`` first.  Element 1 of the
    result is presumably the gradient of the log-likelihood (celerite
    returns a ``(value, gradient)`` pair) -- confirm against the GP class
    in use.
    """
    gp.set_parameter_vector(params)
    value_and_grad = gp.grad_log_likelihood(y)
    return -value_and_grad[1]

def GP_lc(mjd, band, band_err, sigma_in, tau_in,
          sf_bounds=(0.01, 200.0), tau_bounds=(1.0, 60000.0)):
    """Fit a DRW Gaussian process to one light curve by maximum likelihood.

    Parameters
    ----------
    mjd : array-like
        Observation times, passed to ``gp.compute``.
    band : array-like
        Observed values; used as the likelihood data and, through
        ``np.mean(band)``, as the constant GP mean.
    band_err : array-like
        Measurement uncertainties, passed to ``gp.compute``.
    sigma_in : float
        Starting value of the DRW sigma (``log_a`` starts at ``log(sigma_in)``).
    tau_in : float
        Starting value of the DRW timescale (``log_c`` starts at ``log(tau_in)``).
    sf_bounds : pair of float, optional
        Allowed range of ``SF_inf = sqrt(2) * sigma``.  It is divided by
        ``sqrt(2)`` to obtain the bounds on sigma.  The default reproduces
        the previously hard-coded ``[0.01, 200]``.
    tau_bounds : pair of float, optional
        Allowed range of the timescale.  The default reproduces the
        previously hard-coded ``[1, 60000]``.

    Returns
    -------
    gp : GP
        The GP object, left at the best-fit parameter vector.
    SF_inft : float
        ``sqrt(2) * exp(log_a)`` at the optimum.
    tau_out : float
        ``exp(log_c)`` at the optimum.
    """
    kernel = DRW_kernel(log_a=np.log(sigma_in), log_c=np.log(tau_in))
    gp = GP(kernel, mean=np.mean(band))
    gp.compute(mjd, band_err)

    # The optimiser works in log space, so translate the physical limits.
    sqrt2 = np.sqrt(2)
    loga_bounds = (np.log(min(sf_bounds) / sqrt2), np.log(max(sf_bounds) / sqrt2))
    logc_bounds = (np.log(min(tau_bounds)), np.log(max(tau_bounds)))
    bounds = [loga_bounds, logc_bounds]

    # Find the maximum-likelihood parameters for this model.
    initial_params = gp.get_parameter_vector()
    soln = minimize(neg_log_like, initial_params, method="L-BFGS-B",
                    bounds=bounds, args=(band, gp))
    # The last likelihood evaluation is not necessarily the optimum, so
    # explicitly leave the GP at the reported solution.
    gp.set_parameter_vector(soln.x)

    fitted = gp.get_parameter_dict()
    SF_inft = sqrt2 * np.exp(fitted['kernel:log_a'])
    tau_out = np.exp(fitted['kernel:log_c'])
    return gp, SF_inft, tau_out

def predict_lc(gp, mjd, band_name, band, band_err):
    """Plot the GP prediction for one band and return its mean and std.

    The prediction is evaluated on 2000 evenly spaced times spanning
    ``min(mjd)`` to ``max(mjd)``.  The figure shows the data with error
    bars, the predicted mean and a one-standard-deviation band.

    Returns
    -------
    (pred_mean, pred_std) on the 2000-point grid.
    """
    grid = np.linspace(min(mjd), max(mjd), 2000)
    pred_mean, pred_var = gp.predict(band, grid, return_var=True)
    pred_std = np.sqrt(pred_var)
    upper = pred_mean + pred_std
    lower = pred_mean - pred_std

    # Plot the predicted light curve on top of the observations.
    line_color = "#ff7f0e"
    label_size = 20
    plt.figure(figsize=(20, 10))
    plt.errorbar(mjd, band, yerr=band_err, fmt=".k", capsize=0)
    plt.plot(grid, pred_mean, color=line_color)
    plt.fill_between(grid, upper, lower, color=line_color, alpha=0.3, edgecolor="none")
    plt.xlabel("mjd (days)", fontsize=label_size)
    plt.ylabel("flux", fontsize=label_size)
    plt.title(band_name + ' band maximum likelihood prediction', fontsize=label_size)
    plt.show()
    return pred_mean, pred_std

### Select AGN from the training dataset

In [None]:
import pandas as pd
import numpy as np
import csv

# Load the training light curves and keep only the objects whose metadata
# has ddf < 1 and target == 88 (the AGN class used by this notebook).
data = pd.read_csv('../input/PLAsTiCC-2018/training_set.csv')
meta = pd.read_csv('../input/PLAsTiCC-2018/training_set_metadata.csv')
agn_meta = meta[(meta.ddf < 1) & (meta.target == 88)]
agn_data = data[data.object_id.isin(agn_meta['object_id'].tolist())]
del data, meta
# Per-object input model parameters (TAU, REDSHIFT and one SF column per band).
modelpar = pd.read_csv('../input/plasticc-converted-datasets/plasticc_modelpar_088_AGN.csv')

# Band letter -> SF column in modelpar / value of the `passband` column.
sf_name = {'u': 'SFU', 'g': 'SFG', 'r': 'SFR', 'i': 'SFI', 'z': 'SFZ', 'y': 'SFY'}
band_name = {'u': 0, 'g': 1, 'r': 2, 'i': 3, 'z': 4, 'y': 5}
combine = agn_data.groupby('object_id')
id_list = [_id for _id, _ in combine]

# Write one row per (object, band) with the input and fitted DRW parameters.
# The context manager guarantees the file is flushed and closed even if the
# loop is interrupted; newline='' is what the csv module expects.
with open('drw_training_agn_redshift.csv', 'w', newline='') as drw_file:
    wfile = csv.writer(drw_file)
    wfile.writerow(['object_id', 'passband', 'tau_in', 'SF_in', 'tau_out', 'SF_out'])

    for i in id_list:
        agn_obj = agn_data[agn_data.object_id == i]
        paras = modelpar[modelpar.object_id == i]
        true_tau = paras.iloc[0].at['TAU']
        z = paras.iloc[0].at['REDSHIFT']
        for band in ['u', 'g', 'r', 'i', 'z', 'y']:
            # Reset per band so a failure can never report the previous
            # band's SF (or hit an unbound name on the very first band).
            true_SF = None
            try:
                true_SF = paras.iloc[0].at[sf_name[band]]
                band_info = agn_obj[agn_obj.passband == band_name[band]]
                # Times relative to the first epoch, divided by (1 + z)
                # (presumably to move to the rest frame).
                t_obs = (band_info.mjd.values - band_info.mjd.min()) / (z + 1.0)
                flux = band_info.flux.values - band_info.flux.mean()
                flux_err = band_info.flux_err.values
                gp, SF_out, tau_out = GP_lc(t_obs, flux, flux_err, true_SF / np.sqrt(2), true_tau)
            except Exception:
                # Best effort: keep a placeholder row for bands that cannot be fitted.
                wfile.writerow([i, band, true_tau, true_SF, None, None])
            else:
                wfile.writerow([i, band, true_tau, true_SF, tau_out, SF_out])
    

### Select AGN from test sets

In [None]:
import pandas as pd
import numpy as np
import csv
import os
from multiprocessing import Process
from multiprocessing import Manager
from multiprocessing import Pool
import multiprocessing as mp
import time

# Band letter -> name of the column holding that band's input SF value.
sf_name = {letter: 'SF' + letter.upper() for letter in 'ugrizy'}
# Band letter -> integer used in the `passband` column of the light curves.
band_name = {letter: code for code, letter in enumerate('ugrizy')}
    
def process_work(id_list, agn_data, modelpar):
    """Fit the DRW model to every band of every object in ``id_list``.

    Parameters
    ----------
    id_list : iterable
        Object ids to process.
    agn_data : pandas.DataFrame
        Light-curve rows; the columns ``object_id``, ``passband``, ``mjd``,
        ``flux`` and ``flux_err`` are read.
    modelpar : pandas.DataFrame
        Input model parameters; ``object_id``, ``TAU`` and the per-band SF
        columns named in ``sf_name`` are read.

    Returns
    -------
    pandas.DataFrame
        One row per (object, band) with columns ``object_id``, ``passband``,
        ``tau_in``, ``SF_in``, ``tau_out`` and ``SF_out``.  For a band whose
        fit raised, ``tau_out`` and ``SF_out`` are empty.

    Raises
    ------
    IndexError
        If an id in ``id_list`` has no row in ``modelpar``.
    """
    pid = os.getpid()
    print('process id:', pid)
    columns = ['object_id', 'passband', 'tau_in', 'SF_in', 'tau_out', 'SF_out']

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    records = []
    for i in id_list:
        agn_obj = agn_data[agn_data.object_id == i]
        paras = modelpar[modelpar.object_id == i]
        true_tau = paras.iloc[0].at['TAU']
        for band in ['u', 'g', 'r', 'i', 'z', 'y']:
            # Reset per band so a failed fit never reports stale values.
            true_SF = None
            SF_out = tau_out = None
            try:
                true_SF = paras.iloc[0].at[sf_name[band]]
                band_info = agn_obj[agn_obj.passband == band_name[band]]
                t_obs = band_info.mjd.values - band_info.mjd.min()
                flux = band_info.flux.values - band_info.flux.mean()
                flux_err = band_info.flux_err.values
                _, SF_out, tau_out = GP_lc(t_obs, flux, flux_err, true_SF / np.sqrt(2), true_tau)
            except Exception:
                # Best effort: keep a placeholder row for bands that cannot be fitted.
                SF_out = tau_out = None
            records.append({'object_id': i, 'passband': band, 'tau_in': true_tau,
                            'SF_in': true_SF, 'tau_out': tau_out, 'SF_out': SF_out})

    print(pid, ' finished!')
    return pd.DataFrame(records, columns=columns)

def calculate_drw(batch, agn_meta_obj, modelpar, n_processes=4):
    """Fit the DRW model to all selected objects of one test-set batch.

    Reads ``plasticc_test_set_batch<batch>.csv``, keeps the rows whose
    ``object_id`` is in ``agn_meta_obj``, splits the objects into
    contiguous chunks, runs ``process_work`` on the chunks in a process
    pool and writes the combined result to ``drw_test_agn_<batch>.csv``.
    Nothing is written when the batch contains none of the selected objects.

    Parameters
    ----------
    batch : int
        Batch number used in the input and output file names.
    agn_meta_obj : list
        Object ids to keep.
    modelpar : pandas.DataFrame
        Input model parameters, forwarded to ``process_work``.
    n_processes : int, optional
        Maximum number of chunks / worker processes (default 4, the value
        that used to be hard-coded).
    """
    data = pd.read_csv('../input/plasticcunblindeddatasets/plasticc_test_set_batch' + str(batch) + '.csv')
    agn_data = data[data.object_id.isin(agn_meta_obj)]
    del data

    if agn_data.empty:
        return

    id_list = [_id for _id, _ in agn_data.groupby('object_id')]
    print('total id_list num: ', len(id_list))

    # Ceiling division gives at most n_processes contiguous, non-empty,
    # evenly sized chunks (no empty chunks when there are few objects).
    n_processes = max(1, int(n_processes))
    chunk = -(-len(id_list) // n_processes)
    params = []
    for start in range(0, len(id_list), chunk):
        sub_list = id_list[start:start + chunk]
        agn_chunk = agn_data[agn_data.object_id.isin(sub_list)]
        # Ship only the parameter rows a worker needs, to cut pickling cost.
        par_chunk = modelpar[modelpar.object_id.isin(sub_list)]
        params.append((sub_list, agn_chunk, par_chunk))
    del agn_data

    # The context manager tears the pool down even if a worker raises.
    with Pool(min(len(params), mp.cpu_count())) as pool:
        rows = pool.starmap(process_work, params)

    df = pd.concat(rows, ignore_index=True)
    df.to_csv('drw_test_agn_' + str(batch) + '.csv')
    
if __name__ == '__main__':
    # Collect the ids of the test objects with ddf_bool < 1 and
    # true_target == 88, then fit them batch by batch (batches 2..11).
    meta = pd.read_csv('../input/plasticcunblindeddatasets/plasticc_test_metadata.csv')
    agn_meta = meta[(meta.ddf_bool < 1) & (meta.true_target == 88)]
    agn_meta_obj = agn_meta['object_id'].tolist()
    del meta, agn_meta
    modelpar = pd.read_csv('../input/plasticc-converted-datasets/plasticc_modelpar_088_AGN.csv')

    for batch in range(2, 12):
        print('batch: ', batch)
        time_start = time.time()
        calculate_drw(batch, agn_meta_obj, modelpar)
        time_end = time.time()
        elapsed = time_end - time_start
        print('time cost', elapsed, 's')


    