In [1]:
# importing packages
# Tabular data handling, plotting, and general utilities.
# NOTE(review): plt, os and np are not referenced in the cells visible here — confirm before pruning.
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
# Sky-coordinate objects and the positional cross-match routine used to pair the two catalogs.
from astropy.coordinates import search_around_sky, SkyCoord
from astropy import units as u
# Used to split the matched table into training and testing partitions.
from sklearn.model_selection import train_test_split
# Raise the pandas display limit so wide frames are shown with up to 999 columns.
pd.set_option('display.max_columns', 999)

Here we match the Stripe 82 light-curve data with the DR14 catalog, which has the ground-truth masses. We build astropy `SkyCoord` objects for both catalogs and cross-match them with `search_around_sky`, using a 0.5 arcsecond tolerance.

In [15]:
# matching LC and mass data
# Load the black-hole mass catalog and the light-curve catalog.
dr14_BH = pd.read_csv('../../data/dr14_BH.csv')
dr14_LC = pd.read_csv('../../data/dr14_LC.csv')

# quasar catalog redshift more reliable, so drop the light-curve 'z' and keep
# only the one coming from dr14_BH
dr14_LC = dr14_LC.drop(columns=['z'])

# Match data attributes in the 2 data sets using astropy's SkyCoord
COORD1 = SkyCoord(dr14_BH['ra'], dr14_BH['dec'], frame='icrs', unit='deg')
COORD2 = SkyCoord(dr14_LC['ra'], dr14_LC['dec'], frame='icrs', unit='deg')
# IDX1[k] / IDX2[k] are the positional row numbers (in dr14_BH / dr14_LC) of the
# k-th pair separated by at most 0.5 arcsec. One object may appear in several pairs.
IDX1, IDX2, OTHER1, OTHER2 = search_around_sky(COORD1, COORD2, seplimit=0.5 * u.arcsec)

# Generating columns for the matched
# Select the matched rows from each catalog in pair order and place them side by
# side. This replaces the per-row `Series.append` loop (removed in pandas 2.0),
# avoids building one Series per match, and preserves each column's dtype
# instead of coercing everything to `object`.
# The index is reset so both halves align on 0..n-1; the 'ra'/'dec' columns of
# both catalogs are kept (as duplicate labels) so the match can be eyeballed.
matched_bh = dr14_BH.iloc[IDX1].reset_index(drop=True)
matched_lc = dr14_LC.iloc[IDX2].reset_index(drop=True)
X_TRAIN = pd.concat([matched_bh, matched_lc], axis=1)

# Drop leftover index columns that an earlier `to_csv` wrote as "Unnamed: N".
X_TRAIN = X_TRAIN.loc[:, ~X_TRAIN.columns.str.contains('^Unnamed')]

We can compare the RA/DEC values on a few rows to confirm that the data matching happened correctly.

In [16]:
# Discard the identifier / epoch columns that are redundant after the match,
# then preview the first rows of the result.
X_TRAIN = X_TRAIN.drop(labels=['SDSS_ID', 'spec_mjd', 'ID'], axis='columns')
X_TRAIN.head()

Unnamed: 0,ra,dec,Mass,z,ERR,M_i,train_id,ra.1,dec.1,u_band,g_band,r_band,i_band,z_band,ug,gr,ri,iz,zu
0,0.0067823,0.583213,8.3602,2.2711,0.0553846,-24.2618,23207,0.00685539,0.583184,23.4436,22.2629,22.2395,22.1408,21.5903,1.18065,0.02345,0.09865,0.55052,-1.85327
1,3.0712,-0.910459,9.45313,3.602,0.0963988,-27.2103,26906,3.0712,-0.910459,28.658,20.9751,20.1096,20.0341,19.92,7.68294,0.86548,0.07549,0.11414,-8.73805
2,21.0685,0.778414,8.41954,1.28229,0.0661145,-23.9201,49699,21.0685,0.778414,21.0188,21.2872,20.7565,20.9358,21.0316,-0.26844,0.53076,-0.1793,-0.09584,0.01282
3,39.6733,-0.417003,9.09312,2.37,0.148789,-26.8862,75574,39.6733,-0.417006,20.1253,19.4597,19.4202,19.338,19.0466,0.66561,0.03956,0.08212,0.2914,-1.07869
4,0.00806797,-0.240974,9.31616,2.163,0.109693,-26.1494,23208,0.00806669,-0.240971,20.3299,20.0662,19.8968,19.8698,19.6881,0.26365,0.16939,0.02705,0.1817,-0.64179


In [19]:
# convert to numeric
def _to_numeric_or_keep(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed.

    Explicit equivalent of `pd.to_numeric(column, errors='ignore')`, which is
    deprecated since pandas 2.2.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Non-numeric content: leave the column exactly as it was.
        return column


# Applied column by column; columns that are already numeric pass through untouched.
X_TRAIN = X_TRAIN.apply(_to_numeric_or_keep)

After matching the data and cleaning the black hole masses, we are left with 20549 objects in total for the ML pipeline.

In [21]:
# Report the (rows, columns) size of the matched table.
matched_shape = X_TRAIN.shape
print(matched_shape)

(20549, 19)


We split the data into a training set (85%) and a testing set (15%).

In [22]:
# split data
# Fix the shuffle seed so the 85/15 partition (and therefore the TRAIN/TEST
# files written from it) is identical on every Restart & Run All.
SPLIT_SEED = 42
train, test = train_test_split(X_TRAIN, test_size=0.15, random_state=SPLIT_SEED)

# check: every row must land in exactly one of the two partitions
test.shape[0] + train.shape[0] == X_TRAIN.shape[0]

True

In [23]:
# Persist the full matched table (before the train/test partition is cleaned).
matched_csv_path = '../../data/matched_dr14.csv'
X_TRAIN.to_csv(matched_csv_path)

In [25]:
# Remove rows with any missing value from each partition, then write both
# partitions to disk.
train, test = train.dropna(), test.dropna()
train.to_csv('../../data/TRAIN_dr14.csv')
test.to_csv('../../data/TEST_dr14.csv')