## Create the SNe sample

In [None]:
import math
import os.path
import sys
from operator import itemgetter

sys.path.append(os.path.join(os.getenv('HOME'),'workspace','galbase'))

import astropy.io.fits as pyfits
import astropy.units as u
import matplotlib.cm as cm
import numpy as np
import pandas as pd
from astropy import units as u
from astropy.coordinates import SkyCoord
from astropy.io import fits
from astropy.table import Table, vstack
from astropy.utils.console import ProgressBar
from astropy.utils.data import get_pkg_data_filename
from astropy.wcs import WCS
from matplotlib import pyplot as plt
from reproject import reproject_interp
from scipy.stats import ks_2samp

### Galbase

In [None]:
# Read the galbase FITS table and hand it straight to pandas.
galbase_df = Table.read('gal_base.fits').to_pandas()

Take the PGCNAME, OBJNAME, and TAGS columns, convert them from bytes to strings, and strip whitespace. Also grab the position (RA_DEG, DEC_DEG), INCL_DEG, POSANG_DEG, R25_DEG, DIST_MPC, T, and VHEL_KMS columns. Combine them into one DataFrame that is written out as a csv file.

In [None]:
# (output column, galbase column, needs bytes->str decoding) in CSV column order.
galbase_column_map = [
    ('PGC',      'PGCNAME',    True),
    ('HOST',     'OBJNAME',    True),
    ('RA',       'RA_DEG',     False),
    ('DEC',      'DEC_DEG',    False),
    ('INCL',     'INCL_DEG',   False),
    ('PA',       'POSANG_DEG', False),
    ('R25',      'R25_DEG',    False),
    ('DIST_MPC', 'DIST_MPC',   False),
    ('T',        'T',          False),
    ('VEL_REC',  'VHEL_KMS',   False),
    ('TAGS',     'TAGS',       True),
]

galbase_info_dict = {}
for out_name, src_name, is_bytes in galbase_column_map:
    column = galbase_df[src_name]
    if is_bytes:
        # byte strings -> utf-8 text with surrounding whitespace removed
        column = column.str.decode('utf-8').str.strip()
    galbase_info_dict[out_name] = column

galbase_info = pd.DataFrame(galbase_info_dict)
galbase_info.to_csv('samples/galbase_info.csv', index=False)

### The Open Supernova Catalog

- Cut all entries without RAs and DECs
    - Convert the RAs and DECs into degrees and take the median of the listed values
- Cut entries without dates
- Only keep types Ia, II, IIP, and Ibc

In [None]:
# How many catalog entries have no discovery date?
full_cat = pd.read_csv('osc_full.csv')
full_cat['Disc. Date'].isna().sum()

Read in the full catalog and immediately drop those without RA (assuming no RA = no Dec)

In [None]:
# Reload the catalog, discarding every row whose R.A. is missing.
full_cat = pd.read_csv('osc_full.csv').dropna(subset=['R.A.'])

Functions that will convert RAs and Decs to degrees.

In [None]:
def ra_conversion(ra_time):
    """ Input:  RA string in hrs:mins:secs. The seconds field may be
                missing or empty (e.g. when the minutes already carry the
                decimal fraction); it is then treated as 0.
        Output: RA in degrees
    """
    fields = ra_time.split(':')
    hours  = float(fields[0])
    mins   = float(fields[1])

    # Only a missing or unparsable seconds field falls back to 0; any other
    # problem is allowed to propagate instead of being hidden by a bare except.
    try:
        secs = float(fields[2])
    except (IndexError, ValueError):
        secs = 0.0

    time_in_hours = hours + mins/60.0 + secs/3600.0

    # 24 h of right ascension cover 360 degrees -> 15 degrees per hour
    return time_in_hours*15.0
        

def dec_conversion(dec_time):
    """ Input:  DEC string in degree:arcmin:arcsec. The arcmin and/or arcsec
                fields may be missing (value already in decimal form); a
                missing or unparsable field is treated as 0.
        Output: DEC in degrees, carrying the sign of the degree field
    """
    fields = dec_time.split(':')
    degs   = float(fields[0])

    # apparently some are already in decimal degrees
    try:
        arcmins = float(fields[1])
    except (IndexError, ValueError):
        arcmins = 0.0

    # some already have the arcsecs converted to decimal
    try:
        arcsecs = float(fields[2])
    except (IndexError, ValueError):
        arcsecs = 0.0

    # build the magnitude first; the sign is applied afterwards
    deg_in_decimal_degrees = abs(degs) + (arcmins/60.0) + (arcsecs/3600.0)

    # Look at the sign bit rather than "degs < 0": float('-00') is -0.0,
    # which is not < 0, so values such as '-00:30:00' would otherwise
    # come out positive.
    if math.copysign(1.0, degs) < 0:
        deg_in_decimal_degrees = -deg_in_decimal_degrees

    return deg_in_decimal_degrees

Define a function that will:
- get rid of all entries without a discovery date
- only keep year for discovery date
- get rid of all entries without an RA or Dec
- take the median of the RAs and Decs listed for each SN

In [None]:
def clean_catalog(full_catalog):
    """ Input: full_catalog (the entire OSC catalog as a DataFrame)
        Output: tuple of parallel lists, in this order:
            names, dates, mmaxs, hosts, zs, types, phots, specs,
            ras_deg, decs_deg, ras_diff, decs_diff
        - exclude all entries without a discovery date
        - keep only the leading (year) part of the discovery date
        - convert every listed RA and Dec to degrees and take the median
        - record max - min of the listed RAs / Decs (degrees) so entries
          whose sources disagree can be spotted
    """
    # create lists that will store the info
    names     = []
    dates     = []
    mmaxs     = []
    hosts     = []
    ras_deg   = []
    ras_diff  = []
    decs_diff = []
    decs_deg  = []
    zs        = []
    types     = []
    phots     = []
    specs     = []

    # loop over every single entry in the catalog; make certain cuts and clean up ras and decs
    bar = ProgressBar(len(full_catalog), ipython_widget=True)
    for index, row in full_catalog.iterrows():
        bar.update()

        # get rid of entries that have no date (get rid of remnants)
        orig_date = row['Disc. Date']
        if pd.isnull(orig_date):
            continue

        # get dates into just year
        date_year = (orig_date.split('/'))[0]

        # a cell may hold several comma-separated values from different sources;
        # drop RA entries that are just the 0 placeholder
        ra_strings  = [val for val in str(row['R.A.']).split(',') if val != '00:00:00.000']
        dec_strings = str(row['Dec.']).split(',')

        # convert ras and decs to degrees
        ras_deg_list  = [ra_conversion(ra) for ra in ra_strings]
        decs_deg_list = [dec_conversion(dec) for dec in dec_strings]

        # take the median
        ra_deg  = np.median(ras_deg_list)
        dec_deg = np.median(decs_deg_list)

        # difference between max coords and min coords (in degrees);
        # max()/min() raise ValueError when no RA survived the filter above,
        # in which case both spreads are recorded as 0
        try:
            ra_diff  = max(ras_deg_list) - min(ras_deg_list)
            dec_diff = max(decs_deg_list) - min(decs_deg_list)
        except ValueError:
            ra_diff  = 0
            dec_diff = 0

        # collect all data into the lists
        names.append(row['Name'])
        dates.append(date_year)
        mmaxs.append(row['mmax'])
        # get rid of whitespace to xmatch with galbase
        hosts.append(str(row['Host Name']).replace(" ", ""))
        zs.append(row['z'])
        types.append(row['Type'])
        phots.append(row['Phot.'])
        specs.append(row['Spec.'])
        ras_deg.append(ra_deg)
        decs_deg.append(dec_deg)
        ras_diff.append(ra_diff)
        decs_diff.append(dec_diff)

    return(names, dates, mmaxs, hosts, zs, types, phots, specs, ras_deg, decs_deg, ras_diff, decs_diff)

In [None]:
# Clean the R.A.-filtered catalog; the unpacking order mirrors clean_catalog's return statement.
names, dates, mmaxs, hosts, zs, types, phots, specs, ras_deg, decs_deg, ras_diff, decs_diff = clean_catalog(full_cat)


In [None]:
# Keyword order here is the column order of the output CSV.
clean_osc_dict = dict(
    NAME=names,
    DATE=dates,
    HOST=hosts,
    RA=ras_deg,
    RA_diff=ras_diff,
    DEC=decs_deg,
    DEC_diff=decs_diff,
    TYPE=types,
    mmax=mmaxs,
    z=zs,
    PHOT=phots,
    SPEC=specs,
)

# pass the keys explicitly so the column order is kept
clean_osc = pd.DataFrame(clean_osc_dict, columns=list(clean_osc_dict))
# entries with no SN type are dropped before saving
clean_osc = clean_osc.dropna(subset=['TYPE'])
clean_osc.to_csv('samples/clean_osc_full.csv', index=False)

Drop SN types that are not Ia, II, and Ibc.

In [None]:
# Reload the cleaned catalog written above (all SN types still present).
clean_osc = pd.read_csv('samples/clean_osc_full.csv')

In [None]:
# narrow down the parent table by SN type
kept_types = [
    'Ia',
    'II',
    'II P',
    'II L',
    'II-P/L',  # just 1 SN
    'IIb',
    'Ib',
    'Ic',
    'Ibc',
    'Ib/c',
]
bitmask = clean_osc.TYPE.isin(kept_types)
clean_osc = clean_osc[bitmask]

clean_osc = clean_osc[clean_osc['TYPE'].notnull()]
clean_osc.to_csv('samples/clean_osc.csv', index=False)

In [None]:
# display the type-filtered catalog
clean_osc

### Cross-match The Open Supernova Catalog with the original z0MGS images

In [None]:
def check_in_image(names, dates, hosts, ras, ras_diff, decs, decs_diff, types, zs, phots, specs, wcs):
    """ Input:  parallel sequences describing every SN, plus the wcs of one image
                (the caller passes an astropy WCS built from the image header)
        Output: the same sequences restricted to SNe whose pixel position lies
                inside the image, as numpy arrays in the order
                name, date, host, ra, ra_diff, dec, dec_diff, type, z, phot, spec,
                followed by the pixel x and y coordinates of those SNe
    """
    # (RA, Dec) pairs -> pixel coordinates, origin argument 0
    sky_pairs = np.column_stack((ras, decs))
    pixels    = wcs.wcs_world2pix(sky_pairs, 0)
    pix_x     = pixels[:, 0]
    pix_y     = pixels[:, 1]

    # our version of footprint_contains: compare against the image size
    naxis  = wcs._naxis
    x_ok   = (pix_x >= 0) & (pix_x <= naxis[0]-1)  # because of 0-indexing
    y_ok   = (pix_y >= 0) & (pix_y <= naxis[1]-1)
    inside = x_ok & y_ok

    def keep(values):
        # restrict one input sequence to the SNe that fall in the image
        return np.array(values)[inside]

    return(keep(names), keep(dates), keep(hosts), keep(ras), keep(ras_diff), keep(decs), keep(decs_diff),
           keep(types), keep(zs), keep(phots), keep(specs), keep(pix_x), keep(pix_y))

In [None]:
def build_table(name_in_image, date_in_image, img_name, host_in_image, ra_in_image, ra_diff_in_image, 
                dec_in_image, dec_diff_in_image, types_in_image, z_in_image, phot_in_image, spec_in_image):
    """ Input:  per-SN arrays for the SNe found in one image, plus img_name,
                the single image name stored in the PGC column
        Output: a Table with columns NAME, DATE, PGC, HOST, RA, RA_diff,
                DEC, DEC_diff, TYPE, z, PHOT, SPEC (in that order)
    """
    # columns are added one at a time, in output order
    ordered_columns = (
        ('NAME',     name_in_image),
        ('DATE',     date_in_image),
        ('PGC',      img_name),
        ('HOST',     host_in_image),
        ('RA',       ra_in_image),
        ('RA_diff',  ra_diff_in_image),
        ('DEC',      dec_in_image),
        ('DEC_diff', dec_diff_in_image),
        ('TYPE',     types_in_image),
        ('z',        z_in_image),
        ('PHOT',     phot_in_image),
        ('SPEC',     spec_in_image),
    )

    t = Table()
    for column_name, values in ordered_columns:
        t[column_name] = values
    return(t)

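Right now use clean_osc_full.csv, which still contains every SN type. To restrict the cross-match to types Ia, II, and Ibc only (no IIn, Ia-Pec), switch to clean_osc.csv via the commented-out line below.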

In [None]:
# osc = pd.read_csv('samples/clean_osc.csv')
osc = pd.read_csv('samples/clean_osc_full.csv')

# pull each column out as its own Series, in the order check_in_image expects
(names, dates, hosts, ras, ras_diff, decs,
 decs_diff, types, zs, phots, specs) = (
    osc[column] for column in ('NAME', 'DATE', 'HOST', 'RA', 'RA_diff', 'DEC',
                               'DEC_diff', 'TYPE', 'z', 'PHOT', 'SPEC'))

In [None]:
# Cross-match every SN against each '*_w1_gauss7p5.fits' image in the delivery
# directory and collect one table per image that contains at least one SN.
table_list = []

count = 0
image_dir = '/data/kant/0/leroy.42/allsky/delivery/'
# images that are skipped by name
skipped_images = ('PGC2557', 'PGC5818')

# list the directory once so the bar length and the loop agree
filenames = os.listdir(image_dir)
bar = ProgressBar(filenames, ipython_widget=True)
for filename in filenames:
    # advance the bar exactly once per directory entry, whatever happens below
    bar.update()

    if not filename.endswith('_w1_gauss7p5.fits'):
        continue

    img_dir = os.path.join(image_dir, filename)
    img_name = filename.split('_')[0]
    if img_name in skipped_images:
        continue

    # the context manager closes the FITS file on every path, including the
    # "no SN in this image" and error paths that previously leaked the handle
    with pyfits.open(img_dir) as hdulist:
        wcs = WCS(hdulist[0].header)

        # call check_in_image
        try:
            name_in_image, date_in_image, host_in_image, ra_in_image, ra_diff_in_image, dec_in_image, dec_diff_in_image, types_in_image, z_in_image, phot_in_image, spec_in_image, x_coord, y_coord = check_in_image(names, dates, hosts, ras, ras_diff, decs, decs_diff, types, zs, phots, specs, wcs)
            if len(x_coord) > 0:
                t = build_table(name_in_image, date_in_image, img_name, host_in_image, ra_in_image, ra_diff_in_image, dec_in_image, dec_diff_in_image, types_in_image, z_in_image, phot_in_image, spec_in_image)
                table_list.append(t)
        except Exception as err:
            # still best-effort (the image is skipped), but say which one and why
            print('Skipping {}: {}'.format(filename, err))

In [None]:
# Stack the per-image tables into one table (xmatch is an astropy Table here;
# the name is rebound to a pandas DataFrame in a later cell).
xmatch = vstack(table_list)
xmatch_df = xmatch.to_pandas()
# xmatch_df.to_csv('samples/xmatch.csv', index=False)
xmatch_df.to_csv('samples/xmatch_full.csv', index=False)
# display the cross-matched sample
xmatch_df

#### Add columns that have host galaxy redshift, the difference between host galaxy redshift and SN redshift, and a believable vs. doubtful sample.
We want to add a column for host galaxy redshift and the difference between host galaxy redshift and SN redshift. This will allow us to make cuts based on redshift to ensure that SN are being placed in their correct host galaxies.

Add galbase information to xmatch.csv

In [None]:
# xmatch = pd.read_csv('samples/xmatch.csv')
xmatch = pd.read_csv('samples/xmatch_full.csv')
galbase  = pd.read_csv('samples/galbase_info.csv')

# galaxy columns to attach; RA/DEC/VEL_REC are renamed so they do not
# collide with the SN columns already present in xmatch
galbase_keep   = ['PGC', 'RA', 'DEC', 'INCL', 'PA', 'R25', 'VEL_REC', 'T']
galbase_rename = {'RA': 'RA_GAL', 'DEC': 'DEC_GAL', 'VEL_REC': 'VEL'}

galbase_df = galbase[galbase_keep].rename(columns=galbase_rename)
xmatch = xmatch.merge(galbase_df, on='PGC')
xmatch

In [None]:
# Median SN redshift per row. A 'z' cell may hold several comma-separated
# values; each is turned into a float and the median is kept. A missing z
# stringifies to 'nan' and therefore stays NaN.
# The loop variable is deliberately not called `zs`, so the `zs` Series used
# as input by the cross-match cell is left intact.
zs_avg = [
    np.median([float(val) for val in str(z_field).split(',')])
    for z_field in xmatch['z']
]

Define a function that converts recessional velocity to redshift.

In [None]:
# convert recessional velocity to redshift
def vel_to_z(v):
    """ Input:  recessional velocity (presumably km/s, since the caller
                feeds it the galbase VHEL_KMS values)
        Output: redshift, computed as v / c with c taken as 3 x 10^5
    """
    speed_of_light = 3 * 10**5
    return v / speed_of_light

# replace the raw z strings with the per-row median, and add the host redshift
xmatch['z']     = zs_avg
xmatch['z_gal'] = xmatch['VEL'].apply(vel_to_z)

# absolute difference between the SN redshift and that of its supposed galaxy
z_offset = xmatch['z'] - xmatch['z_gal']
xmatch['z_diff'] = z_offset.abs()

Designate a believable sample and a doubtful sample based on if redshift and host galaxy information are missing.

In [None]:
# 'D' (doubtful) when both the redshift and the host name are missing,
# 'B' (believable) otherwise
sample_type = [
    'D' if (str(z_value) == 'nan') and (str(host_name) == 'nan') else 'B'
    for z_value, host_name in zip(xmatch['z'], xmatch['HOST'])
]

xmatch['SAMPLE'] = sample_type

In [None]:
# keep only the believable sample
is_believable = xmatch['SAMPLE'].eq('B')
xmatch = xmatch[is_believable]
xmatch

#### Target duplicates.

Duplicates occur when there is more than one galaxy in a given field (or, less commonly, when there are multiple entries for one SN in the OSC). We need to get rid of duplicates by matching each SN with its correct PGC.

In [None]:
# every row whose SN name appears more than once
xmatch_dup = xmatch[xmatch.duplicated(subset='NAME', keep=False)]

To match the correct PGCs with the correct SNe, we will first draw an ellipse of 2*R25 over each galaxy and check whether each SN falls within this ellipse. The result is written to a file. This is our potential SNe sample with no cuts based on redshift or type and with duplicates still present (though duplicates are indicated).

Get rid of everything that lacks a position angle or an inclination, or that has an inclination > 60 degrees.

In [None]:
# keep rows that have both a position angle and an inclination ...
has_orientation = xmatch['PA'].notnull() & xmatch['INCL'].notnull()
xmatch = xmatch[has_orientation]
# ... and whose inclination is at most 60
xmatch = xmatch[xmatch['INCL'] <= 60.]

Keep everything within 2*R25.

In [None]:
def ellipse(delta_ra, delta_dec, pa, incl, r25_rad, n_r25=2.0):
    """ Input:  delta_ra, delta_dec  offsets between galaxy and SN
                pa, incl             position angle and inclination in degrees
                r25_rad              R25, in the same units as the offsets
                n_r25                size of the ellipse in units of R25
                                     (default 2, i.e. the 2*R25 ellipse)
        Output: (part1, part2), the two terms of the ellipse equation; the
                point lies inside the ellipse when part1 + part2 <= 1
    """
    theta = np.radians(pa + 90)
    cos_t = math.cos(theta)
    sin_t = math.sin(theta)

    # semi-axes: n_r25*R25 along one axis, foreshortened by cos(incl) along the other
    semi_major = n_r25 * r25_rad
    semi_minor = semi_major * math.cos(np.radians(incl))

    # Both terms divide by the SQUARE of the full semi-axis. The first term
    # previously divided by 2*R25**2 instead of (2*R25)**2, which shrank that
    # axis to sqrt(2)*R25.
    part1 = ((-delta_ra*cos_t + delta_dec*sin_t) / semi_major)**2
    part2 = ((-delta_ra*sin_t - delta_dec*cos_t) / semi_minor)**2
    return(part1, part2)

In [None]:
def in_gal(samp):
    """ Input:  samp, a DataFrame with galaxy columns RA_GAL, DEC_GAL, R25,
                INCL, PA and SN columns RA, DEC (all angles in degrees)
        Output: list with one entry per row: 1 if the SN lies inside the
                galaxy's inclined 2*R25 ellipse, 0 otherwise
    """
    in_galaxy = []

    bar = ProgressBar(len(samp), ipython_widget=True)
    for index, row in samp.iterrows():

        bar.update()

        # grab ra, dec, R25, inclination, and position angle; get ra, dec, and R25 in radians
        ra_gal  = np.radians(row['RA_GAL'])
        dec_gal = np.radians(row['DEC_GAL'])
        r25     = np.radians(row['R25'])
        incl    = row['INCL']
        pa      = row['PA']

        # grab RA and DEC of SN; get them in radians
        ra_sn  = np.radians(row['RA'])
        dec_sn = np.radians(row['DEC'])

        # RA difference, wrapped into [-pi, pi) so pairs straddling RA = 0/360
        # are not treated as a full circle apart
        delta_ra = (ra_gal - ra_sn + np.pi) % (2*np.pi) - np.pi
        # a step in RA covers cos(dec) times less sky than the same step in
        # Dec, so scale it to get an on-sky offset
        delta_ra = delta_ra * np.cos(dec_gal)
        delta_dec = dec_gal - dec_sn

        part1, part2 = ellipse(delta_ra, delta_dec, pa, incl, r25)
        e = part1 + part2

        # compare distance with eqtn for inclined, oblique ellipse
        in_galaxy.append(1 if e <= 1 else 0)
    return(in_galaxy)

In [None]:
# flag (1/0) for every row of xmatch: does the SN fall inside its galaxy's ellipse?
in_galaxy = in_gal(xmatch)

In [None]:
# store the ellipse flag and save the potential sample (no redshift/type cuts yet)
xmatch['IN_2R25'] = in_galaxy
# xmatch.to_csv('samples/potential_sample.csv', index=False)
xmatch.to_csv('samples/potential_sample_full.csv', index=False)

Drop SNe that fall outside of 2*R25 or whose redshift differs from that of the host galaxy by more than 0.002 (the threshold used in the code below).

In [None]:
# potential_sample = pd.read_csv('samples/potential_sample.csv')
potential_sample = pd.read_csv('samples/potential_sample_full.csv')

# keep SNe inside the ellipse whose redshift agrees with the host to within 0.002
inside_ellipse = potential_sample['IN_2R25'] == 1
z_consistent   = potential_sample['z_diff'] <= 0.002
potential_sample = potential_sample[inside_ellipse & z_consistent]
potential_sample

Check for duplicates. (Our strict constraints seem to have weeded out any potential duplicates. If in the future there are duplicates, see sne_sample in the old directory for how to deal with these.)

In [None]:
# show any SN name that still appears more than once
potential_sample[potential_sample.duplicated(subset='NAME', keep=False)]

In [None]:
# write the final SN sample (built from the full, all-types catalog)
potential_sample.to_csv('samples/sne_sample_full.csv', index=False)
# potential_sample.to_csv('samples/sne_sample.csv', index=False)
potential_sample