In [1]:
from astropy.io import fits
from astropy.table import Table

import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

import sys
sys.path.append('../corv-dev/corv/src')
import corv

#to import SDSS-V data
import fsspec
import requests
import aiohttp
import os

could not find pickled WD models
/Users/vedantchandra/0_research/01_sdss5/006_build_corv/data/comm_cat/
star and exposure catalogs not found! check paths and run make_catalogs() if you want to use sdss functionality. otherwise ignore.


In [2]:
# Load the SDSS-V access credentials and the Gaia crossmatch table.
# creds.txt is read as one entry per line; the first two entries are later
# used as the (username, password) pair for HTTP authentication.
with open('creds.txt') as f:
    creds = f.read().splitlines()

# Fail here, next to the cause, instead of with a bare IndexError when the
# session is configured further down.
if len(creds) < 2:
    raise ValueError(
        'creds.txt must contain at least two lines '
        '(username on line 1, password on line 2); found {}'.format(len(creds))
    )

catalog = Table.read('data/SDSS_V_Gaia_xmatch.csv')

In [3]:
# Peek at the first catalog entry's spectrum file name to see the naming pattern.
catalog[0]['spec_file']

'spec-101748-59846-27021597834828397.fits'

For some reason, Nicole's code for programmatically accessing SDSS-V data doesn't work on my machine. My strategy is to split the sample into batches of 200-300 spectra, analyze each spectrum, delete the files from my computer, and then download the next batch. The emission line strategy is going to be:
1. Fit two Voigt profiles with negative amplitude and two with positive amplitude
2. Plot the mean amplitude of the two positive guys
3. Look for a good bifurcation point

SDSS-V data is stored at:
https://data.sdss5.org/sas/sdsswork/bhm/boss/spectro/redux/v6_1_0/spectra


The spectra directory is organized as follows:

    - Plate number (ex. 15000/)
        - MJD (ex. 15000/59146/)
            - spectum fits file labeled by spec-PLATE-MJD-CATALOGID.fits (ex. spec-015000-59146-4375786564.fits)
            - the most recent spAll file also has a SPEC_FILE column which gives the name of the fits file

Download and open spAll-master.fits

    - This is a table that has information about objects observed in the plate and FPS programs
    
    
Confirmed accretion disk: https://data.sdss5.org/sas/sdsswork/bhm/boss/spectro/redux/v6_1_0/spectra/full/015333/59306/spec-015333-59306-5276309279.fits

In [None]:
from multiprocessing import Pool
import tqdm

from lmfit.models import Model, ConstantModel, VoigtModel

import warnings
warnings.filterwarnings("ignore")

# One shared HTTP session so every download is sent with the same credentials.
session = requests.Session()
# The first two lines of creds.txt are used as the (username, password) pair.
session.auth = (creds[0],creds[1])


def download_spectrum(rows, outfile = 'tempfiles/'):
    """Download one SDSS-V spectrum and return it as an opened FITS file.

    Parameters
    ----------
    rows : catalog row
        A single row of the crossmatch catalog. Only its 'spec_file' entry is
        read; the plate and MJD are taken from the second and third
        '-'-separated fields of that name. (The parameter keeps its original
        name `rows` for backward compatibility even though it is one row.)
    outfile : str
        Directory in which the spectrum is temporarily written. It is created
        if it does not exist.

    Returns
    -------
    The object returned by `fits.open` for the downloaded spectrum.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. bad credentials or a
        missing file), instead of silently saving the error page as a .fits file.

    Notes
    -----
    As in the original implementation, the temporary file is deleted while the
    returned FITS handle is still open; this relies on the operating system
    allowing an open file to be unlinked.
    """
    # Use the argument that was actually passed in; the previous body read a
    # global named `row`, which only worked when the caller's loop variable
    # happened to have that name.
    row = rows

    spec_file = row['spec_file']
    name_parts = spec_file.split('-')
    plate = name_parts[1]
    mjd = name_parts[2]

    filepath = 'https://data.sdss5.org/sas/sdsswork/bhm/boss/spectro/redux/v6_1_0/spectra/full/{}/{}/{}'.format(plate, mjd, spec_file)

    # Check the HTTP status before touching the disk.
    response = session.get(filepath)
    response.raise_for_status()

    os.makedirs(outfile, exist_ok=True)
    local_path = os.path.join(outfile, spec_file)

    with open(local_path, 'wb') as f:
        f.write(response.content)

    try:
        file = fits.open(local_path)
    finally:
        # Remove only the file written above (not everything in the directory),
        # and do so even if the download turned out not to be a valid FITS file.
        os.remove(local_path)

    return file




    
def analysis_function(file, ii):
    """Fit the H-alpha region of one spectrum with an absorption + emission model.

    The fit is done in two stages:

    1. A constant minus one Voigt profile (prefix 'down') is fit to the line
       wings only, i.e. 6464-6664 with the 6561-6569 core excluded.
    2. Every parameter of that first model is frozen at its fitted value, a
       second Voigt profile (prefix 'up') is added, and the combined model is
       fit to the whole 6464-6664 window.

    A plot of the data and the final best fit is saved to './plots/fits/<ii>'.

    Parameters
    ----------
    file : opened FITS file
        Extension 1 must provide the 'LOGLAM', 'FLUX' and 'IVAR' columns.
    ii : int or str
        Label used as the file name of the saved plot.

    Returns
    -------
    list
        [base_result, result]: the lmfit results of stage 1 and stage 2.
    """
    # first, get the relevant data from the file
    wl = 10**file[1].data['LOGLAM']
    fl = file[1].data['FLUX']
    ivar = file[1].data['IVAR']

    # continuum normalize using the function supplied by corv
    wl, fl, ivar = corv.utils.continuum_normalize(wl, fl, ivar)

    # outmask selects the window around Ha, inmask selects the very core of the line
    outmask = (6464 < wl) * (wl < 6664)
    inmask = (6561 < wl) * (wl < 6569)

    # fit the first model using only the wings of the line
    mask = outmask * ~inmask

    # the 'down' profile is subtracted and its amplitude is bounded below by 0,
    # so its net contribution to the model is never positive
    model = ConstantModel() - VoigtModel(prefix = 'down')
    model.set_param_hint('downamplitude', value = 15, min = 0)
    model.set_param_hint('downsigma', value = 15, min = 0)
    model.set_param_hint('downcenter', value =  6564.61, min = 6534.61, max =  6594.61)

    # NOTE: the former amp=/cen=/wid= keyword arguments matched no parameter of
    # this model (lmfit ignores such keywords), so they have been dropped.
    base_result = model.fit(fl[mask], x=wl[mask], nan_policy='omit')

    # for the second model, lock in the already-fit parameters for the first model and do not allow them to vary
    for name in (model.param_names):
        val = base_result.params[name].value
        model.set_param_hint(name, value = val, vary = False)

    # change the mask to fit all wavelengths around the Ha line
    mask = outmask

    # the second Voigt profile can have positive or negative amplitude
    model += VoigtModel(prefix = 'up')
    model.set_param_hint('upamplitude', value = 15, max = 100)
    model.set_param_hint('upsigma', value = 15, min = 0, max = 100)
    model.set_param_hint('upcenter', value =  6564.61, min =  6556.61, max =  6574.61)

    # use the same NaN handling as the first fit so a spectrum with NaN flux in
    # the line core does not abort the run
    result = model.fit(fl[mask], x=wl[mask], nan_policy='omit')

    # save a plot of the fitted model; make sure the target directory exists
    os.makedirs('./plots/fits', exist_ok=True)

    plt.figure(figsize = (10,10))

    plt.plot(wl[mask], fl[mask])
    plt.plot(wl[mask], result.best_fit, '-', label='best fit')

    plt.savefig('./plots/fits/{}'.format(ii))

    plt.close()

    return [base_result, result]
    

# Run the analysis on every catalog row, one spectrum at a time:
# download -> fit -> release the file before moving on to the next row.
res = []
for i, row in tqdm.tqdm(enumerate(catalog), total = len(catalog)):
    # download the spectrum for this row
    file = download_spectrum(row)

    try:
        # perform the analysis and keep the [base_result, result] pair
        res.append(analysis_function(file, i))
    finally:
        # close the FITS handle explicitly so thousands of iterations do not
        # accumulate open files
        file.close()
    
    
    
# this is supposed to be the parallelization code, but I don't know whether it works right now
#def analyze(ii):
#    file = download_spectrum(catalog[ii])
#    return analysis_function(file, ii)
#    
#if __name__ == "__main__":
#    tasks = range(len(catalog))
#    pool = Pool(processes=4)
#    
#    mapped_values = list(tqdm.tqdm(pool.imap_unordered(analyze, tasks), total=len(tasks)))  

  1%|▍                                                                            | 80/13112 [01:51<5:58:04,  1.65s/it]

In [None]:
# Display the accumulated fit results (one [base_result, result] pair per processed spectrum).
res

In [None]:
# Fit the reference spectrum (local copy of spec-015333-59306-5276309279.fits)
# with the same procedure and append it to the results for comparison.
# The plot is saved under the label 'reference' rather than a number, so it
# cannot overwrite the plot of the catalog spectrum with the same index.
with fits.open('spec-015333-59306-5276309279.fits') as file:
    ref = analysis_function(file, 'reference')

res.append(ref)

In [None]:
# Turn the list of [base_result, result] pairs into an array and transpose it,
# so that res[0] indexes the first-stage fits and res[1] the second-stage fits.
# NOTE(review): re-running this cell transposes `res` again — run it once.
temp = res
res = np.transpose(np.array(temp))

In [None]:
# Pull the fitted parameters and fit statistics out of the result objects.
# res[0] holds the first-stage fits and res[1] the second-stage fits.
n_fits = len(res[0])


def _final_param(name):
    """Return parameter `name` of every second-stage fit as an array."""
    return np.array([res[1][k].params[name].value for k in range(n_fits)])


def _redchi(stage):
    """Return the reduced chi-square of every fit of the given stage (0 or 1)."""
    return np.array([res[stage][k].redchi for k in range(n_fits)])


upamps = _final_param('upamplitude')
dnamps = _final_param('downamplitude')

upsigma = _final_param('upsigma')
dnsigma = _final_param('downsigma')

upcen = _final_param('upcenter')
dncen = _final_param('downcenter')

chisqr1 = _redchi(0)
chisqr2 = _redchi(1)

# change in reduced chi-square from the first-stage to the second-stage fit
d_chisqr = chisqr2 - chisqr1

In [None]:
# Compare the 'up' and 'down' profiles of each fit.
ampratio = np.divide(upamps, dnamps)
sigratio = np.divide(upsigma, dnsigma)
# despite the name, this is the absolute separation of the two line centers
cenratio = np.absolute(upcen - dncen)

In [None]:
# indices of fits whose two profile centers lie within 1 of each other
np.nonzero(cenratio < 1)

In [None]:
# indices of fits whose 'up' profile has a positive amplitude
np.nonzero(upamps > 0)