# Benchmarking variabilityFunctions.py

A notebook to benchmark the functions used to compute `sigma_full` and `mu_full`, with the goal of figuring out how to make `computeVarMetrics` faster.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 
pd.options.mode.chained_assignment = None
# to avoid  http://stackoverflow.com/questions/20625582/how-to-deal-with-this-pandas-warning

# for all imports of my functions, 
# make python aware of my packages...
import sys
sys.path.insert(0, '/astro/users/suberlak/S13Agg_analysis/packages/')

# faint source treatment 
import faintFunctions as faintF 

# variability 

import variabilityFunctions as varF
from astroML.stats import median_sigmaG
from astropy.time import Time

In [3]:
# Load a sample of the raw lightcurve table from disk.

DirIn = '/astro/store/scratch/tmp/suberlak/S13Agg/rawDataFPSplit/'
#'/astro/store/scratch/tmp/suberlak/s13_stripe82/forced_phot_lt_23/NCSA/Proc_no_rms/'
name = 'g176_181.csv'

# Only the first 100000 rows and the four columns used further down
# in this notebook are read.
fp_data = pd.read_csv(
    os.path.join(DirIn, name),
    nrows=100000,
    usecols=['objectId', 'mjd', 'psfFlux', 'psfFluxErr']
)

In [4]:
#
##########  STEP 1 : single-epoch data ###########  
#
# 1. Drop every row whose psfFlux or psfFluxErr is NaN or +/-inf.
# 2. Flag points with psfFlux / psfFluxErr < 2 in a new 'flagFaint' column.
# 3. For the flagged points only, fill the faintMean / faintMedian /
#    faintTwoSigma / faintRMS columns using the faintFunctions helpers.
#
# Note: fp_data is modified in place, so re-run the loading cell before
# re-running this one.

#### drop all non-finite psfFlux, then all non-finite psfFluxErr
# ~np.isfinite is already True for NaN as well as +/-inf, so no separate
# np.isnan mask is needed.
for column in ('psfFlux', 'psfFluxErr'):
    bad = ~np.isfinite(fp_data[column])
    n_bad = int(bad.sum())
    if n_bad > 0:  # only apply if there is anything to drop ...
        fp_data.drop(bad.index[bad], inplace=True)  # drop entire rows
        print('Okay, we dropped %d rows where %s is NaN or inf' % (n_bad, column))

# make a new column, fill with 0's
fp_data['flagFaint'] = 0

# mask those rows that correspond to SNR < 2
flux = fp_data['psfFlux'].values
flux_err = fp_data['psfFluxErr'].values
mask = (flux / flux_err) < 2

# print info how many points are affected
print('There are %d points of %d that have SNR<2' %(np.sum(mask),len(mask)))

# set flag at those rows to 1
# (.loc replaces .ix, which was deprecated and then removed from pandas;
#  with a boolean mask the two select exactly the same rows)
fp_data.loc[mask, 'flagFaint'] = 1

# Faint-point replacement columns: NaN everywhere, then filled only at the
# faint rows. Each helper is called with (psfFlux, psfFluxErr) of the faint
# points; presumably it returns one value per point -- TODO confirm in
# faintFunctions.
faint_flux = flux[mask]
faint_err = flux_err[mask]
faint_estimators = [
    ('faintMean', faintF.calculate_mean),
    ('faintMedian', faintF.calculate_median),
    ('faintTwoSigma', faintF.calculate_2sigma),
    ('faintRMS', faintF.calculate_rms),
]
for column, _ in faint_estimators:
    fp_data[column] = np.nan
for column, estimator in faint_estimators:
    fp_data.loc[mask, column] = estimator(faint_flux, faint_err)

There are 52032 points of 100000 that have SNR<2


In [5]:
#
##########  STEP 2 : Derived Quantities ###########  
#

####  replace all psfFlux  where SNR < 2  with  faintMean  
# 'flagFaint' == 1 marks the SNR < 2 rows. .loc replaces .ix, which was
# removed from pandas; the assignment is aligned on the row index.
rows = fp_data['flagFaint'] == 1
fp_data.loc[rows, 'psfFlux'] = fp_data.loc[rows, 'faintMean']

# group by objectId to calculate full LC variability characteristics 
grouped = fp_data.groupby('objectId')


In [6]:
# An average lightcurve  : 72 pts 
# Take the second group key. list(...) is needed because on Python 3
# dict.keys() returns a view that cannot be indexed; on Python 2 this is
# equivalent to the previous keys()[1].
object_ids = list(grouped.groups.keys())
df = grouped.get_group(object_ids[1])

In [7]:
# Double that lightcurve : 144 pts 
# pd.concat stacks the lightcurve on top of itself (original index labels are
# kept, so they repeat). It replaces DataFrame.append, which was removed in
# pandas 2.0.
df2 = pd.concat([df, df])

In [None]:
# N=1000 bootstraps... One lightcurve , N=72 pts
reload(varF)
%timeit varF.computeVarMetrics(df2)

In [None]:
# N=10000 bootstraps...
reload(varF)
%timeit varF.computeVarMetrics(df)

In [None]:
# Run varF.computeVarMetrics once per objectId group (i.e. per lightcurve)
# over the whole loaded sample and collect the results.
varMetricsFull = grouped.apply(varF.computeVarMetrics)