# calculate_correlations

This notebook contains the code for calculating the correlation between each line parameter and each RV/activity index, for every line in a corresponding cube, and for saving the results as a list of correlation matrices (one per line). This notebook does not include the cube itself: `cube` must already be defined before the cells below are run (see 'make_cube').

In [None]:
import numpy as np
from scipy import stats

In [None]:
#make matrix of labels for the correlations betweeen each parameter and rv/activity index
#and error on correlation (e.g. 'centroid~cahk' correlation, 'centroid~cahk (correlation) err')

#get names of parameters from cube
params = np.array(cube.coords['param'])

#initialize empty 'labels' array corresponding to parameter x rv/activity index (and errors)
labels = np.empty((4,18), dtype=np.dtype('U100'))
#note the indices in the cube that correspond to duplicated activity indices (e.g. 12 and 14
#are both the he i d3 line measured using two different methods)
dupes = [[12,14],[18,20,22,24],[26,28],[30,32,34]]

#for the activity indices that have duplicates: iterate through parameters (f) and rv/activity indices (a), 
#get the names of the parameters and activity indices, and generate label
for f in range(0,8,2):
    for a in range(4):
        idxs = dupes[a]
        i = idxs[0]
        
        fname = params[f]

        #depending on which set of duplicates, get the name of the activity index
        if a == 0 or a == 2:
            aname = params[i][0:3]
        elif a == 1:
            aname = params[i][0:2]
        else:
            aname = params[i][0:5]

        #generate 'parameter~activity index' and error labels
        if a == 0:
            labels[int(f/2),int(i-8)] = fname + '~' + aname
            labels[int(f/2),int(i-7)] = fname + '~' + aname + ' err'
        elif a == 1:
            labels[int(f/2),int(i-10)] = fname + '~' + aname
            labels[int(f/2),int(i-9)] = fname + '~' + aname + ' err'
        elif a == 2:
            labels[int(f/2),int(i-16)] = fname + '~' + aname
            labels[int(f/2),int(i-15)] = fname + '~' + aname + ' err'
        else:
            labels[int(f/2),int(i-18)] = fname + '~' + aname
            labels[int(f/2),int(i-17)] = fname + '~' + aname + ' err'

#for the activity indices that DO NOT have duplicates: iterate through parameters (f) and rv/activity indices (a), 
#get the names of the parameters and activity indices, and generate label
notdupes = [[8,10],[16],[36,38]]      
                
for f in range(0,8,2):
    for a in range(3):
        idxs = notdupes[a]
        
        for i in idxs:
            #get names of parameter/activity index
            fname = params[f]
            aname = params[i]

            #generate 'parameter~activity index' and error labels
            if a == 0:
                labels[int(f/2),int(i-8)] = fname + '~' + aname
                labels[int(f/2),int(i-7)] = fname + '~' + aname + ' err'
            elif a == 1:
                labels[int(f/2),int(i-10)] = fname + '~' + aname
                labels[int(f/2),int(i-9)] = fname + '~' + aname + ' err'
            else:
                labels[int(f/2),int(i-22)] = fname + '~' + aname
                labels[int(f/2),int(i-21)] = fname + '~' + aname + ' err'

In [None]:
#loop through each line, make correlations matrix (corresponding to labels) for each line
#initialize empty list to hold ALL correlation matrices
cube_corrs = []

for l, line in enumerate(cube['line']):
    if 0 in cube[0,l,:] or 0 in cube[2,l,:] or 0 in cube[4,l,:] or 0 in cube[6,l,:]:
        corrmatrix = np.zeros((4,18))
        cube_corrs.append(corrmatrix)
        continue #this skips all lines that have NaNs/0s for any centroid/depth/fwhm/int_flux
    
    #initialize correlation matrix
    corrmatrix = np.zeros((4,18))
    
    #calculate correlation coefficients between parameters and rv/activity indices
    #first for duplicated indices
    dupes = [[12,14],[18,20,22,24],[26,28],[30,32,34]]

    #iterate through parameters (p) and rv/activity indices (a)
    for p in range(0,8,2):
        for a in range(4):
            idxs = dupes[a]
            
            #get value of parameter
            param = cube[p,l,:]
            param_err = cube[p+1,l,:]

            #get duplicated values of activity index and calculate a weighted average for each observation
            #first get duplicated values of activity index (for each observation) and calculate weight of each value
            #using errors on those values
            activitys = []
            activity_weights = []
            
            for i in idxs: #e.g. 12,14
                activity = cube[i,l,:]
                activity_err = cube[i+1,l,:]
                activity_weight = [1/(x**2) for x in activity_err]
                
                activitys.append(activity)
                activity_weights.append(activity_weight)
                
            activitys = np.array(activitys)
            activity_weights = np.array(activity_weights)
            
            #now calculate a weighted average of the duplicated activity index for each observation using the values 
            #and weights collected above
            combined_activity = []
            combined_activity_err = []
            
            for n in range(cube.shape[2]): #32, num files
                weighted_mean_numerator = 0

                for i in range(len(idxs)):
                    weighted_mean_numerator += activitys[i,n]*activity_weights[i,n]

                weighted_mean = weighted_mean_numerator/(np.sum(activity_weights))
                weighted_mean_err = 1/(np.sum(activity_weights))
                
                combined_activity.append(weighted_mean)
                combined_activity_err.append(weighted_mean_err)
            
            combined_activity = np.array(combined_activity)
            combined_activity_err = np.array(combined_activity_err)
            
            #place correlation between parameter and combined activity in the correct parameter x 
            #activity position in corrmatrix
            #here, pearson's r correlation is calculated but this can be changed to any correlation metric available
            #in scipy.stats
            i = idxs[0]
            
            if a == 0:
                corrmatrix[int(p/2),int(i-8)] = abs(stats.pearsonr(param, combined_activity)[0])
            elif a == 1:
                corrmatrix[int(p/2),int(i-10)] = abs(stats.pearsonr(param, combined_activity)[0])
            elif a == 2:
                corrmatrix[int(p/2),int(i-16)] = abs(stats.pearsonr(param, combined_activity)[0])
            else:
                corrmatrix[int(p/2),int(i-18)] = abs(stats.pearsonr(param, combined_activity)[0])
            
            #get error for each correlation by re-calculating the correlation in a uniform sampling between the 
            #upper and lower error bounds on each parameter/activity index and taking the standard deviation of that
            #distribution as error
            corrdistribution = []
            lowerparam = param - param_err
            upperparam = param + param_err
            loweractivity = combined_activity - combined_activity_err
            upperactivity = combined_activity + combined_activity_err

            nsamples = 10

            paramrange = np.linspace(lowerparam, upperparam, num=nsamples)
            actrange = np.linspace(loweractivity, upperactivity, num=nsamples)
            for j in range(nsamples):
                for k in range (nsamples):
                    corr = stats.pearsonr(paramrange[j], actrange[k])[0]
                    corrdistribution.append(corr)

            corrdistribution = np.array(corrdistribution)
            correrr = np.nanstd(corrdistribution)
            
            #place correlation errors between parameter and combined activity in the correct parameter x 
            #activity (error) position in corrmatrix
            i = idxs[0]
            
            if a == 0:
                corrmatrix[int(p/2),int(i-7)] = correrr
            elif a == 1:
                corrmatrix[int(p/2),int(i-9)] = correrr
            elif a == 2:
                corrmatrix[int(p/2),int(i-15)] = correrr
            else:
                corrmatrix[int(p/2),int(i-17)] = correrr
    
    #calculate correlation coefficients between parameters and rv/activity indices
    #now for non-duplicated indices
    notdupes = [[8,10],[16],[36,38]]  
    
    #iterate through parameters (p) and rv/activity indices (a)
    for p in range(0,8,2):
        for a in range(3):
            idxs = notdupes[a]

            for i in idxs:
                #get value of parameter and activity index
                param = cube[p,l,:]
                param_err = cube[p+1,l,:]
                activity = cube[i,l,:]
                activity_err = cube[i+1,l,:]

                #place correlation between parameter and activity in the correct parameter x 
                #activity position in corrmatrix
                if a == 0:
                    corrmatrix[int(p/2),int(i-8)] = abs(stats.pearsonr(param, activity)[0])
                elif a == 1:
                    corrmatrix[int(p/2),int(i-10)] = abs(stats.pearsonr(param, activity)[0])
                else:
                    corrmatrix[int(p/2),int(i-22)] = abs(stats.pearsonr(param, activity)[0])

                #get error for each correlation
                corrdistribution = []
                lowerparam = param - param_err
                upperparam = param + param_err
                loweractivity = activity - activity_err
                upperactivity = activity + activity_err

                nsamples = 10

                paramrange = np.linspace(lowerparam, upperparam, num=nsamples)
                actrange = np.linspace(loweractivity, upperactivity, num=nsamples)
                for j in range(nsamples):
                    for k in range (nsamples):
                        corr = stats.pearsonr(paramrange[j], actrange[k])[0]
                        corrdistribution.append(corr)

                corrdistribution = np.array(corrdistribution)
                correrr = np.nanstd(corrdistribution)
                
                #place correlation errors between parameter and activity in the correct parameter x 
                #activity (error) position in corrmatrix
                if a == 0:
                    corrmatrix[int(p/2),int(i-7)] = correrr
                elif a == 1:
                    corrmatrix[int(p/2),int(i-9)] = correrr
                else:
                    corrmatrix[int(p/2),int(i-21)] = correrr
    
    cube_corrs.append(corrmatrix)

cube_corrs = np.array(cube_corrs)