In [4]:
import numpy as np
import os
import re

In [5]:
def regex_from_segments(viral_segments):
    """
    Returns a compiled regex that finds the first segment pair of the form
    '<segment>_<segment>' in a string.

    Every ordered pair of the given segments (a segment paired with itself
    included) becomes one alternative of the pattern.

    Input:
        viral_segments -- iterable of segment names, e.g. ['PB2', 'PB1']

    Return:
        compiled_regex -- compiled pattern; use .search(text).group() to get
                          the matched pair, .search(text) is None if no pair
                          occurs in text

    Note:
        An empty viral_segments yields the empty pattern, which matches the
        empty string at the start of any text.
    """
    # Materialise once so that one-shot iterables survive the nested loop.
    segments = list(viral_segments)
    segment_pairs = [f'{viral_segment01}_{viral_segment02}'
                     for viral_segment01 in segments
                     for viral_segment02 in segments]
    # Alternation takes the first alternative that matches, so longer pairs
    # must come first; otherwise 'P_P' would shadow 'P_PB'. The sort is
    # stable, so pairs of equal length keep their original order.
    segment_pairs.sort(key=len, reverse=True)
    # Escape the names so they are matched literally, not as regex syntax.
    compiled_regex = re.compile('|'.join(re.escape(pair) for pair in segment_pairs))
    return compiled_regex

In [11]:
def read_arrays(data_directory, viral_segments):
    """
    Loads every array stored in the sub-directories of data_directory and
    groups the arrays by the segment pair found in their file name.

    Input:
        data_directory -- path of the directory whose sub-directories
                          (treated as replicates) hold the array files
        viral_segments -- segment names, handed to regex_from_segments

    Return:
        (d_repDir2Combinations, d_combination2Array)
        d_repDir2Combinations -- {'replicateDir' : [entry names in that directory]}
        d_combination2Array   -- {'segment01_segment02' : [np.arrays]}

    Raises:
        ValueError -- if an entry name contains no segment pair, so the
                      array could not be assigned to a combination

    Side effect:
        Prints the list of replicate directories that were found.
    """
    d_repDir2Combinations = {}
    d_combination2Array = {}
    segments_to_regex = regex_from_segments(viral_segments)
    replicateDirs = [entry for entry in os.listdir(data_directory)
                     if os.path.isdir(os.path.join(data_directory, entry))]

    print(replicateDirs)
    for repDir in replicateDirs:
        repPath = os.path.join(data_directory, repDir)
        allCombinations = os.listdir(repPath)
        d_repDir2Combinations[repDir] = allCombinations

        for combination in allCombinations:
            match = segments_to_regex.search(combination)
            if match is None:
                # Fail with the offending file name instead of the opaque
                # "'NoneType' object has no attribute 'group'".
                raise ValueError(
                    f'No segment pair found in file name '
                    f'{os.path.join(repPath, combination)!r}')
            uniqueID = match.group()
            array = np.load(os.path.join(repPath, combination))
            d_combination2Array.setdefault(uniqueID, []).append(array)

    return (d_repDir2Combinations, d_combination2Array)

In [7]:
def calculate_variances(d_arrays, ddof=0):
    """
    Takes a dictionary with lists of numpy arrays and calculates, for each
    unique entry, the element-wise variance across the arrays of its list.

    Input:
        d_arrays -- {'uniqueName' : [np.arrays]}
        ddof     -- delta degrees of freedom passed to np.var; the divisor
                    is (number of arrays - ddof). The default 0 gives the
                    population variance, 1 gives the sample variance.

    Return:
        d_comb2variance -- {'uniqueName' : np.array}

    """
    # axis=0 runs over the list of arrays, so each result keeps the shape of
    # a single input array.
    d_comb2variance = {
        combination: np.var(l_countTable, axis=0, ddof=ddof)
        for combination, l_countTable in d_arrays.items()
    }

    return d_comb2variance

In [8]:
# Absolute project root; kept with its trailing '/' exactly as before.
DIRECTORY = '/data/dessertlocal/projects/gl_iav-splash_freiburg/'
# os.path.join avoids the doubled '/' that f'{DIRECTORY}/...' produced,
# because DIRECTORY already ends with one. The final '' keeps INPUT's
# trailing slash.
INPUT = os.path.join(DIRECTORY, 'data', 'arrays', '')
RESULT = os.path.join(DIRECTORY, 'results', '202104', '20200412')
# Segment names from which the segment-pair regex is built.
iav_segments = ['PB2', 'PB1', 'PA', 'HA', 'NP', 'NA', 'M', 'NS']


In [15]:
# Load all arrays below INPUT, grouped by segment-pair ID; read_arrays also
# prints the replicate directory names it found.
wt_d_repDir2Combinations, wt_d_combination2Array = read_arrays(INPUT, iav_segments)

['wt1120_I', 'wt1120_II', 'wt0120']


In [16]:
# Element-wise variance across the loaded arrays of each segment pair.
wt_d_comb2variance = calculate_variances(wt_d_combination2Array)

In [17]:
# Prints the whole dict, i.e. the (abbreviated) repr of every variance array.
# NOTE(review): consider showing only the keys and array shapes to keep the
# cell output short.
print(wt_d_comb2variance)

{'HA_PA': array([[184.66666667, 201.55555556, 214.88888889, ...,  20.22222222,
         16.66666667,  40.66666667],
       [184.66666667, 201.55555556, 214.88888889, ...,  20.66666667,
         16.88888889,  40.66666667],
       [184.66666667, 201.55555556, 214.88888889, ...,  20.66666667,
         16.88888889,  40.66666667],
       ...,
       [ 14.88888889,  14.88888889,  16.22222222, ...,   1.55555556,
          2.        ,   8.66666667],
       [ 12.66666667,  12.66666667,  14.22222222, ...,   1.55555556,
          2.        ,   8.66666667],
       [146.        , 160.22222222, 168.        , ...,   6.        ,
          6.        ,  20.22222222]]), 'M_PB1': array([[20.22222222, 22.88888889, 20.66666667, ...,  4.22222222,
         2.66666667, 10.66666667],
       [20.22222222, 22.88888889, 20.66666667, ...,  4.22222222,
         2.66666667, 10.66666667],
       [24.66666667, 26.        , 24.22222222, ...,  4.22222222,
         2.66666667, 10.66666667],
       ...,
       [ 0.66666667