In [1]:
import pandas as pd
import numpy as np
import argparse
from collections import defaultdict
from scipy.stats import linregress

In [18]:
#####   Parse Input   #####
out_path = './'

# Which columns to drop based on results in qc.ipynb: one sample name per line.
# The file is opened in a `with` block so the handle is closed deterministically,
# and blank lines are skipped so they do not become empty column labels.
with open('./drop.txt', 'r') as drop_file:
    drop_samples = [line.strip() for line in drop_file if line.strip()]

control_col = 'input' # which column to use as "baseline" when calculating enrichment scores

# pd.DataFrame(index=oligoID, columns=sampleID, values=proportions of reads in each column assigned to each oligo)
proportions = pd.read_csv('./proportions.csv', index_col=0)

# drop bad replicates or superfluous samples as specified
# (errors='ignore': names listed in drop.txt that are not present are silently skipped)
proportions = proportions.drop(drop_samples, axis=1, errors='ignore')

# all columns with 'input' in the name (case-insensitive)
input_cols = [c for c in proportions.columns.values if 'input' in c.lower()]
# all columns with 'beads' in the name (case-insensitive)
beads_cols = [c for c in proportions.columns.values if 'beads' in c.lower()]

# nonnumerical columns
# Each label is listed exactly once: repeating 'start'/'end' here would select those
# columns twice and duplicate them in `metadata` and in everything joined with it.
metadata_cols = ['virus', 'start', 'end', 'sequence', 'strains']
# .copy() makes `metadata` an independent frame rather than a slice of `proportions`
metadata = proportions[metadata_cols].copy()

# all non-metadata columns
# .copy() so that later in-place edits (adding/dropping columns) act on a real frame
# instead of a slice, which would risk pandas' SettingWithCopyWarning
proportions = proportions[[c for c in proportions.columns.values if c not in metadata_cols]].copy()

In [19]:
#####   Aggregate replicates    #####
def aggregate(reps, name, df=proportions):
    '''
    Average read proportion values across replicates.

    Input:
        reps: list of column labels in df holding the replicates to combine
        name: label of the column that receives the aggregated values
        df:   dataframe to operate on (proportions as default; note the default
              is bound when this function is defined, so re-run this cell after
              re-creating `proportions`)

    Modifies df in place and returns None: writes the row-wise mean of `reps`
    (or a plain copy when there is only one replicate) to column `name`, then
    drops the replicate columns.
    '''
    if len(reps) == 1: # nothing to aggregate
        df[name] = df[reps[0]]
    else:
        df[name] = df[reps].mean(axis=1) # mean of each row --> new column of aggregated values

    # Drop the original replicate columns, but never the column we just wrote:
    # when `name` is itself one of the replicate labels (e.g. a lone column that
    # is already called 'input'), dropping every entry of `reps` would delete the result.
    stale_cols = [rep for rep in reps if rep != name]
    df.drop(stale_cols, inplace=True, axis=1, errors='ignore') # axis=1 --> drop columns

# Maps aggregated column name -> list of replicate columns that feed it,
# e.g. {'DENV1_30dpi': ['DENV1_30dpi_<x>_<y>', ...], 'input': [...], 'beads': [...]}
technical_replicates = defaultdict(list)

for serum in proportions.columns.values: # group every sample column under its aggregate name
    if serum in beads_cols:
        name = 'beads'
    elif serum in input_cols:
        name = 'input'
    else:
        # rsplit('_', 2)[0] removes up to two trailing '_'-separated fields;
        # a label without underscores is kept unchanged.
        # NOTE(review): presumably those trailing fields are replicate identifiers -- confirm
        # against the column naming used in proportions.csv.
        name = serum.rsplit('_', 2)[0]
    technical_replicates[name].append(serum)

for serum, reps in technical_replicates.items():
    # aggregate() is declared as aggregate(reps, name, df): pass the replicate list first,
    # then the new column name, and the frame by keyword so the order cannot be mixed up.
    aggregate(reps, serum, df=proportions)
# now data looks like pd.DataFrame(columns=['NHP-1', 'input', 'beads', 'NHP-2', ...])
print(proportions.head()) # parenthesised form works on both Python 2 and Python 3

                DENV1_30dpi       EVU  DENV3_30dpi  DENV4_30dpi         H  \
id                                                                          
1                  0.005368  0.000346     0.000350     0.000340  0.000225   
100                0.000035  0.000006     0.000072     0.000122  0.000059   
1001               0.000033  0.000023     0.000083     0.000131  0.000000   
1002               0.000183  0.000719     0.000072     0.000102  0.000497   
1008.1177.1346     0.000917  0.000662     0.001098     0.000690  0.002028   

                   input         A         C         B         E    ...     \
id                                                                  ...      
1               0.001005  0.000849  0.000299  0.001010  0.000953    ...      
100             0.000230  0.000104  0.000195  0.000204  0.000086    ...      
1001            0.000165  0.000144  0.000149  0.000137  0.000155    ...      
1002            0.000107  0.000317  0.000186  0.000242  0.000391    ..

In [None]:
#####   Convert all values to fold enrichment over control  #####
# To avoid dividing by 0, zeros in the control column are replaced with the
# control column's mean (the mean is taken over the whole column, zeros included).
mean_control_val = proportions[control_col].mean()
filled_control_col = proportions[control_col].replace(0., mean_control_val)

# Divide each column by the zero-filled control column; axis=0 matches the
# control Series to the frame by row index. The control column itself is part of
# `proportions`, so it is divided as well.
enrichment = proportions.divide(filled_control_col, axis=0)

annotated_enrichment = enrichment.join(metadata) # reattach the metadata (joined on the index)
print(annotated_enrichment.head()) # parenthesised form works on both Python 2 and Python 3
annotated_enrichment.to_csv(out_path+'enrichment.csv') # write to file