In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import argparse
from scipy.stats import linregress
from collections import defaultdict

In [None]:
######### Input ########

## Load the annotated counts table; column 0 of the sheet (the ID) indexes individual oligos
counts = pd.read_excel('./2018.03.23.annotatedCounts.xlsx', index_col=0)
out_path = './'

# Any columns listed here are removed from the table (e.g., other peoples' samples)
drop_columns = []
counts = counts.drop(drop_columns, axis=1)

# We know these animals were previously vaccinated with HIV antigen; drop those oligos.
# Rows are removed by index label, i.e. every row sharing an ID with an HIV oligo goes.
is_hiv = counts['Virus_Strain'].str.contains('HIV')
hiv_oligos = counts.loc[is_hiv]
counts = counts.drop(hiv_oligos.index.values, axis=0)

In [None]:
#########  Initial cleanup & normalization #########

# For convenience, separate out the metadata, input counts, background (beads) counts and sample counts.
def proportions(df):
    ''' Rescale every column of `df` by its own total, so that each column sums to 1.

    Returns a new DataFrame; `df` itself is left untouched.'''
    column_totals = df.sum(axis=0)
    return df.div(column_totals, axis='columns')

# Classify columns by name: anything containing 'input' or 'beads' (case-insensitive)
# is a control; the three annotation columns are metadata; everything else is a sample.
input_cols = [ c for c in counts.columns.values if 'input' in c.lower() ]
beads_cols = [ c for c in counts.columns.values if 'beads' in c.lower() ]
metadata_cols = ['Virus_Strain', 'Start_to_End_nt', 'Peptide_sequence']
sample_cols = [c for c in counts
               if c not in input_cols and c not in beads_cols and c not in metadata_cols]

# Take an explicit copy: later cells add and remove columns on `metadata`, and
# doing that on a slice of `counts` triggers pandas' SettingWithCopy warning.
metadata = counts[metadata_cols].copy()
# Standardize each column to sum to 1
values = proportions(counts[[c for c in counts.columns.values if c not in metadata_cols]])
input_ctrls = values[input_cols]
beads_ctrls = values[beads_cols]

# Parenthesized so the line is valid on both Python 2 and Python 3.
print(values.head()) # this is what our sample data looks like now:

In [None]:
sns.set(style='whitegrid', font_scale = 1.3, palette='Set3') ## Make all of our plots prettier
def compare_replicates(columns=None, df=values, title=None, fname=None):
    ''' Plot sanity checks for technical replicates.

    Draws a seaborn PairGrid: a violin plot of each column on the diagonal and a
    scatter plot of every pair of columns off the diagonal, annotated with R^2.

    Input:
        * columns: a list of columns for pairwise comparisons (if empty or None, does all pairwise comparisons)
        * df: dataframe (default `values`); it is not modified
        * title: plot title (optional)
        * fname: output file name for plot (optional); written to out_path + '/figs/' + fname

    Returns None. Nothing is plotted when exactly one column is given.'''
    import os  # local import: only needed to make sure the figure directory exists

    if columns is None or len(columns) == 0:
        replicates = df # all columns
    elif len(columns) == 1:
        return # only one column, no comparisons to make
    else:
        replicates = df[columns]
    # fillna returns a new frame, so the caller's dataframe is never altered
    # (the in-place version modified `df` itself when no columns were given).
    replicates = replicates.fillna(0)

    def plot_comparison(x,y, **kwargs):
        ''' Scatter plot of x vs y, labelled with the squared correlation coefficient.'''
        scatter = plt.plot(x,y, 'o', alpha=0.4)
        try:
            # linregress(...)[2] is the correlation coefficient r; square it so
            # the number matches the 'R^2' label.
            r_2 = linregress(x,y)[2] ** 2
            scatter[0].axes.text(0,0, 'R^2 = %.2f'%(r_2))
        except Exception:
            # The annotation is best-effort: keep the scatter plot even if the
            # regression cannot be computed for this pair.
            pass
        return scatter

    g = sns.PairGrid(replicates, diag_sharey=False)
    g.map_diag(sns.violinplot) # plot violinplots of each sample's distribution on the diagonal
    g.map_offdiag(plot_comparison) # scatterplots to compare samples off the diagonal

    if title:
        g.fig.suptitle(title, va='bottom')
    plt.tight_layout()

    if fname:
        fig_path = out_path+'/figs/'+fname
        fig_dir = os.path.dirname(fig_path)
        # savefig does not create missing directories, so make sure it is there
        if not os.path.isdir(fig_dir):
            os.makedirs(fig_dir)
        plt.savefig(fig_path, bbox_inches='tight')

    plt.show()
    plt.close()

In [None]:
###### Compare read proportions across controls #####
# Pairwise comparison of every input and beads control column; the figure is also saved as ctrl_reps.png
compare_replicates(
    columns=input_cols + beads_cols,
    df=values,
    title='Input + Beads',
    fname='ctrl_reps.png',
)

In [None]:
######  Find technical replicates, plot direct comparisons #########
# Sample columns are grouped by the text before their first underscore:
# {'sample': [sample_rep1, sample_rep2]}
technical_replicates = {}
for column in sample_cols:
    sample_name = column.split('_')[0]
    technical_replicates.setdefault(sample_name, []).append(column)

# One pairwise-comparison figure per sample, titled with the sample name
for serum in technical_replicates:
    compare_replicates(technical_replicates[serum], values, serum)

In [None]:
## Does higher sera concentration == more reads? 
## Expects concentration annotated as sample_20ug_1, where 20ug is the concentration and 1 is the technical replicate number

## Collect the raw-count column of every replicate under its 'sample_concentration' prefix
## (the column name with the final '_<replicate>' part removed)
concentration_replicates = defaultdict(list)
for column in sample_cols:
    sample_ug_key = column.rsplit('_', 1)[0]
    concentration_replicates[sample_ug_key].append(counts[column])

## Average the technical replicates at each concentration (row-wise mean)
concentration_counts = {}
for sample_ug_key, replicate_series in concentration_replicates.items():
    concentration_counts[sample_ug_key] = pd.concat(replicate_series, axis=1).mean(axis=1)

## Regroup by sample: the last '_'-separated field of the key is the concentration,
## which becomes the name of the averaged series
sample_concentrations = defaultdict(list)
for sample_ug, mean_counts in concentration_counts.items():
    sample, ug = sample_ug.rsplit('_', 1)
    mean_counts.name = ug
    sample_concentrations[sample].append(mean_counts)

## One dataframe per sample, one column per concentration
sample_concentrations = dict(
    (sample, pd.concat(per_concentration, axis=1))
    for sample, per_concentration in sample_concentrations.items()
)

## One violin plot per sample: distribution of mean read counts at each concentration, log-scaled y axis
for sample, readcounts in sample_concentrations.items():
    readcounts = readcounts.melt(var_name='Concentration', value_name='Mean read counts')
    ax = sns.violinplot(data=readcounts, x='Concentration', y='Mean read counts', cut=0)
    ax.set_title(sample)
    ax.set_yscale('log')
    plt.show()

In [None]:
#######  Tidy up the metadata a bit  ########
# Work on an explicit copy so the column assignments below never write into a
# slice of another DataFrame (avoids pandas' SettingWithCopy warning).
metadata = metadata.copy()

# Tidy start and end coordinates --> integers
# 'Start_to_End_nt' is split once on 'to'. expand=True returns the two halves as
# columns 0 and 1; unpacking the `.str` accessor directly and passing `n`
# positionally no longer work on current pandas.
coords = metadata['Start_to_End_nt'].str.split('to', n=1, expand=True)
metadata['start'] = coords[0].map(int)
# Only the text before the first '.' of the end coordinate is converted
metadata['end'] = coords[1].map(lambda x: int(x.split('.')[0])) ## TODO
metadata = metadata.drop('Start_to_End_nt', axis=1)

# Tidy up virus and strain names
def parse_strains(virusstrain):
    ''' Split a '.'-separated virus/strain annotation into a virus name and a list of strain names.

    e.g., 'DENV3_BR-BID-V2403-2008.DENV3_Mozambique1985'
          --> virus 'DENV3', strains ['BRBIDV24032008', 'MOZAMBIQUE1985']

    Entries containing 'ONNV' are ignored: the ONNV sequences overlap with the
    CHIKV sequences; for now, we'll omit it; this should be revisited. TODO

    Returns a pd.Series with keys 'virus' and 'strains'. Both are NaN unless all
    remaining entries share exactly one virus prefix (the text before the first '_').'''
    kept = [part for part in virusstrain.split('.') if 'ONNV' not in part]
    prefixes = set(part.split('_', 1)[0] for part in kept)

    if len(prefixes) == 1:
        virus = kept[0].split('_', 1)[0] # 'DENV3'
        separator = virus + '_'
        strains = []
        for part in kept:
            if part == '':
                continue
            # Keep what follows '<virus>_', then strip '-' and '_' and upper-case it
            strain = part.split(separator, 1)[1]
            strains.append(strain.replace('-', '').replace('_', '').upper())
    else:
        virus, strains = np.nan, np.nan

    return pd.Series({'virus': virus, 'strains':strains})

# Work on an explicit copy so the column assignments below never write into a
# slice of another DataFrame (avoids pandas' SettingWithCopy warning).
metadata = metadata.copy()

# Parse every 'Virus_Strain' annotation once, then spread the results into two new columns
new_names = [parse_strains(v) for v in metadata['Virus_Strain']]
metadata['virus'] = [n['virus'] for n in new_names]
metadata['strains'] = [n['strains'] for n in new_names]
metadata = metadata.rename(columns={'Peptide_sequence':'sequence'})
metadata = metadata.drop('Virus_Strain', axis=1)
# Drop every oligo (row) with a missing value. This used to be axis=(0, 1), which
# current pandas rejects; once the rows containing NaN are gone no column can
# contain NaN, so dropping rows alone gives the same result.
metadata = metadata.dropna(how='any', axis=0)

In [None]:
#####   Write to file   #####
# Attach the tidied metadata to the proportions; the inner join keeps only
# oligos present in both tables (rows dropped from `metadata` disappear here too).
values = values.join(metadata, how='inner')
values.to_csv(out_path+'proportions.csv')

# Same for the raw counts, with columns ordered samples, beads controls, input controls
counts = counts[sample_cols+beads_cols+input_cols].join(metadata, how='inner')
counts.to_csv(out_path+'counts.csv')

#### Put the names of any samples that should be dropped here
master_drop_list = []
if master_drop_list:
    # One sample name per line; `with` closes the file even if a write fails
    with open('./drop.txt', 'w') as drop:
        for sample in master_drop_list:
            drop.write(sample+'\n')