In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', font_scale=1.5, palette='Set2')
%matplotlib inline
from collections import defaultdict
from scipy.stats import entropy
import plotly.plotly as py
import plotly.graph_objs as go
import os
from pprint import pprint
from matplotlib import cm


In [3]:
out_path = './'  # prefix prepended to the file names of derived CSV output

# Read the enrichment table; the first CSV column is used as the row index.
data = pd.read_csv('../experiments/2018-01-08/processed/enrichment.csv', index_col=0)

# Annotation columns that describe each row rather than hold per-sample values.
metadata_cols = ['virus', 'start', 'end', 'sequence', 'strains']
# Every other column is treated as a sample (data) column.
sample_cols = [c for c in data.columns.values if c not in metadata_cols]

# A parenthesised single-argument print behaves identically on Python 2 and 3.
print(data.head())  # peek at the first few rows

                16418_CHIKVDay0_20ug  16418_CHIKVDay35_20ug  \
id                                                            
1                           0.664517               0.537128   
100                         0.925310               0.468594   
1001                        0.734245               0.197732   
1002                        2.405997               1.363379   
1008.1177.1346              0.640381               0.497251   

                24961_ZIKVDay28_20ug  25147_ZIKVDay28_20ug  \
id                                                           
1                           0.310872              0.357264   
100                         0.199741              0.657030   
1001                        0.000000              0.295953   
1002                        3.077846              1.176111   
1008.1177.1346              0.384816              0.503280   

                25421_ZIKVDay28_20ug  26021_ZIKVDay0_20ug  \
id                                                          
1

In [None]:
### Look at the distribution of enrichment scores in each column

def calc_percentiles(series):
    '''
    For every entry in `series`, compute the fraction of entries in that
    same series that are strictly smaller than it.

    The denominator is the full length of the series (NaN entries included).
    NaN entries themselves get 0.0, because no comparison against NaN is true.

    Returns a float Series with the same index as `series`.
    '''
    N = float(len(series))
    # rank(method='min') equals 1 + (count of strictly smaller non-NaN values),
    # so this replaces one full-series comparison per element with a single sort.
    n_values_below = series.rank(method='min') - 1.0
    # NaN inputs come back from rank() as NaN; map them to 0.0 (see docstring).
    return (n_values_below / N).fillna(0.0)

# Percentile of every score within its own sample column.
# The enrichment table is loaded into `data` earlier in this notebook; no cell
# shown here defines a frame called `enrichment`, so `data` is used throughout.
enrichment_percentiles = data[sample_cols].apply(calc_percentiles, axis=0)

def plot_enrichment_percentiles(serum, xlim=(0, 150)):
    '''
    Scatter-plot enrichment score (x) against its within-sample percentile (y)
    for one sample column, coloured by virus.

    serum: name of a column in `data` / `enrichment_percentiles`.
    xlim:  (min, max) limits applied to the x axis; defaults to (0, 150).

    A row is labelled with its virus name when that name occurs as a substring
    of `serum`; every other row is labelled 'Other'.
    '''
    autologous = [virus if virus in serum else 'Other'
                  for virus in data['virus']]

    values = pd.DataFrame({'x': data[serum],
                           'y': enrichment_percentiles[serum],
                           'hue': autologous})

    # Keyword arguments: newer seaborn releases reject positional x / y / data.
    grid = sns.lmplot(x='x', y='y', data=values, hue='hue',
                      fit_reg=False, scatter_kws={'alpha': 0.6})
    grid._legend.set_title('Epitope')

    plt.xlim(xlim)
    plt.ylabel('Score percentile')
    plt.xlabel('Enrichment score')
    plt.title(serum)
    plt.show()
    plt.close()  # release the figure; one is created per sample

for serum in sample_cols:
    plot_enrichment_percentiles(serum)

In [4]:
### Aggregate enrichment by site 
### (average enrichment scores across all oligos that include each site in a given virus's genome)

def find_all_oligos(virus, site, data=data):
    '''
    Return the positional (integer) row indices of `data` for rows whose
    'virus' equals `virus` and whose [start, end] interval contains `site`
    (both ends inclusive).
    '''
    is_virus = data['virus'] == virus
    starts_in_time = data['start'] <= site
    ends_late_enough = site <= data['end']

    covers_site = is_virus & starts_in_time & ends_late_enough
    # np.where on a 1-D boolean mask yields a 1-tuple; unpack the index array.
    (row_positions,) = np.where(covers_site)
    return row_positions
    
def aggregate_site(virus, site, data):
    '''
    Return a Series holding the column-wise mean over all rows of `data`
    whose oligo contains `site` for the given `virus`.

    Only numeric columns are averaged; string columns are skipped explicitly
    so the call does not raise on pandas versions that no longer drop them
    silently. If no row matches, the means are NaN.
    '''
    indices = find_all_oligos(virus, site, data)
    entries = data.iloc[indices]
    return entries.mean(axis=0, numeric_only=True)
        
def aggregate_virus_sites(virus, data=data, path=None):
    '''
    Build the site-by-column table of aggregated values for one virus.

    virus: value to match in data['virus'].
    data:  table with 'virus', 'start' and 'end' columns plus value columns.
    path:  optional CSV cache. If the file exists it is loaded and returned
           without recomputing; otherwise the result is written there.
           When `path` is not given, the result is written to
           out_path + virus + '_sitewise_enrichment.csv'.

    Returns a DataFrame indexed by site, one row per site.
    Raises ValueError if `data` has no rows for `virus`.
    '''
    if path and os.path.isfile(path):
        return pd.read_csv(path, index_col=0)

    # Select this virus's rows once instead of rebuilding the mask per lookup.
    virus_rows = data.loc[data['virus'] == virus]
    if len(virus_rows) == 0:
        raise ValueError('no rows found for virus %r' % (virus,))

    # int() so range() also accepts the bounds when the columns are float-typed.
    first_site = int(virus_rows['start'].min())
    last_site = int(virus_rows['end'].max())

    # NOTE(review): range() stops before last_site, while find_all_oligos treats
    # 'end' as inclusive (site <= end). One of the two is probably off by one --
    # confirm whether 'end' is inclusive before changing either.
    sites = range(first_site, last_site)
    aggregated_sites = {site: aggregate_site(virus, site, data) for site in sites}

    df = pd.DataFrame.from_dict(aggregated_sites, orient='index')

    # Write to the same location the cache check above reads from, so a later
    # call with the same `path` actually finds the file.
    out_file = path if path else out_path + virus + '_sitewise_enrichment.csv'
    out_dir = os.path.dirname(out_file)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    df.to_csv(out_file)
    return df

# Build {virus: sitewise table}, reusing a cached CSV per virus when one exists.
# Each virus is checked individually, so one missing file no longer forces the
# whole set down the fallback path, and real errors (corrupt CSV, bad
# permissions) surface instead of being hidden by a bare `except:`.
sitewise_dir = './sitewise_enrichment/'
if not os.path.isdir(sitewise_dir):
    os.makedirs(sitewise_dir)  # to_csv does not create missing directories

site_maps = {}
for virus in pd.unique(data['virus']):
    cache_file = './sitewise_enrichment/%s_sitewise_enrichment.csv' % virus
    if os.path.isfile(cache_file):
        site_maps[virus] = pd.read_csv(cache_file, index_col=0)
    else:
        sitewise = aggregate_virus_sites(virus, path=cache_file)
        sitewise.to_csv(cache_file)  # make sure the cache holds this result
        site_maps[virus] = sitewise

In [None]:
def plot_interactive_binding_footprints(virus):
    '''
    Plot site index (x) against aggregated value (y) for `virus`, one line
    per sample column, and send the figure to plot.ly via py.iplot.

    Columns named 'input' or 'beads' are skipped.
    Returns whatever py.iplot returns, so a caller can display it explicitly.
    '''
    values = site_maps[virus]

    traces = [
        go.Scatter(
            x=values[serum].index.values,
            y=values[serum].values,
            mode='lines',
            name=serum)
        for serum in sample_cols
        if serum not in ('input', 'beads')
    ]

    layout = dict(title='Oligos from %s' % virus,
                  xaxis=dict(title='Genomic position'),
                  yaxis=dict(title='Fold enrichment by sera'))

    # The layout was previously built but never handed to iplot, so the title
    # and axis labels were dropped; bundle it with the traces as a figure.
    figure = dict(data=traces, layout=layout)
    return py.iplot(figure, filename='2018-01-08_' + virus)


# Draw one interactive footprint plot per virus in `flavis`.
# NOTE(review): `flavis` is not defined in any cell shown in this notebook, so
# this loop raises NameError on a fresh kernel -- presumably it was a list of
# virus names (keys of `site_maps`); confirm and define it before this cell.
for virus in flavis:
    plot_interactive_binding_footprints(virus)