In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import hpiptools as ht
from numpy.lib.recfunctions import append_fields

# 2018-05-15 Post processing pipeline
After the very first preliminary analysis of the output of the HPIP pipeline, I want to move on to another basic question. That is: are there any very specific patterns that are already evident in how the promoters behave, as a function of the promoter class? To do this analysis, first I need to get a way of analysing the promoter classes. That's to say: how do I map "H5" to the promoter patterns? I need to dig into the files that Marc produced to figure this out.

In [None]:
# "mc_" stands for "Marc Corrales"
mc_hpip_root_dir = '/mnt/ant-login/mcorrales/HPIP'
promoter_table_fname = '%s/Doc/Dockerdata_hpip/hpip/100_sampled_proms.tsv'%(mc_hpip_root_dir)
# Column layout used to parse the table: four positional columns, then one
# integer column per promoter class (class_1 ... class_<ht.nclasses>), and
# finally a 'color' string and the integer 'class' column.
promoter_table_dtype = (
    [('chr','S4'), ('start',np.int32), ('end',np.int32), ('strand','S2')]
    + [('class_%d'%(k),np.int32) for k in range(1,ht.nclasses+1)]
    + [('color','S16'), ('class',np.int32)]
)
# parse the whole file into a structured array with the layout above
promoter_table = np.genfromtxt(promoter_table_fname,dtype=promoter_table_dtype)

I very quickly realize that answering any of the interesting questions will require putting the data into a data structure that is much more convenient than the "HPIPMatrix" that I defined in the previous notebook. The problem is that the promoter name assigned in the "integrations" file is not obviously related to the promoter number and class as defined in the "promoter_table" that I just parsed. The results will be very difficult to analyze if we provide an intermediate structure that does the mapping between the two names. It is therefore more sensible to start developing a pipeline that does the post-processing analysis of the data, so that the analysis will later be much easier.

## Structure of the post-processing pipeline

1. **GET**: parse the results of the HPIP pipeline
2. **NAME**: restore the original name of each promoter, which makes it possible to get its class with a simple arithmetic operation
3. **FILTER**: remove reads that map to weird chromosomes, remove collision barcodes
4. **SORT**: put all of these reads in order, by promoter and by chromosome
5. **NORMALIZE**: use the spikes and other magic to normalize the expression of the integration by the number of reads in the gDNA and so on.

I can start writing a procedural code to do this, wondering whether it will be sensible in the future to write it object-oriented.

In [None]:
def filter_and_name_insertion(insertion,
                          chromosome_list=['2L','2R','3L','3R','4','X','Y']) :
    """
    Decide whether a single insertion record is kept and, if so, rename
    its promoter.

    Parameters
    ----------
    insertion : record supporting item access by field name; the fields
        'promoter' and 'chr' are read, and 'promoter' is overwritten.
    chromosome_list : collection of chromosome names that are accepted.

    Returns
    -------
    None when the promoter field equals 'Colision' or when the chromosome
    is not in `chromosome_list`. Otherwise the same `insertion` object,
    modified in place so that its 'promoter' field holds
    `ht.plate_to_prom[<old promoter value>]`.
    """
    plate_name = insertion['promoter']
    # Reject first on the promoter label, then on the chromosome: the
    # chromosome is only looked at when the promoter label is acceptable.
    if plate_name == 'Colision' or insertion['chr'] not in chromosome_list :
        return None
    # translate the name found in the record through the hpiptools table
    insertion['promoter'] = ht.plate_to_prom[plate_name]
    return insertion

In [None]:
# NORMALIZE
def normalize_insertions(insertions) :
    """
    Turn raw mRNA/gDNA counts into a per-replicate normalized expression.

    Parameters
    ----------
    insertions : structured numpy array with (at least) the fields
        'barcode', 'chr', 'coord', 'strand', 'promoter', 'rep', 'mRNA'
        and 'gDNA'.

    Returns
    -------
    A new structured array of the same length with the fields 'barcode',
    'chr', 'coord', 'strand', 'promoter', 'rep' copied from the input and
    a float field 'expression'. For every replicate separately,
    expression = log2(eps/<eps>) with
    eps = (mRNA/mRNA_sum)/(gDNA/gDNA_sum), where the sums run over all the
    insertions of the replicate and <eps> is the mean of eps over the
    insertions of the replicate that have non-zero gDNA. Insertions with
    zero gDNA get NaN.
    """
    replicate_ids = np.unique(insertions['rep'])
    # one boolean selector per replicate, computed once and reused below
    in_replicate = {}
    for rep_id in replicate_ids :
        in_replicate[rep_id] = insertions['rep']==rep_id

    # total mRNA and gDNA reads of each replicate (all insertions included)
    mRNA_total = {}
    gDNA_total = {}
    for rep_id in replicate_ids :
        rep_insertions = insertions[in_replicate[rep_id]]
        mRNA_total[rep_id] = rep_insertions['mRNA'].sum()
        gDNA_total[rep_id] = rep_insertions['gDNA'].sum()

    # output array: the descriptive fields are copied verbatim
    copied_fields = ('barcode','chr','coord','strand','promoter','rep')
    out_dtype = np.dtype([
        ('barcode','S32'),
        ('chr','S4'),
        ('coord',np.int32),
        ('strand','S2'),
        ('promoter',np.int32),
        ('rep','S8'),
        ('expression',float)
    ])
    normalized = np.zeros(insertions.size,dtype=out_dtype)
    for field in copied_fields :
        normalized[field] = insertions[field]

    # The expression is accumulated in a plain array and stored at the end.
    # Insertions without gDNA counts cannot be normalized: they are NaN.
    zero_gDNA = insertions['gDNA'] == 0
    expression = np.zeros(insertions.size)
    expression[zero_gDNA] = np.nan
    for rep_id in replicate_ids :
        usable = np.logical_and(~zero_gDNA,in_replicate[rep_id])
        selected = insertions[usable]
        mRNA_fraction = selected['mRNA']/float(mRNA_total[rep_id])
        gDNA_fraction = selected['gDNA']/float(gDNA_total[rep_id])
        eps = mRNA_fraction/gDNA_fraction
        expression[usable] = np.log2(eps/eps.mean())
    normalized['expression'] = expression
    return normalized

In [None]:
# GET: load the insertions of every replicate produced by the HPIP pipeline
hpip_root_dir = '%s/work/CRG/projects/hpip'%(os.getenv('HOME'))
production_dir = '%s/production'%(hpip_root_dir)
reps = []
for rep_name in ['rep1','rep2'] :
    rep_fname = '%s/%s/HPIP_iPCR_%s_insertions.txt'%(production_dir,rep_name,rep_name)
    rep = ht.load_hpip_results(rep_fname,rep_name)
    reps.append(rep)

# MERGE: a single array with the insertions of all the replicates
merged = np.concatenate(reps)

# SORT: in place, by chromosome and then by coordinate
merged.sort(order=['chr','coord'])

# FILTER AND NAME: drop the rejected insertions, rename the promoter of the
# others (filter_and_name_insertion returns None for a rejected insertion)
filtered = []
for insertion in merged :
    f = filter_and_name_insertion(insertion)
    if f is not None :
        filtered.append(f)
# The dtype is passed explicitly so that the result is a structured array
# with the same fields as `merged` even when no insertion survives the
# filter; without it an empty list would become a plain float array with no
# fields, and the normalization step would fail on it.
filtered = np.array(filtered,dtype=merged.dtype)
# no need of storing the intermediate structure
del merged

In [None]:
# NORMALIZE: apply normalize_insertions to the filtered and renamed insertions
normalized = normalize_insertions(filtered)

This works. What remains is to decide how to handle insertions that have zero gDNA counts. I'll now move all this code to a separate file, so that the whole post-processing can be run reproducibly.

In [None]:
# a plain, democratic text file: one header line, then one tab-separated
# line per insertion, with the columns in the order given below
out_fname = '%s/HPIP_results.txt'%(production_dir)
text_columns = ('barcode','chr','strand','coord','promoter','rep','expression')
with open(out_fname, 'w') as out_file :
    out_file.write('# barcode chromosome strand coordinate promoter replicate expression\n')
    out_file.writelines(
        '%s\t%s\t%s\t%d\t%s\t%s\t%f\n'%tuple(record[column] for column in text_columns)
        for record in normalized
    )

# and a npy file for the Python enthusiasts: the structured array as it is
out_fname = '%s/HPIP_results.npy'%(production_dir)
np.save(out_fname,normalized)

In [None]:
# Small experiment on masked assignment into a structured array.
mock_dtype = [
    ('x','S2'),
    ('y',float)
]
N = 10
# structured array of N records, with the 'y' field set to 10 everywhere
A = np.zeros(N,dtype=mock_dtype)
A['y'] = 10
print A
# reproducible random boolean mask of length N
np.random.seed(1233)
mask = np.random.randint(2,size=N).astype(bool)
print mask
# Indexing with a boolean mask returns a copy (numpy advanced indexing), so
# this assignment writes into a temporary array: A itself is expected to be
# unchanged in the print below. Selecting the field first and then applying
# the mask (A['y'][mask] = 3) would assign into A.
A[mask]['y'] = 3
print A
# For comparison: a masked assignment applied directly to a plain array
# modifies that array in place.
y = np.zeros(N)
y[mask] = 3
print y