In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os, sys, gzip
import hpiptools as ht
import mybiotools as mbt

# 2018-06-21 Preparing "collect_integrations"

In this notebook I want to prepare the final part of the pipeline, which consists of two separate steps:

1. Collect the mapping and counting information from each library. This is the easy step, in which I only need to parse relatively simple files.
2. Assign a promoter to each barcode. This is the difficult step. To do this, I need to harness all the information that I have, which currently is:
    - what the promoter-barcode dictionary says
    - what the iPCR, cDNA, and gDNA indices say

The latter step is the most complicated, because the initial assignment of the reads to a particular library, based on the index, was potentially not done correctly. Therefore, I want to store all the information that I possibly have in a data structure that will let me work out how to assign the promoter in the best possible way.

In [None]:
# Root of the HPIP project tree, resolved under the directory named by $HOME
hpip_root = '%s/work/CRG/projects/hpip' % os.getenv('HOME')

# Library names: lib2 ... lib12, followed by the 'undetermined' entry
libs = ['lib%d' % i for i in range(2, 13)] + ['undetermined']

In [None]:
class Replicate(object) :
    """Container for everything collected about one replicate.

    Only the name is set at construction time. The following cells of this
    notebook attach the parsed data as plain attributes:

    - cDNA_canonicals, cDNA_counts, gDNA_canonicals, gDNA_counts
    - iPCR_canonicals, iPCR_counts, mapped
    - cDNA_transitions, gDNA_transitions

    Parameters
    ----------
    rep_name : str
        Name of the replicate; it is also used as the name of the
        replicate's sub-directory when building data file paths.
    """
    def __init__(self, rep_name) :
        self.rep_name = rep_name

    def __repr__(self) :
        # readable representation when a replicate (or the list of
        # replicates) is displayed as a cell output
        return '%s(%r)' % (self.__class__.__name__, self.rep_name)

In [None]:
# one Replicate container per replicate name
rep_names = ['rep1', 'rep2']
reps = [Replicate(rep_name) for rep_name in rep_names]

In [None]:
for rep in reps :
    # Tables built from ALL the starcoded cDNA and gDNA output files of this
    # replicate; every table is keyed by library name.
    cDNA_canonicals, cDNA_counts = {}, {}
    gDNA_canonicals, gDNA_counts = {}, {}

    for lib in libs :
        libdir = '%s/data/triplibs/%s/%s'%(hpip_root, rep.rep_name, lib)
        cDNA_starcode_fname = '%s/cDNA-starcode.txt'%(libdir)
        gDNA_starcode_fname = '%s/gDNA-starcode.txt'%(libdir)

        # a library is parsed only when BOTH starcode files exist; otherwise
        # it gets no entry in any of the four tables
        if os.path.exists(cDNA_starcode_fname) and\
           os.path.exists(gDNA_starcode_fname) :
            ht.log_message(rep.rep_name, 'Parsing %s starcode'%(lib))
            cDNA_parsed = ht.parse_starcode(cDNA_starcode_fname)
            cDNA_canonicals[lib], cDNA_counts[lib] = cDNA_parsed
            gDNA_parsed = ht.parse_starcode(gDNA_starcode_fname)
            gDNA_canonicals[lib], gDNA_counts[lib] = gDNA_parsed

    # store the tables on the replicate
    rep.cDNA_canonicals, rep.cDNA_counts = cDNA_canonicals, cDNA_counts
    rep.gDNA_canonicals, rep.gDNA_counts = gDNA_canonicals, gDNA_counts

In [None]:
for rep in reps :
    # per-replicate tables, each keyed by library name
    iPCR_canonicals, iPCR_counts, mapped = {}, {}, {}

    for lib in libs :
        # input files live in the data directory of the library under study
        lib_dir = '%s/data/triplibs/%s/%s'%(hpip_root, rep.rep_name, lib)
        iPCR_starcode_fname = '%s/iPCR-starcode.txt'%(lib_dir)
        iPCR_sam_fname = '%s/iPCR.sam'%(lib_dir)

        # starcoded barcodes first ...
        ht.log_message(rep.rep_name, 'Processing %s iPCR starcode'%(lib))
        iPCR_parsed = ht.parse_starcode(iPCR_starcode_fname)
        iPCR_canonicals[lib], iPCR_counts[lib] = iPCR_parsed

        # ... then the mapped reads. N and nmapped are returned by the parser
        # but not used in this cell (presumably read totals - TODO confirm).
        ht.log_message(rep.rep_name, 'Processing %s mapped file'%(lib))
        mapped[lib], N, nmapped = ht.parse_mapped(iPCR_sam_fname)

    # store the tables on the replicate
    rep.iPCR_canonicals, rep.iPCR_counts = iPCR_canonicals, iPCR_counts
    rep.mapped = mapped

In [None]:
# For every replicate, count how many mapped iPCR barcodes of library i
# (rows) have their canonical barcode also present among the cDNA / gDNA
# canonicals of library j (columns).
nlibs = len(libs)
for rep in reps :

    rep.cDNA_transitions = np.zeros((nlibs, nlibs), dtype=np.int32)
    rep.gDNA_transitions = np.zeros((nlibs, nlibs), dtype=np.int32)

    # The cDNA/gDNA parsing step skips libraries whose starcode files are
    # missing, so those libraries have no entry in the tables: treat them as
    # empty instead of failing with a KeyError. Resolved once per replicate
    # so that the inner loop only does membership tests.
    cDNA_by_lib = [rep.cDNA_canonicals.get(lib, {}) for lib in libs]
    gDNA_by_lib = [rep.gDNA_canonicals.get(lib, {}) for lib in libs]

    # go through all the libraries
    for i, lib_origin in enumerate(libs) :

        ht.log_message(rep.rep_name, 'Processing %s'%(lib_origin))

        iPCR_canonicals_origin = rep.iPCR_canonicals[lib_origin]

        # go through all the mapped barcodes in that library
        for bcd in rep.mapped[lib_origin] :
            try :
                canonical = iPCR_canonicals_origin[bcd]
            except KeyError :
                # this occurs if a barcode was removed from the output
                # sequences of starcode because of ambiguous cluster
                # assignment.
                continue

            # now we try to find a match between the canonical and the cDNA
            # and gDNA canonicals in ALL libraries (the origin included)
            for j in range(nlibs) :
                if canonical in cDNA_by_lib[j] :
                    rep.cDNA_transitions[i,j] += 1
                if canonical in gDNA_by_lib[j] :
                    rep.gDNA_transitions[i,j] += 1

In [None]:
# figure preparation: tick labels are the library names cut to 5 characters
labels = [s[:5] for s in libs]
for rep in reps :
    fig, axes = plt.subplots(2, 1, figsize=(8, 16))
    x = range(len(libs))

    # one panel per read type: cDNA on top, gDNA below
    panels = (('cDNA', rep.cDNA_transitions),
              ('gDNA', rep.gDNA_transitions))
    for row, (ylabel, transitions) in enumerate(panels) :
        ax = axes[row]
        ax.matshow(mbt.row_normalize_matrix(transitions.astype('float')))
        ax.set_xticks(x)
        ax.set_xticklabels(labels, rotation=90)
        ax.set_yticks(x)
        ax.set_yticklabels(labels)
        ax.set_ylabel(ylabel)
        if row == 0 :
            # the replicate name is shown once, above the top panel
            ax.set_title(rep.rep_name, y=1.1, fontsize=32)

    plt.show()

## Assigning the promoter to each integration

In [None]:
# init promoter-barcode dictionary
# NOTE(review): PBD only receives the project root; presumably it locates and
# loads the dictionary files under that tree - confirm in hpiptools.
pbd = ht.PBD(hpip_root)

In [None]:
# promoter libraries are numbered 1 to 12
prom_libs = range(1,13)
# index of each promoter class letter: 'A' -> 0 ... 'H' -> 7
prom_class_idx = dict((letter, row) for row, letter in enumerate('ABCDEFGH'))
# index of each promoter library, keyed by the library number as a string
prom_lib_idx = dict((str(prom_lib), prom_lib-1) for prom_lib in prom_libs)

In [None]:
# Count how many times each promoter (class x library) is proposed by the
# promoter-barcode dictionary for the mapped barcodes.
#
# The barcodes are taken from rep.mapped[lib] for every replicate and every
# library. The module-level name `mapped` must NOT be used here: it is the
# leftover loop variable of the iPCR parsing cell, i.e. the table of the last
# replicate keyed by library name, so iterating over its keys would look up
# library names instead of barcodes.
# NOTE(review): a barcode present in several libraries/replicates is counted
# once per occurrence - confirm that this is the intended aggregation.
promoters = np.zeros((len(prom_class_idx), len(prom_lib_idx)))
n = 0   # number of barcodes looked up in the dictionary
for rep in reps :
    for lib in libs :
        for bcd in rep.mapped[lib] :
            n += 1
            candidates = pbd.findbcd(bcd)
            if candidates is None :
                # barcode not found in the promoter-barcode dictionary
                continue
            candidates = ht.parse_bcd(candidates)
            for candidate in candidates :
                prom_class, prom_lib = ht.prom_id(candidate)
                promoters[prom_class_idx[prom_class],
                          prom_lib_idx[prom_lib]] += 1

In [None]:
# heat map of log10(count + 1) for every promoter class / library pair
log_counts = np.log10(promoters+1)
cax = plt.matshow(log_counts)

# tick positions and labels come from the index tables
lib_ticks, lib_names = prom_lib_idx.values(), prom_lib_idx.keys()
class_ticks, class_names = prom_class_idx.values(), prom_class_idx.keys()
plt.xticks(lib_ticks, lib_names)
plt.yticks(class_ticks, class_names)

plt.colorbar(cax)
plt.show()

In [None]:
# total count per promoter library (summed over the promoter classes),
# shown on a log10 scale
counts_per_lib = promoters.sum(axis=0)
plt.bar(range(12), np.log10(counts_per_lib))

lib_ticks, lib_names = prom_lib_idx.values(), prom_lib_idx.keys()
plt.xticks(lib_ticks, lib_names)
plt.xlabel('Library ID')
plt.ylabel('Log_10 Counts')
plt.show()