In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
from scipy.stats import gaussian_kde
import os, sys

# 2018-06-08 Barcode collisions
We need to find out whether there is any way to get rid of the many collisions between the barcodes in a given library. To that end, I'll set up some code to check whether clear patterns emerge among the barcodes that are found to collide. For example, if a barcode is associated with one promoter 95% of the time and with another promoter 5% of the time, then we can consider the first promoter the clear winner. If not, then we're in trouble.

In [None]:
# where the per-promoter barcode files live, and which library to look at
datadir = '/mnt/ant-login/mcorrales/HPIP/libraries/Starcoded_proms'
lib = 5
prom_classes = ['A','B','C','D','E','F','G','H']

# prom_bcd_dict : barcode -> list of (promoter class, count) pairs
# bcd_lib_counts: promoter class -> sum of all the counts found in its file
prom_bcd_dict = {}
bcd_lib_counts = {}
for prom_class in prom_classes :
    prom_name = 'Promoter%s%d'%(prom_class,lib)
    fname = '%s/%s-starcoded.txt'%(datadir,prom_name)
    mbt.log_message('create_dict',"Processing %s"%(fname))
    counts = 0
    with open(fname) as f:
        for line in f:
            # every record has three whitespace-separated fields:
            # the barcode, its count, and a third one that is discarded
            bcd,bcd_counts,_ = line.split()
            bcd_counts = int(bcd_counts)
            bcd_info = (prom_class,bcd_counts)
            # a barcode seen in several classes accumulates one pair per class
            prom_bcd_dict.setdefault(bcd,[]).append(bcd_info)
            counts += bcd_counts
    bcd_lib_counts[prom_class] = counts

In [None]:
# For every barcode found in more than one promoter class, work out how
# dominant its best class is, and print a few examples along the way.
nbcds = 100   # examples are printed only among the first `nbcds` barcodes visited
n = 0
max_rel_ps = []
for bcd,prom_list in prom_bcd_dict.iteritems() :
    n += 1
    # a barcode seen in a single class is not a collision: nothing to compute
    # and, importantly, nothing to print (ps/rel_ps would be stale or undefined)
    if len(prom_list) < 2 :
        continue
    # frequency of the barcode within each class it was found in ...
    ps = [bcd_counts/bcd_lib_counts[prom_class] for prom_class,bcd_counts in prom_list]
    # ... normalised so that the values of one barcode sum to 1
    tot_p = sum(ps)
    rel_ps = [p/tot_p for p in ps]
    max_rel_ps.append(max(rel_ps))

    # for some visual output
    if n < nbcds :
        print(bcd)
        # pair every class with its own values, so each row shows the right class
        for (prom_class,_),p,rel_p in zip(prom_list,ps,rel_ps) :
            print("\tclass: %s, percentage: %.2e, relative p: %.2f"%(prom_class,p,rel_p))
max_rel_ps = np.array(max_rel_ps)

In [None]:
# Gaussian kernel density estimate (scipy.stats.gaussian_kde) of the
# maximum relative frequencies collected in max_rel_ps
k_max_rel_ps = gaussian_kde(max_rel_ps)

In [None]:
# evaluate the density estimate on a grid from 0 to 1 in steps of 0.01
x = np.arange(0.,1.01,0.01)
fig,ax = plt.subplots()
ax.plot(x,k_max_rel_ps(x))
ax.set_xlabel('Maximum frequency')
ax.set_ylabel('Distribution')
plt.show()

The results of this analysis show that most of the barcodes have a relatively high maximum relative frequency. However, more than 40% of them have a maximum probability below 90%. So for each of these barcodes we should actually store the probabilities and assign the barcode to a promoter probabilistically. This way we won't throw away any of the reads in the mapping experiment. The question is how to store this huge amount of information.

## A new dictionary
Here I want to write the code that will create a new dictionary, which we will access like this:
```python
# print information on a barcode
proms = prom_bcd_dict[bcd]
for prom_lib,prom_candidates in proms :
    print "Candidates from lib %d"%prom_lib
    for prom_candidate,probability in prom_candidates :
        print "\tCandidate %s has probability %.2f"%(prom_candidate,probability)
```
Now I'll draft the code that will create this data structure.

In [None]:
# STEP 1: parse all the files and collect all the barcodes
datadir = '/mnt/ant-login/mcorrales/HPIP/libraries/Starcoded_proms'
libs = range(1,13)
prom_classes = ['A','B','C','D','E','F','G','H']
# prom_bcd_dict : library -> barcode -> list of (promoter class, count) pairs
# bcd_lib_counts: '<class><library>' (e.g. 'A5') -> sum of the counts in that file
prom_bcd_dict = {}
bcd_lib_counts = {}
for lib in libs :
    d = {}
    for prom_class in prom_classes :
        prom_name = 'Promoter%s%d'%(prom_class,lib)
        fname = '%s/%s-starcoded.txt'%(datadir,prom_name)
        mbt.log_message('lib %d'%lib,"Processing %s"%(fname))
        counts = 0
        # a class/library pair without a file contributes no barcodes and
        # gets no entry in bcd_lib_counts
        if os.path.exists(fname) :
            with open(fname) as f:
                for lineno,line in enumerate(f):
                    # barcode, count, and a third field that is discarded
                    bcd,bcd_counts,_ = line.split()
                    bcd_counts = int(bcd_counts)
                    bcd_info = (prom_class,bcd_counts)
                    d.setdefault(bcd,[]).append(bcd_info)
                    counts += bcd_counts
                    # TODO: remove this for the full monty
                    # if lineno > 10000 : break
            bcd_lib_counts['%s%d'%(prom_class,lib)] = counts
    prom_bcd_dict[lib] = d

In [None]:
def calculate_rel_ps(prom_list, lib, bcd_lib_counts) :
    """Return the relative probabilities of the candidates of one barcode.

    prom_list is a sequence of (promoter class, count) pairs. Each count is
    divided by bcd_lib_counts['<class><lib>'], and the resulting fractions are
    then rescaled so that they sum to one. The returned list follows the
    order of prom_list.
    """
    fractions = [count/bcd_lib_counts['%s%d'%(name,lib)] for name,count in prom_list]
    norm = sum(fractions)
    return [fraction/norm for fraction in fractions]

In [None]:
def string_lib(prom_list,prom_lib,rel_ps) :
    """Format the candidates of one library as a comma-separated string.

    Every (promoter class, count) pair of prom_list becomes a
    '<class><prom_lib>:<p>' field, where p is the entry of rel_ps at the same
    position written with three decimals. An empty prom_list gives ''.
    """
    fields = []
    for idx,(prom_class,_) in enumerate(prom_list) :
        fields.append('%s%d:%.3f'%(prom_class,prom_lib,rel_ps[idx]))
    return ','.join(fields)

In [None]:
%%time
# STEP 2: compute the relative probabilities and output a file
# Each output line is '<barcode>\t<lib block>;<lib block>;...', where a lib
# block is the comma-separated string built by string_lib for one library.
prom_bcd_fname = 'test3.txt'
with open(prom_bcd_fname,'w') as f :
    # Visit the libraries in ascending order: a barcode is written once, when
    # the first library containing it is visited, together with the candidates
    # of every later library. Dict iteration order is not relied upon.
    sorted_libs = sorted(prom_bcd_dict)
    for lib_idx,lib in enumerate(sorted_libs) :
        earlier_libs = sorted_libs[:lib_idx]
        # every library after the current one, including the one right after it
        # (slicing `libs` with the library *value* skipped that neighbour)
        later_libs = sorted_libs[lib_idx+1:]
        for bcd,prom_list in prom_bcd_dict[lib].iteritems() :
            # important: a barcode that belongs to an earlier library has
            # already been written with the information of this library, so
            # skip it to avoid multiple entries in the output file. Checking
            # instead of deleting keeps prom_bcd_dict intact, so the cell
            # gives the same file when it is run again.
            if any(bcd in prom_bcd_dict[prev] for prev in earlier_libs) :
                continue
            rel_ps = calculate_rel_ps(prom_list,lib,bcd_lib_counts)
            blocks = [string_lib(prom_list,lib,rel_ps)]
            for lib_try in later_libs :
                prom_list_lib_try = prom_bcd_dict[lib_try].get(bcd)
                if prom_list_lib_try is not None :
                    rel_ps = calculate_rel_ps(prom_list_lib_try,lib_try,bcd_lib_counts)
                    blocks.append(string_lib(prom_list_lib_try,lib_try,rel_ps))
            f.write('%s\t%s\n'%(bcd,';'.join(blocks)))

In [None]:
# STEP 3 : create a parser for the output file
def parse_prom_bcd_dict(prom_bcd_fname, max_lines=None) :
    """Parse a barcode/promoter probability file into a dictionary.

    Every line of the file is expected to look like
    '<barcode>\\t<name>:<p>,<name>:<p>;<name>:<p>...', where ';' separates the
    libraries and ',' separates the candidates within a library.

    Parameters
    ----------
    prom_bcd_fname : path of the file to parse.
    max_lines : optional cap on the number of lines read from the file, handy
        for testing on a sample. The default (None) parses the whole file.

    Returns
    -------
    A dictionary barcode -> {promoter name: probability}, with the
    probabilities converted to float. The dictionary is always returned,
    also for short or empty files.
    """
    d = {}
    with open(prom_bcd_fname,'r') as f :
        for lineno,line in enumerate(f) :
            if max_lines is not None and lineno >= max_lines :
                break
            line = line.strip()
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line :
                continue
            bcd,all_candidates = line.split('\t')
            d_bcd = {}
            for candidates in all_candidates.split(';') :
                for candidate in candidates.split(',') :
                    prom_name,p = candidate.split(':')
                    # store numbers, not strings, so they can be used as probabilities
                    d_bcd[prom_name] = float(p)
            d[bcd] = d_bcd
    return d
# read back the file whose name is stored in prom_bcd_fname
pbd = parse_prom_bcd_dict(prom_bcd_fname)

In [None]:
# NOTE: sys.getsizeof only accounts for the object it is given, not for the
# objects that object refers to, so this figure leaves out the barcode keys
# and the nested dictionaries and underestimates the real memory footprint.
sys.getsizeof(pbd)