In [None]:
# render matplotlib figures inline, below the cell that produces them
%matplotlib inline
# load the autoreload extension and reload imported modules before each cell
# is executed, so that edits to local modules are picked up without
# restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import pickle
import re
import gzip
import os

# 2018-06-06 Library sanity check
I reran the `bcl2fastq` program with a new sample sheet, created starting from a count of how many times each index is found in the reads. Now I want to see whether associating a given read to a given library in this way actually gives a sensible result. The first thing is to load the newly processed barcode-promoter association dictionary. Reminder: in this new version, all the candidate promoters are given. If the read-library association was done correctly, I should observe that for any given read there is a clear winner, which corresponds to the library itself.

In [None]:
# load new dictionary
prom_bcd_d_fname = '/home/rcortini/work/CRG/projects/hpip/scratch/prom_bcd_dict/prom_bcd_d.p'
# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by this project. The `with` block makes sure the file handle is
# closed as soon as the dictionary has been read.
with open(prom_bcd_d_fname,'rb') as prom_bcd_d_file :
    prom_bcd_d = pickle.load(prom_bcd_d_file)

In [None]:
datadir = '/mnt/ant-login/mcorrales/HPIP/libraries'

# Build `barcodesd`, a dictionary
#     library ID -> {barcode : [names of the promoters it was found with]}
# from the starcoded promoter files.
data_path = datadir + "/Starcoded_proms/"
# The dot is escaped and the pattern is anchored at the end, so that stray
# files such as editor backups ("PromoterA1-starcoded.txt~") are not parsed
# a second time.
fn_re = re.compile(r"Promoter[A-H][0-9][0-9]?-starcoded\.txt$")
# os.listdir returns names in arbitrary order: sort them so that the promoter
# lists are built in a reproducible order.
starcodedfn = sorted(fname for fname in os.listdir(data_path)
                     if fn_re.match(fname))

# library IDs: init the barcodes dictionary
libs = list(range(1,13))
barcodesd = {lib : {} for lib in libs}

for fname in starcodedfn:
    # file names look like "PromoterA1-starcoded.txt": the promoter name is
    # what precedes the dash, and the library ID is what follows the class
    # letter in the promoter name
    promname = fname.split("-")[0]
    prom_lib_id = int(promname.split('Promoter')[1][1:])
    mbt.log_message('create_dict',"Processing %s"%(fname))
    with open(data_path + fname) as f:
        for line in f:
            fields = line.split()
            # tolerate blank lines instead of failing on fields[0]
            if not fields :
                continue
            # The barcode is the first column. A barcode seen with several
            # promoters of the same library is a collision: all the candidate
            # promoters are recorded.
            barcodesd[prom_lib_id].setdefault(fields[0],[]).append(promname)

In [None]:
def prom_class_and_lib(prom_name) :
    """
    Split a promoter name into its class letter and its library.

    The text that follows 'Promoter' in `prom_name` is cut in two: its first
    character is the promoter class, the rest is the library. Both are
    returned as strings, e.g. 'PromoterA12' -> ('A','12').
    """
    suffix = prom_name.split('Promoter')[1]
    return suffix[0],suffix[1:]

In [None]:
def bcd_lib_assignment(rep,prom_bcd_d) :
    """
    Count, for each iPCR library of replicate `rep`, how many reads carry a
    barcode that is known to each of the promoter libraries.

    Parameters
    ----------
    rep : str
        Replicate name (e.g. 'rep1'), substituted in the path of the
        directory that holds the iPCR FASTQ files.
    prom_bcd_d : dict
        Dictionary library ID -> {barcode : list of promoter names}. It must
        have an entry for every library ID from 1 to 12, and the promoter
        names must look like 'PromoterA1' (class letter A-H, library 1-12).

    Returns
    -------
    numpy.ndarray of shape (12,12,8) and dtype int32
        Element [i,j,k] is the number of hits found in the reads of iPCR
        library i+1 for promoters of library j+1 and class 'ABCDEFGH'[k].
        A read counts once for every candidate promoter of its barcode.
    """
    mc_datadir = '/mnt/ant-login/mcorrales/HPIP/iPCR/HPIP_iPCR_%s/Data/Intensities/BaseCalls'%(rep)
    libs = list(range(1,13))
    prom_bcd_counts = np.zeros((12,12,8),dtype=np.int32)
    prom_class_idx = {'A':0,'B':1,'C':2,'D':3,
                  'E':4,'F':5,'G':6,'H':7}
    prom_lib_idx = {lib : lib-1 for lib in libs}
    # fetch the per-library dictionaries once, not once per read
    lib_dicts = [prom_bcd_d[lib_try] for lib_try in libs]
    for lib in libs :
        mbt.log_message(rep,'Parsing library %d'%lib)
        # the sample number in the file name is lib-1
        iPCR_fname = '%s/iPCR%d_S%d_R1_001.fastq.gz'%(mc_datadir,lib,lib-1)
        # skip the libraries that have no file (library 1, basically)
        if not os.path.exists(iPCR_fname) : continue
        with gzip.open(iPCR_fname) as f :
            for lineno,line in enumerate(f,1) :
                # FASTQ records are 4 lines long: the sequence of the read is
                # the second one
                if lineno%4 != 2 :
                    continue
                # the barcode is the first 20 characters of the read
                bcd = line[:20]
                # gzip yields bytes under Python 3, whereas the dictionary
                # keys are text: without decoding no barcode would ever match
                if not isinstance(bcd,str) :
                    bcd = bcd.decode('ascii','replace')
                # for every barcode, examine all the possibilities:
                # it can be in any of the 12 libraries
                for lib_dict in lib_dicts :
                    # single lookup (dict.has_key does not exist in Python 3)
                    prom_list = lib_dict.get(bcd)
                    if prom_list is None :
                        continue
                    # the barcode exists in the dictionary: count every
                    # candidate promoter
                    for prom in prom_list :
                        prom_class,prom_lib = prom_class_and_lib(prom)
                        prom_bcd_counts[
                            lib-1,
                            prom_lib_idx[int(prom_lib)],
                            prom_class_idx[prom_class]
                        ] += 1
    return prom_bcd_counts

In [None]:
# Run the barcode-to-library counting on both replicates. This reads every
# iPCR FASTQ file, so it is the slow step of the notebook.
# NOTE(review): the dictionary passed here is `barcodesd` (built from the
# starcoded files), not the pickled `prom_bcd_d` -- confirm this is intended.
reps = ['rep1','rep2']
bcd_lib = dict()
for rep in reps :
    bcd_lib[rep] = bcd_lib_assignment(rep,barcodesd)

In [None]:
# Collapse the promoter class axis to get a 12x12 (library origin x assigned
# library) matrix of counts per replicate, then row-normalize it with
# mbt.row_normalize_matrix.
reps = ['rep1','rep2']
counts_mat = dict()
for rep in reps :
    lib_by_lib_counts = bcd_lib[rep].sum(axis=2).astype(float)
    counts_mat[rep] = mbt.row_normalize_matrix(lib_by_lib_counts)

In [None]:
# One heat map per replicate: the rows are the libraries the reads come from,
# the columns are the libraries their barcodes are assigned to.
lib_ticks = range(12)
lib_labels = [str(i+1) for i in lib_ticks]
for rep in reps :
    fig,ax = plt.subplots(1,1,figsize=(8,8))
    heatmap = ax.matshow(counts_mat[rep],cmap=plt.cm.Greens)
    cbar = plt.colorbar(heatmap)
    cbar.set_label('Frequency')
    # label the ticks with the library IDs (1 to 12)
    ax.set_xticks(lib_ticks)
    ax.set_xticklabels(lib_labels)
    ax.set_yticks(lib_ticks)
    ax.set_yticklabels(lib_labels)
    # matshow draws the x ticks on top: move the x label there too
    ax.set_xlabel('Barcode library assignment',fontsize=32)
    ax.xaxis.set_label_position('top')
    ax.set_ylabel('Library origin',fontsize=32)
    plt.show()

These graphs show that there is a substantial number of things to fix in the data set. Let's have a look at the percentage of collisions already present in the promoter-barcode association library.

In [None]:
# For every library, print the number of barcodes and the percentage of them
# that are associated to more than one promoter (collisions).
for lib_id in sorted(barcodesd) :
    d = barcodesd[lib_id]
    # a collision is a barcode with more than one candidate promoter
    n = sum(1 for bcd in d if len(d[bcd])>1)
    # a library without barcodes has no collisions: do not divide by zero
    collisions_pct = n*100/float(len(d)) if d else 0.0
    # a single parenthesized string prints the same under Python 2 and 3
    print("Library %s: %d barcodes, %.2f%% collisions"%(lib_id,len(d),collisions_pct))

The next step will be to look at the statistics of barcode coll