In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import hpiplib

# 2018-05-14 Preliminary analysis

Once the pipeline of the HPIP experiments finally works, we can start performing some analysis on its results. In this notebook I want to start setting up the variables, functions, and data structures that will allow me to perform statistical analysis of the results in a smooth and meaningful way. As usual, I will use this tool as a scratch book to test functions and all the rest. Once I'm happy with how things look, I'll migrate the results into a convenient Python library, which I will then hopefully be able to use later.

In [None]:
# I start with defining file names and paths
hpip_root_dir = '%s/work/CRG/projects/hpip'%(os.getenv('HOME'))
production_dir = '%s/production'%(hpip_root_dir)
tests_dir = '%s/tests'%(hpip_root_dir)
rep_names = ['rep1','rep2']
# rep_fnames maps the name of each replicate to the path of its insertions table
rep_fnames = {}
for rep_name in rep_names :
    rep_fname = '%s/%s/HPIP_iPCR_%s_insertions.txt'%(production_dir,rep_name,rep_name)
    rep_fnames[rep_name] = rep_fname
    # show only the tables that are actually present on disk; the parenthesised
    # single-argument print is valid both in Python 2 and in Python 3
    if os.path.exists(rep_fname) :
        print(rep_fname)

Now it's time to parse! I'll see if I can use the good old parser for the TRIP results.

In [None]:
# load the reporter data
def load_hpip_results(fname) :
    """
    Parse an HPIP insertions table into a numpy structured array.

    Parameters
    ----------
    fname : str
        path of the insertions file; it is read with `np.genfromtxt` using
        its default (whitespace) column splitting

    Returns
    -------
    numpy.ndarray
        structured array with the fields 'barcode', 'chr', 'strand',
        'coord', 'mRNA', 'promoter', 'cDNA' and 'gDNA', in this order
    """
    # names and types are listed in the order of the columns of the file
    field_names = ('barcode','chr','strand','coord','mRNA','promoter','cDNA','gDNA')
    field_types = ('S32','S32','S2',np.int32,np.int32,'S32',np.int32,np.int32)
    record_type = np.dtype(list(zip(field_names,field_types)))
    return np.genfromtxt(fname, dtype=record_type)

I'll test this function with a small file so that I can debug easily.

In [None]:
# parse a small table from the tests directory first: it is much easier to debug
toy_insertions_fname = '/'.join([tests_dir,'pipeline_with_make','Toy_iPCR_rep1_insertions.txt'])
toy = load_hpip_results(toy_insertions_fname)

In [None]:
# show the parsed toy table, to check by eye that the fields were read properly
toy

Okay, this function works. Now I can get the data for the real experiments, the two replicates. 

## Data structures

It is useful at this point to get the data into a nice structure.

In [None]:
class HPIPReplicate :
    """
    Container for the results of one replicate of the HPIP experiment.

    Parameters
    ----------
    name : str
        name of the replicate (e.g. 'rep1'). The insertions table is looked
        for in
        <hpip_root_dir>/production/<name>/HPIP_iPCR_<name>_insertions.txt
    hpip_root_dir : str
        root directory of the HPIP project

    Attributes
    ----------
    name : str
        name of the replicate
    data : numpy structured array or None
        output of `load_hpip_results` on the insertions table, or None if
        the table does not exist
    """
    def __init__(self,name,
                 hpip_root_dir = '%s/work/CRG/projects/hpip'%(os.getenv('HOME'))) :
        self.name = name
        production_dir = '%s/production'%(hpip_root_dir)
        fname = '%s/%s/HPIP_iPCR_%s_insertions.txt'%(production_dir,name,name)
        # make sure that the attribute always exists, even if there is no file
        self.data = None
        # check the file of *this* replicate: the test must be done on the
        # local `fname`, not on a global variable left over from another cell
        if os.path.exists(fname) :
            self.data = load_hpip_results(fname)

In [None]:
# build one HPIPReplicate per replicate, in the same order as in rep_names
rep_names = ['rep1','rep2']
reps = [HPIPReplicate(rep_name) for rep_name in rep_names]

In [None]:
class HPIPMatrix :
    """
    Values indexed by promoter name, laid out as an 8 x 12 matrix.

    Promoter names have the form 'Promoter<row><column>', with rows 'A' to
    'H' and columns 1 to 12 (e.g. 'PromoterB3' is the element (1,2) of the
    matrix). The special name 'Colision' (spelled exactly like this) is not
    part of the matrix and is stored separately.

    Parameters
    ----------
    dtype : numpy dtype
        type of the elements of the matrix (default: np.int32)

    Attributes
    ----------
    rows : list of str
        labels of the rows
    columns : list of int
        labels of the columns
    libraries : list of str
        all the promoter names, row by row
    M : numpy.ndarray
        the matrix of values, of shape (len(rows),len(columns)), initialized
        to zero
    C : value associated to 'Colision'. It starts as a zero array of length
        1, and is replaced by whatever is assigned to the 'Colision' item.
    """
    def __init__(self,dtype=np.int32) :
        self.rows = ['A','B','C','D','E','F','G','H']
        self.columns = list(range(1,13))
        self.M = np.zeros((len(self.rows),len(self.columns)),dtype=dtype)
        self.C = np.zeros(1,dtype=dtype)
        # the list of names and the internal lightweight dictionary that maps
        # names of promoters to matrix elements are built in a single pass,
        # so that they cannot get out of sync
        self.libraries = []
        self._prom_to_idx = {}
        for i,row in enumerate(self.rows) :
            for j,column in enumerate(self.columns) :
                promoter_name = 'Promoter%s%d'%(row,column)
                self.libraries.append(promoter_name)
                self._prom_to_idx[promoter_name] = (i,j)
    def map_name_to_idx(self,name) :
        """
        Return the (row,column) indices of promoter `name` in the matrix.
        Raises KeyError if `name` is not a valid promoter name.
        """
        return self._prom_to_idx[name]
    def __getitem__(self,name) :
        """
        Return the value associated to `name`: the 'Colision' value or the
        corresponding matrix element. If `name` is unknown, an error message
        is printed and None is returned.
        """
        if name == 'Colision' :
            return self.C
        try :
            return self.M[self.map_name_to_idx(name)]
        except KeyError :
            # best effort: report the problem but do not stop the analysis
            print("Error: %s does not exist"%(name))
            return None
    def __setitem__(self,name,val) :
        """
        Associate `val` to `name`: the 'Colision' value or the corresponding
        matrix element. If `name` is unknown, an error message is printed
        and nothing is stored.
        """
        if name == 'Colision' :
            self.C = val
        else :
            try :
                self.M[self.map_name_to_idx(name)] = val
            except KeyError :
                # best effort: report the problem but do not stop the analysis
                print("Error: %s does not exist"%(name))

## Analysis: the basics

### How many integrations?

In [None]:
# get the number of integrations per replicate, that is, the number of records
# in the table of each replicate
for rep in reps :
    # parenthesised single-argument print: valid both in Python 2 and in Python 3
    print('%s : %d integrations'%(rep.name,rep.data.size))

### How many promoters were found in each replicate?

In [None]:
# get the promoters that were found in each replicate
for rep in reps :
    rep.promoters = np.unique(rep.data['promoter'])

# do the two sets coincide? Report the promoters that are found in one
# replicate only. np.setdiff1d returns the sorted unique values of the first
# array that are not in the second, so the names come out in the same order
# as by scanning the (already sorted and unique) promoters one by one.
for p in np.setdiff1d(reps[0].promoters,reps[1].promoters) :
    print("%s not found in replicate 2"%(p))

# same check, in the opposite direction
for p in np.setdiff1d(reps[1].promoters,reps[0].promoters) :
    print("%s not found in replicate 1"%(p))

Okay, these differences between the promoters found in the two replicates need to be cleared up with Marc.

### How many collisions?

In [None]:
# collisions are the records whose promoter field is 'Colision': count them
# and report which fraction of all the records of the replicate they are
for rep in reps :
    rep.collisions = rep.data[rep.data['promoter']=='Colision']
    n_records = rep.data.size
    # an empty replicate would otherwise raise a ZeroDivisionError
    if n_records :
        percent_collisions = rep.collisions.size/float(n_records)*100
    else :
        percent_collisions = 0.0
    # parenthesised single-argument print: valid both in Python 2 and in Python 3
    print('%s : %d collisions (%.2f%%)'%(rep.name,rep.collisions.size,
                                percent_collisions))

### How many integrations per promoter?

In [None]:
# keep only the "good" integrations: the records of the barcodes that were
# not identified as collisions
for rep in reps :
    not_collision = rep.data['promoter']!='Colision'
    rep.integrations = rep.data[not_collision]

In [None]:
# for each replicate, count how many integrations we have for each promoter,
# store the counts in a matrix, and show the matrix as a heat map of
# log2(1+counts), one figure per replicate
for rep in reps :
    counts = HPIPMatrix()
    rep.promoter_counts = counts
    for promoter in rep.promoters :
        same_promoter = rep.integrations['promoter']==promoter
        p = rep.integrations[same_promoter]
        counts[promoter] = p.size
    n_rows = len(counts.rows)
    n_columns = len(counts.columns)
    cax = plt.matshow(np.log2(1+counts.M),cmap=plt.cm.Greens)
    plt.colorbar(cax)
    # label the ticks with the names of the columns and of the rows
    plt.xticks(range(n_columns),counts.columns)
    plt.yticks(range(n_rows),counts.rows)
    plt.title(rep.name,fontsize=32,y=1.1)

We see that there is quite a clear pattern in the sense that there are two experiments (labelled 1 and 6) that worked worse than the others. Experiments 7 and 8 seem to be much better.

In [None]:
# one bar plot per replicate: the integration counts summed over the rows of
# the matrix, that is, one total for each column
for rep in reps :
    experiments = rep.promoter_counts.columns
    counts_per_experiment = rep.promoter_counts.M.sum(axis=0)
    plt.bar(experiments,counts_per_experiment)
    plt.title(rep.name,fontsize=32)
    plt.xlabel('TRIP experiment')
    plt.ylabel('Integration counts')
    plt.xticks(range(1,1+len(experiments)))
    plt.show()