In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os, sys, struct

# 2018-06-20 An indexed dictionary
To assign a promoter to a barcode, some trial and error is always necessary, and the promoter-barcode dictionary has to be consulted many times. For this reason, it is useful to have a handy way of accessing the dictionary without loading the whole of it into memory. To do that, I need an indexing method.

## Step 1. How to read and write a file index in Python

Suppose that I have a file, and I want to write to a separate file the positions at which each of its lines starts. I can then use the `seek` method to jump directly to a particular line of the file. Let's have a look.

In [None]:
def make_file_index(fname, idx_fname) :
    """Write a binary index of the line-start offsets of a file.

    For every line of `fname`, one record is written to `idx_fname`,
    packed with the native 'Q' struct format (unsigned 64-bit integer,
    8 bytes). Record k holds the byte offset at which line k (0-based)
    starts, so the index has exactly as many records as the file has
    lines, and seeking the file to the k-th stored offset lands on the
    beginning of line k.

    Note: no trailing line-count record is appended to the index.

    Parameters
    ----------
    fname : str
        Path of the file to index.
    idx_fname : str
        Path of the index file to create (overwritten if it exists).

    Returns
    -------
    None
    """
    # Both files are opened in binary mode: the length of each raw line
    # is then exactly the number of bytes it occupies on disk, so the
    # offsets can be accumulated while iterating normally, without
    # relying on tell() (which cannot be combined with line iteration).
    with open(fname,'rb') as f_in, open(idx_fname,'wb') as f_out :
        offset = 0
        for line in f_in :
            # Store where this line STARTS. Recording the position after
            # the line instead would make the first line unreachable and
            # leave a final record pointing at the end of the file.
            # The packed bytes are written as they are: formatting them
            # through '%s' only yields the raw bytes on Python 2.
            f_out.write(struct.pack("Q", offset))
            offset += len(line)

In [None]:
# now let's open the index, and read the n-th line of the original file
def file_read_nth_line(n, fname, idx_fname) :
    """Return the line of `fname` located through the n-th index record.

    The index file `idx_fname` is a sequence of fixed-size records, each
    one an offset into `fname` packed with the native 'Q' struct format
    (8 bytes). The n-th record (0-based) is read, `fname` is positioned
    at the offset it contains, and one line is read from there.

    Parameters
    ----------
    n : int
        0-based position of the record to read from the index.
    fname : str
        Path of the indexed file.
    idx_fname : str
        Path of the binary index file.

    Returns
    -------
    str or None
        The line read from `fname` (as returned by `readline`), or None
        if the index does not contain a complete n-th record.
    """
    record_size = struct.calcsize('Q')

    # fetch the n-th record from the index
    with open(idx_fname, 'rb') as f :
        f.seek(n*record_size, os.SEEK_SET)
        i_binary = f.read(record_size)

    # A short read means that we ran past the end of the index, that is,
    # there is no n-th record. The length is tested (rather than
    # comparing with '') so that the check also holds when read()
    # returns a bytes object, and so that a truncated trailing record is
    # not handed to struct.unpack.
    if len(i_binary) < record_size :
        return None

    # convert the record to a python integer: the offset into the
    # original file
    i = struct.unpack('Q', i_binary)[0]

    # read original file and jump to the correct line
    with open(fname, 'r') as f :
        f.seek(i, os.SEEK_SET)
        return f.readline()

In [None]:
# example: a BCL2FASTQ sample sheet. The path is built from the current
# user's home directory instead of being hard-coded for one machine.
example_file = '%s/work/CRG/projects/hpip/data/raw/iPCR/HPIP_iPCR_rep1/SampleSheet.csv'%(os.path.expanduser('~'))
idx_file = 'idx.txt'

# make the index
make_file_index(example_file, idx_file)

# now read the line of the original file that the n-th index record
# points to. print() with a single argument behaves the same on
# Python 2 and Python 3.
n = 44
print(file_read_nth_line(n, example_file, idx_file))

Okay, now we have a few basic functions that allow us to create and read the file index.

## Step 2. Write a sorted barcode dictionary.
We now need to go back to the point where we read the barcodes from all the promoter starcoded files. The smarter thing to do would be to write the index of that file while writing the file itself.

It turns out that I don't need to write a single line of code to accomplish this. Using the GNU coreutils `sort` command, I can sort the barcode dictionary directly, with no options. It takes approximately nine minutes on my computer.

## Step 3. Seek a barcode from the indexed, sorted dictionary

Now that the promoter-barcode dictionary is sorted and I have the index for it, I can write a function that looks up a barcode in the dictionary in logarithmic time, using a binary search.

In [None]:
# Root of the HPIP project tree, under the current user's home directory.
# os.path.expanduser is used instead of os.getenv('HOME') because the
# latter returns None when HOME is not set, which would silently produce
# a path starting with 'None/'.
hpip_root = '%s/work/CRG/projects/hpip'%(os.path.expanduser('~'))
# directory, file name and index file name of the promoter-barcode
# dictionary (pbd)
pbd_datadir = '%s/data/pbd'%(hpip_root)
pbd_fname = '%s/pbd.txt'%(pbd_datadir)
pbd_idx = '%s.idx'%(pbd_fname)

In [None]:
def findbcd(bcd, pbd_fname, pbd_idx) :
    """Binary-search a sorted promoter-barcode dictionary for a barcode.

    The first 20 characters of each probed line are compared with `bcd`,
    so the lines of `pbd_fname` must be sorted by that prefix. Lines are
    fetched through `file_read_nth_line`, using the index `pbd_idx`.

    The last 8-byte record of the index is read as N, the number of
    lines in the dictionary (the convention of the pbd index), and index
    positions 0 to N-1 are searched.
    NOTE(review): this assumes that records 0..N-1 of the index locate
    the lines of the dictionary -- confirm against the code that writes
    the pbd index.

    Parameters
    ----------
    bcd : str
        Barcode to look for.
    pbd_fname : str
        Path of the sorted promoter-barcode dictionary.
    pbd_idx : str
        Path of the binary index of `pbd_fname`.

    Returns
    -------
    str or None
        The full dictionary line whose first 20 characters equal `bcd`,
        or None if there is no such line.
    """
    # open the pbd index and read the value at the last position,
    # which by convention corresponds to the number of lines in the
    # original file
    with open(pbd_idx, 'rb') as f :
        f.seek(-8, os.SEEK_END)
        N = struct.unpack('Q', f.read(8))[0]

    # Iterative search over the closed interval [range_lo, range_hi].
    # Both bounds are inclusive and are moved past 'mid' at every step,
    # so that every position, including the first (0) and the last
    # (N-1), can be probed. Treating the bounds as exclusive would never
    # examine those two positions.
    range_lo = 0
    range_hi = N-1
    while range_lo <= range_hi :
        mid = (range_hi+range_lo)//2
        line = file_read_nth_line(mid, pbd_fname, pbd_idx)

        # the index has fewer records than N claims: everything from
        # 'mid' on is past the end, keep searching below
        if line is None :
            range_hi = mid-1
            continue

        this_bcd = line[:20]
        if this_bcd < bcd :
            range_lo = mid+1
        elif this_bcd > bcd :
            range_hi = mid-1
        else :
            return line

    # if we are here, then the barcode was not found
    return None

In [None]:
# look up one barcode with the function-based implementation.
# print() with a single argument behaves the same on Python 2 and 3.
bcd = 'TTTTTTTTTTTTTTTTTTGG'
print(findbcd(bcd, pbd_fname, pbd_idx))

Okay, this works very well and is quite fast.

## Bonus: object-oriented approach

This is all well and good, but the functions as they are written now are rather inefficient, because they open and close the files again on every access. A better approach is to open the files once and to close them when they are no longer needed.

In [None]:
class PBD :
    """Accessor for a sorted, indexed promoter-barcode dictionary (pbd).

    The dictionary is the text file '<hpip_root>/data/pbd/pbd.txt' and
    its index is the binary file '<hpip_root>/data/pbd/pbd.txt.idx'.
    Both files are opened once, when the object is created, and stay
    open until `close` is called (explicitly, on leaving a `with` block,
    or when the object is destroyed).

    The index is a sequence of 8-byte records packed with the native 'Q'
    struct format. Its last record is read as N, the number of lines of
    the dictionary.
    NOTE(review): the search assumes that records 0..N-1 of the index
    locate the lines of the dictionary -- confirm against the code that
    writes the pbd index.

    Attributes
    ----------
    pbd : file object (None if it could not be opened)
        The dictionary, opened in text mode.
    idx : file object (None if it could not be opened)
        The index, opened in binary mode.
    N : int
        Value of the last record of the index.
    """

    # size in bytes of one index record
    RECORD_SIZE = struct.calcsize('Q')

    # number of leading characters of a dictionary line that are
    # compared with the barcode that is being searched
    BCD_LENGTH = 20

    def __init__(self, hpip_root) :
        """Open the dictionary and its index found under `hpip_root`.

        Raises whatever `open`, `seek` or `struct.unpack` raise if a
        file is missing or if the index is shorter than one record; in
        that case any file that was already opened is closed first.
        """
        # set input file names
        pbd_fname = '%s/data/pbd/pbd.txt'%(hpip_root)
        pbd_idx = '%s.idx'%(pbd_fname)

        # define both handles before opening anything, so that close()
        # and __del__ are safe on a partially built object
        self.pbd = None
        self.idx = None
        try :
            # open the index and the pbd file
            self.pbd = open(pbd_fname, 'r')
            self.idx = open(pbd_idx, 'rb')

            # get the number of lines in the pbd by looking
            # at the last value stored in the index
            self.idx.seek(-self.RECORD_SIZE, os.SEEK_END)
            self.N = struct.unpack('Q', self.idx.read(self.RECORD_SIZE))[0]
        except Exception :
            # do not leak the handle(s) opened before the failure
            self.close()
            raise

    def read_nth_line(self, n) :
        """Return the dictionary line located by the n-th index record.

        Returns None if the index has no complete n-th record (0-based).
        """
        # seek the index at position 'n*RECORD_SIZE', because the
        # information is stored in chunks of that many bytes
        self.idx.seek(n*self.RECORD_SIZE, os.SEEK_SET)
        i_binary = self.idx.read(self.RECORD_SIZE)

        # A short read means that we ran past the end of the index.
        # The length is tested (rather than comparing with '') so that
        # the check also holds when read() returns a bytes object, and
        # so that a truncated record is not handed to struct.unpack.
        if len(i_binary) < self.RECORD_SIZE :
            return None

        # convert the record to a python integer: the offset into the
        # dictionary file
        i = struct.unpack('Q', i_binary)[0]

        # jump to that offset in the dictionary and read one line
        self.pbd.seek(i, os.SEEK_SET)
        return self.pbd.readline()
    
    def findbcd(self, bcd) :
        """Binary-search the dictionary for the barcode `bcd`.

        Returns the full line whose first BCD_LENGTH characters equal
        `bcd`, or None if there is no such line.
        """
        # Iterative search over the closed interval [range_lo, range_hi].
        # Both bounds are inclusive and are moved past 'mid' at every
        # step, so that every position, including the first (0) and the
        # last (N-1), can be probed.
        range_lo = 0
        range_hi = self.N-1
        while range_lo <= range_hi :
            mid = (range_hi+range_lo)//2
            line = self.read_nth_line(mid)

            # the index has fewer records than N claims: everything
            # from 'mid' on is past the end, keep searching below
            if line is None :
                range_hi = mid-1
                continue

            this_bcd = line[:self.BCD_LENGTH]
            if this_bcd < bcd :
                range_lo = mid+1
            elif this_bcd > bcd :
                range_hi = mid-1
            else :
                return line

        # if we are here, then the barcode was not found
        return None

    def close(self) :
        """Close both files. Calling it more than once is harmless."""
        for name in ('pbd', 'idx') :
            # getattr with a default: the attribute may not exist if
            # the object was never initialized
            handle = getattr(self, name, None)
            if handle is not None :
                handle.close()

    def __enter__(self) :
        # allows 'with PBD(root) as pbd : ...'
        return self

    def __exit__(self, exc_type, exc_value, traceback) :
        self.close()
        # returning False lets an exception raised in the block propagate
        return False

    def __del__(self) :
        self.close()

In [None]:
# look up the same barcode through the object-oriented interface.
# print() with a single argument behaves the same on Python 2 and 3.
pbd = PBD(hpip_root)
bcd = 'TTTTTTTTTTTTTTTTTTGG'
print(pbd.findbcd(bcd))

This is much faster than its function-based counterpart.