In [None]:
# Convolutional RBM (cRBM)

This notebook takes care of implementing the basic functionality for cRBMs.
Or maybe it's just for the preliminaries, that is some simple stuff before it actually comes to the Boltzmann Machine.


## Part 1: Initializing the weight matrices and applying them to sequences

In [12]:
#import theano
import numpy as np
import Bio.SeqIO as sio
import Bio.motifs.matrix as mat
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio import motifs

import random
import time

In [23]:
class FASTAReader:
    
    def __init__(self, _path):
        self.path = _path
        
    def readSequencesFromFile (self, filename):
        dhsSequences = []
        for dhs in sio.parse(open(filename), 'fasta', IUPAC.unambiguous_dna):
            dhsSequences.append(dhs)
        return dhsSequences
    
    
class JASPARReader:
    
    def __init__ (self):
        pass
    
    def readSequencesFromFile (self, filename):
        matrices = []
        for mat in motifs.parse(open(filename), 'jaspar'):
            matrices.append(mat.pwm)
        return matrices

In [25]:
matReader = JASPARReader()
pwms = matReader.readSequencesFromFile('../data/jaspar_matrices.txt')

<type 'list'>
<class 'Bio.motifs.matrix.PositionWeightMatrix'>


In [26]:
# apply the two classes to calculate a forward pass in our algorithm
seqReader = FASTAReader('.')
allSeqs = seqReader.readSequencesFromFile('../data/wgEncodeAwgDnaseUwAg10803UniPk.fa')


### The implementation of our convRBM so far

In [420]:
class ConvRBM:
    
    def __init__ (self, _motifLength, _numMotifs, _poolingFactor=1, _alphabet=IUPAC.unambiguous_dna):
        # parameters for the motifs
        self.motifLength = _motifLength
        self.numMotifs = _numMotifs
        self.motifs = []
        self.alphabet = _alphabet
        self.poolingFactor = _poolingFactor
        
        # cRBM parameters
        self.bias = np.random.rand(self.numMotifs)
        self.c = random.random()
        
    def initializePWMs (self):
        # set up PWMs
        for m in range(self.numMotifs):
            self.motifs.append(self._createRandomMotif(self.motifLength, self.alphabet))
        
    def forwardPass (self, seq):
        
        # check that we actually have some motifs to do convolution on
        if self.motifs == []:
            print 'Error: No motifs created so far. Try executing initializePWMs before!'
            return
        if (len(seq)-self.motifLength+1) % self.poolingFactor != 0:
            print 'Dimension mismatch: cannot create pooling layer because it would not fit!'

        # perform convolution of motif and sequence (that is, apply the motif to the sequence)
        hiddenActivation = np.zeros((2*self.numMotifs, len(seq)-self.motifLength+1))
        motifCount = 0
        for motif in self.motifs:
            pssm = motif.log_odds()
            
            # apply convolution on both strands
            hiddenActivation[motifCount*2,:] = pssm.calculate(seq) + self.bias[motifCount]
            hiddenActivation[motifCount*2+1,:] = pssm.calculate(seq.reverse_complement()) + self.bias[motifCount]
            hiddenActivation[motifCount*2:motifCount*2+2,:] = self._probMaxPooling(hiddenActivation[motifCount*2:motifCount*2+2,:])
            motifCount += 1

        return hiddenActivation
    
    def backwardPass (self, hiddenActivation):
        
        # apply convolution on all of the filters
        restoredLength = hiddenActivation.shape[1] + self.motifLength - 1
        numOfLetters = len(self.alphabet.letters)
        reConv = np.zeros((numOfLetters, restoredLength))

        #start = time.time()
        for k in range(len(self.motifs)):
            # apply convolution on each of the channels (A, C, G, T) seperately
            reConv = np.zeros((numOfLetters, restoredLength))
            matrix = self._convertPWM2Array(self.motifs[k])
            for i in range(numOfLetters):
                conv1 = np.convolve(hiddenActivation[k], matrix[i])
                conv2 = np.convolve(hiddenActivation[k+1], matrix[i])
                reConv[i,:] = reConv[i,:] + conv1 + conv2 + self.c

        #convTime = time.time()
        # perform softmax and select index of the most promising one (results in visibleActivation)
        visibleActivation = np.zeros((1, restoredLength))
        
        # calculate exp for whole matrix
        reConv = np.exp(reConv)
        
        # calculate the sum (over all four letters for whole sequence)
        sums = np.sum(reConv, 0)

        # divide by the sum
        for i in range(numOfLetters):
            reConv[i,:] = reConv[i,:] / sums[i]
            
        # and the maximum is our letter...
        visibleActivation = np.argmax(reConv, 0)
        
        #print "Done with Softmax in: " + str((time.time()-convTime)*1000)
        # convert the resulting sequence to actual letters (A, C, G, T instead of 0, 1, 2, 3)
        return self._getDNASeqFromNumericals(visibleActivation)

    def _probMaxPooling (self, h_k):

        # first of all some easy definitions
        l = h_k.shape[1]
        numOfGroups = l/self.poolingFactor
        P = np.zeros((2, l))

        # exponent of everything
        ex = np.exp(h_k)
        
        # reshape s.t. each group forms one row
        newDim = (numOfGroups, -1)
        reordered = np.append(ex[0].reshape(newDim), ex[1].reshape(newDim), 1)
        #print "Shape of reordered: " + str(reordered.shape)
        # calculate denominators (sum of all rows)
        denoms = np.sum(reordered, 1) + 1 # denoms for all groups (add 1 to have log. unit)
        res = np.argmax(reordered.T / denoms, 0)

        # calculate the actual values of the pooling layer P
        for group in range(numOfGroups):
            if reordered[group,res[group]] > 1: # check if really any element from P should be active
                # we don't care about strands so just set res = res/2 for the index
                idx = group * self.poolingFactor + int(res[group]/2)
                P[res[group] % 2, idx] = reordered[group,res[group]]
        return P
        
    def _createRandomMotif (self, motifLength, alphabet):
        counts = {}
        for letter in alphabet.letters:
            counts[letter] = [random.randint(0,100) for x in xrange(motifLength)]
        return mat.PositionWeightMatrix(alphabet, counts)
        
    def _sigmoid (self, x):
        return 1.0 / (1.0 + np.exp(-x))

    def _softmaxActivation (self, col, idx):
        p_all = np.sum(np.exp(col))
        p_x = np.exp(col[idx])
        return p_x / p_all

    def _getLetterToInt (self, num):
        if num == 0:
            return 'A'
        elif num == 1:
            return 'C'
        elif num == 2:
            return 'G'
        elif num == 3:
            return 'T'
        else:
            print 'ERROR: Num ' + str(num) + " not a valid char in DNA alphabet"
            return -1

    def _getDNASeqFromNumericals (self, seq):
        dna_seq = []
        for num in range(seq.shape[0]):
            dna_seq.append(self._getLetterToInt(seq[num]))
        return dna_seq

    def _convertPWM2Array (self, pwm):
        result = np.zeros((len(self.alphabet.letters), self.motifLength))
        for letter in range(len(pwm)):
            result[letter] = pwm[self._getLetterToInt(letter)]
        return result

### Test with a simple test sequence to verify that forward and backward pass work well

In [421]:
learner = ConvRBM(4, 2)
learner.bias = np.zeros(2)
learner.c = 0
# create PWM (that will look for a sequence of 4 Gs)
counts = {}
for letter in learner.alphabet.letters:
    if letter != 'G':
        counts[letter] = [0 for x in xrange(learner.motifLength)]
    else:
        counts[letter] = [1 for x in xrange(learner.motifLength)]
kernel = mat.PositionWeightMatrix(learner.alphabet, counts)
learner.motifs.append(kernel)

# create second kernel that will look for ACGT
counts = {}
for letter in range(len(learner.alphabet.letters)):
    counts[learner._getLetterToInt(letter)] = [int(x == letter) for x in xrange(learner.motifLength)]

kernel = mat.PositionWeightMatrix(learner.alphabet, counts)
learner.motifs.append(kernel)

# set the cRBMs other params to 0
learner.bias = np.zeros(learner.numMotifs)
learner.c = 0

# now test forward pass on a simple sequence
testSeq = Seq("ACGTGGGG", learner.alphabet)
h = learner.forwardPass(testSeq)
print "Hidden Layer:"
print h

maxPooled = learner._probMaxPooling(h[:2])
reconstructed = learner.backwardPass(h)
print "Reconstruction:"
print reconstructed

# Should be able to completely reconstruct the sequence because we gave two kernels
# which both can be hold responsible for a portion of the sequence.

Hidden Layer:
[[    0.             0.             0.             0.          2980.95798704]
 [    0.             0.             0.             0.             0.        ]
 [ 2980.95798704     0.             0.             0.             0.        ]
 [    0.             0.             0.             0.          2980.95798704]]
Reconstruction:
['A', 'C', 'G', 'T', 'A', 'A', 'A', 'A']




### Simple Speed Test

In [414]:
x = ConvRBM(4, 10, 1)
x.initializePWMs()

testSeq = allSeqs[7190]
startForward = time.time()
h = x.forwardPass(testSeq.seq)
print "Time for forward: " + str((time.time()-startForward)*1000)
startBackward = time.time()
x.backwardPass(h)
print "Time for backward: " + str((time.time()-startBackward)*1000)

Time for forward: 3.92293930054
Time for backward: 1.24907493591


### Use the JASPAR data and see whether the sequences can be reproduced

In [415]:
l = 6
numMats = len(pwms)
avgLength = np.mean([len(x[0]) for x in pwms])
matsOfSpecificLength = [x for x in pwms if len(x[0]) == l]
avgRedLength = np.mean([len(x[0]) for x in matsOfSpecificLength])
print "Total number of JASPAR matrices: " + str(numMats)
print "Average motif length (k-mer length): " + str(avgLength)
print
print "Number of motifs with length " + str(l) + " : " + str(len(matsOfSpecificLength))
print "Verfication: " + str(avgRedLength)

cRBM = ConvRBM(11, len(matsOfSpecificLength))
cRBM.bias = np.zeros(len(matsOfSpecificLength))
cRBM.c = 0
# insert our pwms
cRBM.motifs = matsOfSpecificLength

# perform forward and backward pass
correct = []
errors = []
times = []
for i in range(1000):
    testSeq = allSeqs[random.randrange(0, len(allSeqs))].seq
    start = time.time()
    hiddenActivation = learner.forwardPass(testSeq)
    restored = learner.backwardPass(hiddenActivation)
    times.append(time.time()-start)
    
    # count the differences between the two sequences
    differences = 0
    for elem in range(len(string)):
        if string[elem] != testSeq[elem]:
            differences += 1

    #print "Correct: " + str(len(string)-differences)
    #print "Errors: " + str(differences)
    correct.append(len(string)-differences)
    errors.append(differences)
    
print "average correct: " + str(np.mean(correct))
print "average error: " + str(np.mean(errors))
print "var of error: " + str(np.var(errors))
print "average time for forward and backward pass (in ms): " + str(np.mean(times)*1000)

Total number of JASPAR matrices: 593
Average motif length (k-mer length): 10.7993254637

Number of motifs with length 6 : 37
Verfication: 6.0
average correct: 29.28
average error: 120.72
var of error: 208.7816
average time for forward and backward pass (in ms): 0.840673446655




### Verify that the forward and backward pass do anything meaningful

In [416]:
learner = ConvRBM(6, 1)
learner.initializePWMs()

hiddenActivation = learner.forwardPass(allSeqs[246].seq)
restoredLength = hiddenActivation.shape[1] + learner.motifLength - 1
reConv = np.zeros((len(learner.alphabet.letters), restoredLength))
matrix = learner._convertPWM2Array(learner.motifs[0])
print matrix
for i in range(len(learner.alphabet.letters)):
    reConv[i,:] = np.convolve(hiddenActivation[0], matrix[i])
    
#print reConv

def softmaxActivation(col, idx):
    p_all = np.sum(np.exp(col))
    p_x = np.exp(col[idx])
    return p_x / p_all

visibleActivation = np.zeros((1, 150))
for i in range(150):
    visibleActivation[0,i] = np.argmax([softmaxActivation(reConv[:,i], x) for x in range(4)])
    
def convertNumericalToLetter(seq):
    dna_seq = []
    for num in range(seq.shape[1]):
        dna_seq.append(learner._getLetterToInt(seq[0,num]))
    return dna_seq

print "Restored:"
print convertNumericalToLetter(visibleActivation)[:20]
print
print "Original:"
print allSeqs[246].seq[:20]

print "Now the real implementation:"
print "Restored:"
print learner.backwardPass(hiddenActivation)[:20]

[[ 0.39320388  0.10822511  0.09146341  0.30697674  0.3627907   0.1969697 ]
 [ 0.04368932  0.35497835  0.40243902  0.22790698  0.25116279  0.37373737]
 [ 0.29126214  0.2987013   0.29878049  0.43255814  0.24651163  0.36868687]
 [ 0.27184466  0.23809524  0.20731707  0.03255814  0.13953488  0.06060606]]
Restored:
['A', 'C', 'C', 'A', 'A', 'C', 'C', 'G', 'G', 'C', 'G', 'G', 'C', 'G', 'A', 'C', 'C', 'G', 'C', 'C']

Original:
TTTATCCTGCAGCTCGCCTG
Now the real implementation:
Restored:
['A', 'C', 'C', 'A', 'A', 'C', 'C', 'G', 'G', 'C', 'G', 'G', 'C', 'G', 'A', 'C', 'C', 'G', 'C', 'C']


### Test the forward pass on the whole set of sequences

In [417]:
start = time.time()
i = 0
lengthes = []
someScores = []
for seq in allSeqs:
    convoluted = learner.forwardPass(seq.seq)
    lengthes.append(len(seq))
    if i % 5000 == 0:
        someScores.append(convoluted[0][random.randint(0, len(convoluted))])
        print str(i) + " -> " + str(someScores[-1])
    i += 1
    
print
print
print "Number of filters: " + str(learner.numMotifs)
print "Number of DHSs: " + str(i)
print "Average Length of Sequences: " + str(np.array(lengthes).mean())
print "Execution Time: " + str(time.time()-start)

0 -> 1.97112977416
5000 -> 0.0
10000 -> 0.0
15000 -> 0.0
20000 -> 0.0
25000 -> 0.0
30000 -> 3.55622096338
35000 -> 1.45209852838
40000 -> 0.0
45000 -> 0.0
50000 -> 29.8636840454
55000 -> 0.0
60000 -> 1.19763432774
65000 -> 0.0
70000 -> 10.3491168534
75000 -> 7.82986486681
80000 -> 9.85699920243
85000 -> 3.7478067675
90000 -> 0.0
95000 -> 1.76197531167
100000 -> 0.0
105000 -> 20.3408063244
110000 -> 4.61055759714
115000 -> 9.64909205092
120000 -> 2.14293706454
125000 -> 0.0
130000 -> 1.22137467903
135000 -> 0.0
140000 -> 0.0
145000 -> 0.0
150000 -> 1.53264213495
155000 -> 0.0
160000 -> 10.960086437
165000 -> 11.8790462602
170000 -> 7.42613369813


Number of filters: 1
Number of DHSs: 171275
Average Length of Sequences: 150.0
Execution Time: 67.5745389462


### Test both passes on all sequences using parallelization (just CPU)

In [11]:
from multiprocessing.pool import Pool

def calculatePassesForSeqs(seqs):
    print "Started thread with " + str(len(seqs)) + " Sequences!"
    lengthes = []
    for seq in seqs:
        hiddenActivation = learner.forwardPass(seq.seq)
        reconstruction = learner.backwardPass(hiddenActivation)
        lengthes.append(len(seq))
    return np.mean(lengthes)


cpu_count = 4
print "AVAILABLE CPUs: " + str(cpu_count)
sizePerCPU = len(allSeqs) / cpu_count
p = Pool(processes = cpu_count)
sublists = []
for i in range(cpu_count):
    if not i == cpu_count-1:
        sublists.append(allSeqs[i*sizePerCPU:(i+1)*sizePerCPU])
    else:
        sublists.append(allSeqs[i*sizePerCPU:])
start = time.time()
result = p.map(calculatePassesForSeqs, sublists)
print result
print
print
print "Number of filters: " + str(learner.numMotifs)
print "Number of DHSs: " + str(len(allSeqs))
print "Execution Time: " + str(time.time()-start)

p.close()

AVAILABLE CPUs: 4


OSError: [Errno 12] Cannot allocate memory

In [10]:
p.close()

NameError: name 'p' is not defined

Some tests to learn how to do things with Biopython and Theano
===

### Do all DHS sequences have the same length by default?


In [51]:
fasta_seqs = sio.parse(open('../data/wgEncodeAwgDnaseUwAg10803UniPk.fa'), 'fasta')
count = 0
countNotSameLength = 0
for seq in fasta_seqs:
    if len(seq) != 150:
        print 'not length 150'
        countNotSameLength += 1
    count += 1

print 'Number of sequences: ' + str(count)
print 'Number of seqs with length != 150: ' + str(countNotSameLength)

Number of sequences: 171275
Number of seqs with length != 150: 0


### How do we generate random motif matrices (PWMs or PSSMs)

In [52]:
import Bio.NeuralNetwork.Gene.Schema as schema
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

In [145]:
alphabet = IUPAC.unambiguous_dna
generator = schema.RandomMotifGenerator(alphabet, 6, 10)
for i in range(3):
    x = generator.random_motif()
    print x

CTGCGAT
CGTGACCA
GTACACCGA


In [54]:
from Bio import motifs

In [203]:
len(alphabet.letters)

4

In [56]:
import Bio.motifs.matrix as mat
import random

In [64]:
def createRandomMotif (motifLength, alphabet):
    counts = {}
    for letter in alphabet.letters:
        counts[letter] = [random.randint(0,100) for x in xrange(motifLength)]
    return mat.PositionWeightMatrix(alphabet, counts)
#x = mat.PositionWeightMatrix(alphabet, counts)

In [207]:
x = createRandomMotif(10, alphabet)
print x

        0      1      2      3      4      5      6      7      8      9
A:   0.24   0.18   0.19   0.25   0.41   0.47   0.09   0.07   0.21   0.29
C:   0.28   0.26   0.16   0.42   0.13   0.24   0.39   0.49   0.16   0.38
G:   0.20   0.24   0.22   0.00   0.14   0.15   0.46   0.43   0.63   0.18
T:   0.29   0.32   0.43   0.32   0.32   0.14   0.06   0.02   0.00   0.14



In [224]:
y = np.zeros((len(alphabet.letters), 10))

def getLetterToInt (num):
    if num == 0:
        return 'A'
    elif num == 1:
        return 'C'
    elif num == 2:
        return 'G'
    elif num == 3:
        return 'T'
    else:
        print 'ERROR: Num ' + str(num) + " not a valid char in DNA alphabet"
        return -1


for letter in range(len(x)):
    print letter
    y[letter] = x[getLetterToInt(letter)]
    letterCount += 1

np.set_printoptions(precision=2)
y

0
1
2
3


array([[ 0.24,  0.18,  0.19,  0.25,  0.41,  0.47,  0.09,  0.07,  0.21,
         0.29],
       [ 0.28,  0.26,  0.16,  0.42,  0.13,  0.24,  0.39,  0.49,  0.16,
         0.38],
       [ 0.2 ,  0.24,  0.22,  0.  ,  0.14,  0.15,  0.46,  0.43,  0.63,
         0.18],
       [ 0.29,  0.32,  0.43,  0.32,  0.32,  0.14,  0.06,  0.02,  0.  ,
         0.14]])

### Are the elements of a PWM interpretable as probabilites?

In [144]:
# verify that we're dealing with probabilities by summing up over all letters for each position
for pos in range(x.length):
    c = 0
    for letter in alphabet.letters:
        c += x[letter][pos]
    print str(pos) + " -> " + str(c)

0 -> 1.0
1 -> 1.0
2 -> 1.0
3 -> 1.0
4 -> 1.0
5 -> 1.0
6 -> 1.0
7 -> 1.0
8 -> 1.0
9 -> 1.0
