# Convolutional RBM (cRBM)

This notebook implements the basic functionality for convolutional restricted Boltzmann machines (cRBMs).
At this stage it mainly covers the preliminaries, that is, the simple building blocks that are needed before we get to the actual Boltzmann machine.


## Part 1: Initializing the weight matrices and applying them to sequences

In [244]:
#import theano
import numpy as np
import Bio.SeqIO as sio
import Bio.motifs.matrix as mat
from Bio.Alphabet import IUPAC

import random
import time

In [245]:
class FASTAReader:
    """Reads sequence records from FASTA files.

    The `_path` given to the constructor is stored as `self.path`, but
    `readSequencesFromFile` does not consult it: it opens exactly the
    filename it is handed.
    """

    def __init__(self, _path):
        self.path = _path

    def readSequencesFromFile (self, filename):
        """Parse `filename` as FASTA and return all of its records as a list.

        Records are parsed with the IUPAC unambiguous DNA alphabet.
        Errors from `open` or from the parser propagate to the caller.
        """
        # The context manager closes the file even when parsing fails. The
        # records are collected into a list *inside* the block because the
        # parser reads from the handle while it is being iterated.
        with open(filename) as fastaFile:
            return list(sio.parse(fastaFile, 'fasta', IUPAC.unambiguous_dna))
    

In [246]:
# Load every DHS sequence record from the FASTA file into memory; later cells index into allSeqs.
# NOTE(review): the '.' handed to FASTAReader is only stored as `path`; the file that is
# actually read is the explicit filename passed to readSequencesFromFile.
seqReader = FASTAReader('.')
allSeqs = seqReader.readSequencesFromFile('../data/wgEncodeAwgDnaseUwAg10803UniPk.fa')


In [282]:
class ConvRBM:
    """Minimal convolutional RBM over DNA sequences with random PWM filters.

    Every motif is a Biopython PositionWeightMatrix with `motifLength`
    columns. `forwardPass` turns a sequence into hidden activations and
    `backwardPass` turns hidden activations back into a letter sequence.

    Whenever a PWM or a reconstruction is stored as an array, row k belongs
    to the letter returned by `_getLetterToInt(k)` (A, C, G, T).
    """

    def __init__ (self, _motifLength, _numMotifs, _alphabet=IUPAC.unambiguous_dna):
        """Store the motif settings and draw random cRBM parameters.

        No motifs exist after construction; call `initializePWMs` first.
        """
        # parameters for the motifs
        self.motifLength = _motifLength
        self.numMotifs = _numMotifs
        self.motifs = []
        self.alphabet = _alphabet
        print(self.numMotifs)

        # cRBM parameters: one hidden bias per motif, one scalar visible bias
        self.bias = np.random.rand(self.numMotifs)
        self.c = random.random()

    def initializePWMs (self):
        """Create `numMotifs` random PWMs, replacing any existing ones.

        The list is rebuilt rather than appended to, so calling this method
        more than once can never leave more motifs than `numMotifs` (which
        would overflow the activation array in `forwardPass`).
        """
        self.motifs = [self._createRandomMotif(self.motifLength, self.alphabet)
                       for _ in range(self.numMotifs)]

    def forwardPass (self, seq):
        """Return the hidden activations for `seq`.

        The result has shape (numMotifs, len(seq) - motifLength + 1); row k
        is the sigmoid of motif k's log-odds scores along the sequence plus
        that motif's bias. If no motifs exist yet, an error is printed and
        None is returned.
        """
        if not self.motifs:
            print('Error: No motifs created so far. Try executing initializePWMs before!')
            return

        hiddenActivation = np.zeros((self.numMotifs, len(seq)-self.motifLength+1))
        for motifCount, motif in enumerate(self.motifs):
            pssm = motif.log_odds()
            hiddenActivation[motifCount,:] = self._sigmoid(pssm.calculate(seq) + self.bias[motifCount])

        return hiddenActivation

    def backwardPass (self, hiddenActivation):
        """Reconstruct a letter sequence from `hiddenActivation`.

        `hiddenActivation` is indexed as (motif, position). The result is a
        list of single-letter strings of length
        hiddenActivation.shape[1] + motifLength - 1. If no motifs exist yet,
        an error is printed and None is returned (same as `forwardPass`).
        """
        if not self.motifs:
            print('Error: No motifs created so far. Try executing initializePWMs before!')
            return

        # apply convolution on each of the channels (A, C, G, T) separately
        # and add up the contributions of all motifs; with a single motif
        # this is exactly the convolution with that motif
        restoredLength = hiddenActivation.shape[1] + self.motifLength - 1
        numOfLetters = len(self.alphabet.letters)
        reConv = np.zeros((numOfLetters, restoredLength))
        for hiddenRow, motif in zip(hiddenActivation, self.motifs):
            matrix = self._convertPWM2Array(motif)
            for i in range(numOfLetters):
                reConv[i,:] += np.convolve(hiddenRow, matrix[i])
        reConv += self.c

        # perform softmax and select index of the most promising one (results in visibleActivation);
        # the buffer is sized from the input instead of a fixed 150 positions
        visibleActivation = np.zeros((1, restoredLength))
        for i in range(restoredLength):
            visibleActivation[0,i] = np.argmax([self._softmaxActivation(reConv[:,i], x) for x in range(numOfLetters)])

        # convert the resulting sequence to actual letters (A, C, G, T instead of 0, 1, 2, 3)
        return self._getDNASeqFromNumericals(visibleActivation)


    def _createRandomMotif (self, motifLength, alphabet):
        """Return a PositionWeightMatrix built from random counts in [0, 100]."""
        counts = {}
        for letter in alphabet.letters:
            counts[letter] = [random.randint(0,100) for _ in range(motifLength)]
        return mat.PositionWeightMatrix(alphabet, counts)

    def _sigmoid (self, x):
        """Element-wise logistic function."""
        return 1.0 / (1.0 + np.exp(-x))

    def _softmaxActivation (self, col, idx):
        """Return the softmax probability of entry `idx` of `col`.

        The values are shifted by their maximum before exponentiation; this
        leaves the result unchanged mathematically but keeps np.exp from
        overflowing for large inputs.
        """
        weights = np.exp(np.asarray(col, dtype=float) - np.max(col))
        return weights[idx] / np.sum(weights)

    def _getLetterToInt (self, num):
        """Map a numeric index to its letter: 0, 1, 2, 3 -> 'A', 'C', 'G', 'T'.

        Despite the name, the direction is index -> letter. For any other
        value an error is printed and -1 is returned.
        """
        for code, base in enumerate('ACGT'):
            if num == code:
                return base
        print('ERROR: Num ' + str(num) + " not a valid char in DNA alphabet")
        return -1

    def _getDNASeqFromNumericals (self, seq):
        """Translate row 0 of the 2-D index array `seq` into a list of letters."""
        return [self._getLetterToInt(seq[0,num]) for num in range(seq.shape[1])]

    def _convertPWM2Array (self, pwm):
        """Copy `pwm` into an array with one row per letter.

        Uses the instance's own alphabet and letter mapping, so the method
        does not depend on notebook-level globals being defined.
        """
        result = np.zeros((len(self.alphabet.letters), self.motifLength))
        for letter in range(len(pwm)):
            result[letter] = pwm[self._getLetterToInt(letter)]
        return result

In [283]:
# Build a learner with a single motif of length 6 and give it a random PWM.
learner = ConvRBM(6, 1)
learner.initializePWMs()

# Compute the hidden activations for one example sequence, then redo the
# reconstruction step by step: one convolution of the hidden row with each
# letter row of the PWM. Unlike ConvRBM.backwardPass, no bias `c` is added here.
hiddenActivation = learner.forwardPass(allSeqs[246].seq)
restoredLength = hiddenActivation.shape[1] + learner.motifLength - 1
reConv = np.zeros((len(learner.alphabet.letters), restoredLength))
matrix = learner._convertPWM2Array(learner.motifs[0])
print matrix
for i in range(len(learner.alphabet.letters)):
    reConv[i,:] = np.convolve(hiddenActivation[0], matrix[i])
    
#print reConv

def softmaxActivation(col, idx):
    """Return the softmax probability of entry `idx` of the 1-D values `col`.

    The values are shifted by their maximum before exponentiation. This does
    not change the result mathematically, but it keeps np.exp from
    overflowing (inf / inf = nan) when the inputs are large.
    """
    weights = np.exp(np.asarray(col, dtype=float) - np.max(col))
    return weights[idx] / np.sum(weights)

# For every position pick the letter index with the highest softmax
# probability. Both sizes are read from reConv itself instead of the
# literals 150 and 4, so the cell keeps working for other sequence lengths
# and alphabets.
numLetters, seqLength = reConv.shape
visibleActivation = np.zeros((1, seqLength))
for i in range(seqLength):
    visibleActivation[0,i] = np.argmax([softmaxActivation(reConv[:,i], x) for x in range(numLetters)])
    
def convertNumericalToLetter(seq):
    """Translate row 0 of the 2-D index array `seq` into a list of letters.

    Each entry is mapped through the notebook-level `learner`'s
    `_getLetterToInt` helper.
    """
    return [learner._getLetterToInt(seq[0, column]) for column in range(seq.shape[1])]

# Show the first 20 positions of the manual reconstruction next to the original sequence.
print "Restored:"
print convertNumericalToLetter(visibleActivation)[:20]
print
print "Original:"
print allSeqs[246].seq[:20]

# The same reconstruction, this time computed by ConvRBM.backwardPass.
print "Now the real implementation:"
print "Restored:"
print learner.backwardPass(hiddenActivation)[:20]

1
[[ 0.18  0.27  0.38  0.13  0.24  0.28]
 [ 0.44  0.17  0.22  0.55  0.2   0.05]
 [ 0.08  0.27  0.37  0.3   0.22  0.29]
 [ 0.31  0.29  0.03  0.02  0.34  0.38]]
Restored:
['C', 'C', 'C', 'C', 'A', 'C', 'C', 'T', 'A', 'C', 'A', 'G', 'C', 'T', 'T', 'G', 'C', 'C', 'T', 'G']

Original:
TTTATCCTGCAGCTCGCCTG
Now the real implementation:
Restored:
['C', 'C', 'C', 'C', 'A', 'C', 'C', 'T', 'A', 'C', 'A', 'G', 'C', 'T', 'T', 'G', 'C', 'C', 'T', 'G']


In [190]:
# Time a forward pass over every sequence and sample a few activations along the way.
# NOTE(review): this cell uses whatever `learner` is currently in the kernel; the
# saved output below reports 5 filters, so it was run with a five-motif learner.
start = time.time()
lengthes = []
someScores = []
for i, seq in enumerate(allSeqs):
    convoluted = learner.forwardPass(seq.seq)
    lengthes.append(len(seq))
    if i % 5000 == 0:
        # Sample a random position from the first motif's activation row.
        # The bound is the row length (shape[1]), not the number of motifs
        # (len(convoluted)), and randrange excludes the upper bound, so the
        # index is always valid.
        someScores.append(convoluted[0][random.randrange(convoluted.shape[1])])
        print(str(i) + " -> " + str(someScores[-1]))

numSeqs = len(lengthes)

print('')
print('')
print("Number of filters: " + str(learner.numMotifs))
print("Number of DHSs: " + str(numSeqs))
print("Average Length of Sequences: " + str(np.array(lengthes).mean()))
print("Execution Time: " + str(time.time()-start))

5
0 -> 0.906600534916
5000 -> 0.208888828754
10000 -> 0.0
15000 -> 0.0
20000 -> 0.0
25000 -> 0.641809999943
30000 -> 0.0
35000 -> 0.658836007118
40000 -> 0.57325977087
45000 -> 0.582175374031
50000 -> 0.0
55000 -> 0.551681339741
60000 -> 0.910535812378
65000 -> 0.624538183212
70000 -> 0.606583654881
75000 -> 0.767180681229
80000 -> 0.271857708693
85000 -> 0.474154680967
90000 -> 0.0
95000 -> 0.48353984952
100000 -> 0.0
105000 -> 0.291799157858
110000 -> 0.591895580292
115000 -> 0.0
120000 -> 0.952310502529
125000 -> 0.181974470615
130000 -> 0.684669852257
135000 -> 0.493779927492
140000 -> 0.771480441093
145000 -> 0.135087832808
150000 -> 0.0
155000 -> 0.0830420479178
160000 -> 0.214970812201
165000 -> 0.0
170000 -> 0.785442829132


Number of filters: 5
Number of DHSs: 171275
Average Length of Sequences: 150.0
Execution Time: 125.519943953


Some tests to learn how to do things with Biopython and Theano
===

### Do all DHS sequences have the same length by default?


In [51]:
# Count the sequences in the FASTA file and how many of them deviate from the expected length.
expectedLength = 150
count = 0
countNotSameLength = 0
# The context manager closes the file once all records have been read.
with open('../data/wgEncodeAwgDnaseUwAg10803UniPk.fa') as fastaFile:
    for seq in sio.parse(fastaFile, 'fasta'):
        if len(seq) != expectedLength:
            print('not length ' + str(expectedLength))
            countNotSameLength += 1
        count += 1

print('Number of sequences: ' + str(count))
print('Number of seqs with length != ' + str(expectedLength) + ': ' + str(countNotSameLength))

Number of sequences: 171275
Number of seqs with length != 150: 0


In [52]:
import Bio.NeuralNetwork.Gene.Schema as schema
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

### How do we generate random motif matrices (PWMs or PSSMs)?

In [145]:
# Draw three random motifs from Biopython's RandomMotifGenerator and print them.
# `alphabet` is reused by later cells in this notebook.
# NOTE(review): 6 and 10 look like the minimum and maximum motif length — confirm against the Biopython docs.
alphabet = IUPAC.unambiguous_dna
generator = schema.RandomMotifGenerator(alphabet, 6, 10)
for i in range(3):
    x = generator.random_motif()
    print x

CTGCGAT
CGTGACCA
GTACACCGA


In [54]:
from Bio import motifs

In [203]:
# Number of letters in the alphabet; this is the row count used for the PWM arrays in this notebook.
len(alphabet.letters)

4

In [56]:
import Bio.motifs.matrix as mat
import random

In [64]:
def createRandomMotif (motifLength, alphabet):
    """Build a PositionWeightMatrix from random counts.

    For every letter of `alphabet`, `motifLength` integer counts are drawn
    uniformly from 0 to 100 (both inclusive) and handed to Biopython's
    PositionWeightMatrix constructor.
    """
    randomCounts = {
        symbol: [random.randint(0, 100) for _ in range(motifLength)]
        for symbol in alphabet.letters
    }
    return mat.PositionWeightMatrix(alphabet, randomCounts)
#x = mat.PositionWeightMatrix(alphabet, counts)

In [207]:
# Draw one random motif with 10 positions and print it (one row per letter, one column per position).
x = createRandomMotif(10, alphabet)
print x

        0      1      2      3      4      5      6      7      8      9
A:   0.24   0.18   0.19   0.25   0.41   0.47   0.09   0.07   0.21   0.29
C:   0.28   0.26   0.16   0.42   0.13   0.24   0.39   0.49   0.16   0.38
G:   0.20   0.24   0.22   0.00   0.14   0.15   0.46   0.43   0.63   0.18
T:   0.29   0.32   0.43   0.32   0.32   0.14   0.06   0.02   0.00   0.14



In [224]:
# One row per letter, one column per motif position. The width is read from
# the motif `x` itself so it cannot drift away from the length the motif was
# created with.
y = np.zeros((len(alphabet.letters), x.length))

def getLetterToInt (num):
    """Map a numeric index to its DNA letter: 0, 1, 2, 3 -> 'A', 'C', 'G', 'T'.

    Despite the name, the direction is index -> letter. For any other value
    an error message is printed and -1 is returned.
    """
    for code, base in enumerate('ACGT'):
        if num == code:
            return base
    print('ERROR: Num ' + str(num) + " not a valid char in DNA alphabet")
    return -1


# Copy the PWM into the array row by row; row k holds the probabilities of
# the letter getLetterToInt(k). The former `letterCount += 1` was dropped:
# the name is never defined in this notebook, so the loop failed with a
# NameError on a fresh kernel, and its value was not used anywhere.
for letter in range(len(x)):
    print(letter)
    y[letter] = x[getLetterToInt(letter)]

np.set_printoptions(precision=2)
y

0
1
2
3


array([[ 0.24,  0.18,  0.19,  0.25,  0.41,  0.47,  0.09,  0.07,  0.21,
         0.29],
       [ 0.28,  0.26,  0.16,  0.42,  0.13,  0.24,  0.39,  0.49,  0.16,
         0.38],
       [ 0.2 ,  0.24,  0.22,  0.  ,  0.14,  0.15,  0.46,  0.43,  0.63,
         0.18],
       [ 0.29,  0.32,  0.43,  0.32,  0.32,  0.14,  0.06,  0.02,  0.  ,
         0.14]])

### Are the elements of a PWM interpretable as probabilities?

In [144]:
# A PWM column is a probability distribution if its entries add up to 1,
# so print the per-position total over all letters.
for pos in range(x.length):
    columnTotal = sum(x[symbol][pos] for symbol in alphabet.letters)
    print(str(pos) + " -> " + str(columnTotal))

0 -> 1.0
1 -> 1.0
2 -> 1.0
3 -> 1.0
4 -> 1.0
5 -> 1.0
6 -> 1.0
7 -> 1.0
8 -> 1.0
9 -> 1.0
