# Convolutional RBM (cRBM)

This notebook implements the basic building blocks for convolutional RBMs (cRBMs).
So far it covers only the preliminaries, i.e. the simple pieces that are needed before the Boltzmann machine itself can be implemented.


## Part 1: Initializing the weight matrices and applying them to sequences

In [152]:
#import theano
import numpy as np
import Bio.SeqIO as sio
import Bio.motifs.matrix as mat
from Bio.Alphabet import IUPAC

import random
import time

In [147]:
class FASTAReader:
    """Small helper for loading the records of a FASTA file with Bio.SeqIO.

    The path handed to the constructor is stored on the instance, but
    readSequencesFromFile does not consult it: the filename given to that
    method is opened exactly as passed.
    """

    def __init__(self, _path):
        # Only stored; no method of this class reads it back.
        self.path = _path

    def readSequencesFromFile (self, filename):
        """Parse a FASTA file and return all of its records as a list.

        Parameters
        ----------
        filename : path of the FASTA file, passed to open() unchanged.

        Returns
        -------
        list with every record yielded by Bio.SeqIO.parse, in file order.
        """
        # The records are collected inside the with-block so the parser is
        # fully consumed before the handle is closed; previously the handle
        # was opened inline and never closed.
        with open(filename) as handle:
            return list(sio.parse(handle, 'fasta', IUPAC.unambiguous_dna))
    

In [148]:
class ConvRBM:
    """Preliminaries of a convolutional RBM: random motifs applied to sequences.

    An instance holds `numMotifs` motifs of length `motifLength` over
    `alphabet`. The motifs start out empty and are created by
    initializePWMs(); forwardPass() then evaluates every motif on a sequence.
    """

    def __init__ (self, _motifLength, _numMotifs, _alphabet=IUPAC.unambiguous_dna):
        self.motifLength = _motifLength
        self.numMotifs = _numMotifs
        self.motifs = []
        self.alphabet = _alphabet

    def initializePWMs (self):
        """(Re)create `numMotifs` random motifs and store them in self.motifs.

        The list is rebuilt on every call. Appending instead would leave more
        than `numMotifs` motifs behind when the method (or the notebook cell
        calling it) is run twice, and forwardPass would then index past the
        rows of its result array.
        """
        self.motifs = [self._createRandomMotif(self.motifLength, self.alphabet)
                       for _ in range(self.numMotifs)]

    def convertLetterToInt (self, letter):
        """Map a nucleotide letter to an integer code.

        'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2 and every other value -> 3
        (that covers 'T'/'t', but also any unexpected symbol such as 'N').
        """
        if letter in ('A', 'a'):
            return 0
        if letter in ('C', 'c'):
            return 1
        if letter in ('G', 'g'):
            return 2
        return 3

    def convertSequenceToInt (self, seq):
        """Return the list of integer codes for every letter of `seq`."""
        # Must go through self: the unqualified name convertLetterToInt does
        # not exist at module level and raised a NameError.
        return [self.convertLetterToInt(letter) for letter in seq]

    def forwardPass (self, seq):
        """Evaluate every motif on `seq`.

        Returns a numpy array with one row per motif and
        len(seq) - motifLength + 1 columns; row k holds the result of
        pssm.calculate(seq) for the log-odds matrix of motif k.
        If no motifs have been created yet, an error message is printed and
        None is returned.
        """
        if not self.motifs:
            print('Error: No motifs created so far. Try executing initializePWMs before!')
            return None

        foundMotifs = np.zeros((self.numMotifs, len(seq)-self.motifLength+1))
        for motifCount, motif in enumerate(self.motifs):
            pssm = motif.log_odds()
            foundMotifs[motifCount,:] = pssm.calculate(seq)

        return foundMotifs

    def _createRandomMotif (self, motifLength, alphabet):
        """Build a PositionWeightMatrix from random counts.

        For every letter of `alphabet`, `motifLength` integers are drawn
        uniformly from 0..100 (both inclusive) and used as counts.
        """
        # NOTE(review): a drawn count of 0 is possible; confirm how
        # log_odds() treats a letter with zero count at a position.
        counts = {}
        for letter in alphabet.letters:
            # range() instead of xrange() so the code runs on Python 2 and 3
            counts[letter] = [random.randint(0,100) for x in range(motifLength)]
        return mat.PositionWeightMatrix(alphabet, counts)
        


In [149]:
# Load the DHS sequences that the forward pass in the next cell iterates over.
# The '.' given to FASTAReader is only stored on the reader; the file itself is
# located through the relative path passed to readSequencesFromFile.
seqReader = FASTAReader('.')
allSeqs = seqReader.readSequencesFromFile('../data/wgEncodeAwgDnaseUwAg10803UniPk.fa')


In [154]:
# Benchmark: run the forward pass of a single random length-6 motif over every
# loaded sequence and report how long the whole sweep took.
learner = ConvRBM(6, 1)
learner.initializePWMs()

start = time.time()
lengthes = []
for i, seq in enumerate(allSeqs):
    convoluted = learner.forwardPass(seq.seq)
    lengthes.append(len(seq))
    # progress marker every 5000 sequences
    if i % 5000 == 0:
        print(i)
# one length was recorded per sequence, so this is the number processed
i = len(lengthes)

print('')
print('')
print("Number of filters: %s" % (learner.numMotifs,))
print("Number of DHSs: %s" % (i,))
print("Average Length of Sequences: %s" % (np.array(lengthes).mean(),))
print("Execution Time: %s" % (time.time()-start,))

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000


Number of filters: 1
Number of DHSs: 171275
Average Length of Sequences: 150.0
Execution Time: 23.420541048


Some tests to learn how to do things with Biopython and Theano
===

### Do all DHS sequences have the same length by default?


In [51]:
# Scan the FASTA file once, counting all records and those whose length
# differs from 150. The file is opened in a with-block so the handle is closed
# when the scan ends; previously it was opened inline and never closed.
count = 0
countNotSameLength = 0
with open('../data/wgEncodeAwgDnaseUwAg10803UniPk.fa') as fasta_file:
    fasta_seqs = sio.parse(fasta_file, 'fasta')
    for seq in fasta_seqs:
        if len(seq) != 150:
            print('not length 150')
            countNotSameLength += 1
        count += 1

print('Number of sequences: ' + str(count))
print('Number of seqs with length != 150: ' + str(countNotSameLength))

Number of sequences: 171275
Number of seqs with length != 150: 0


In [52]:
import Bio.NeuralNetwork.Gene.Schema as schema
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

### How do we generate random motif matrices (PWMs or PSSMs)?

In [145]:
# Print three motifs from Biopython's RandomMotifGenerator.
# NOTE(review): 6 and 10 are presumably the minimum and maximum motif length
# (the printed motifs have 7-9 letters) - confirm against the Biopython docs.
alphabet = IUPAC.unambiguous_dna
generator = schema.RandomMotifGenerator(alphabet, 6, 10)
for i in range(3):
    x = generator.random_motif()
    print(x)

CTGCGAT
CGTGACCA
GTACACCGA


In [54]:
from Bio import motifs

In [55]:
# A Motif constructed from the alphabet alone; the trailing expression makes
# the notebook display its length.
x = motifs.Motif(alphabet)
len(x)

0

In [56]:
import Bio.motifs.matrix as mat
import random

In [64]:
def createRandomMotif (motifLength, alphabet):
    """Return a PositionWeightMatrix built from random counts.

    For each letter in `alphabet.letters` (in that order), `motifLength`
    integers are drawn uniformly from 0..100 inclusive; the resulting
    letter -> counts mapping is handed to mat.PositionWeightMatrix together
    with the alphabet.
    """
    counts = dict(
        (letter, [random.randint(0, 100) for _ in range(motifLength)])
        for letter in alphabet.letters
    )
    return mat.PositionWeightMatrix(alphabet, counts)

In [143]:
# Build one random motif of length 10 and print it
x = createRandomMotif(10, alphabet)
print(x)

        0      1      2      3      4      5      6      7      8      9
A:   0.28   0.05   0.38   0.08   0.08   0.29   0.40   0.20   0.06   0.07
C:   0.23   0.08   0.42   0.33   0.34   0.10   0.15   0.40   0.25   0.26
G:   0.24   0.20   0.18   0.24   0.18   0.42   0.16   0.18   0.35   0.20
T:   0.24   0.66   0.02   0.35   0.40   0.20   0.29   0.22   0.33   0.47



### Are the elements of a PWM interpretable as probabilities?

In [144]:
# Sanity check: if the entries are probabilities, the values of all letters at
# any single position have to add up to 1.
for pos in range(x.length):
    c = 0
    for letter in alphabet.letters:
        c = c + x[letter][pos]
    print("%s -> %s" % (pos, c))

0 -> 1.0
1 -> 1.0
2 -> 1.0
3 -> 1.0
4 -> 1.0
5 -> 1.0
6 -> 1.0
7 -> 1.0
8 -> 1.0
9 -> 1.0
