# Visualization
In this notebook, we're going to explore what the model we have trained actually learned.
This involves some of the following things:
* **Verify that we actually get better while learning**
* Look at the motifs we learn
* What does the hidden layer tell us about the model

In [1]:
%matplotlib inline

# some always important inputs
import sys
import os
import random
import time
import numpy as np
import cPickle

# the underlying convRBM implementation
sys.path.append(os.path.abspath('../code'))
from convRBM import CRBM
import getData as dataRead

# plotting and data handling
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split

# the biopython stuff
import Bio.SeqIO as sio
import Bio.motifs.matrix as mat
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio import motifs as mot

ERROR (theano.sandbox.cuda): Failed to compile cuda_ndarray.cu: libcublas.so.7.0: cannot open shared object file: No such file or directory
ERROR:theano.sandbox.cuda:Failed to compile cuda_ndarray.cu: libcublas.so.7.0: cannot open shared object file: No such file or directory


Couldn't import dot_parser, loading of dot files will not be possible.


## Read in the data and a previously trained model
This part of the notebook trains a convolutional RBM on the DHS data. This may take a lot of time but only once we trained it, will it be possible to do the visualization of what the model learnt.

In [48]:
seqReader = dataRead.FASTAReader()
allSeqs = seqReader.readSequencesFromFile('../data/wgEncodeAwgDnaseUwAg10803UniPk.fa')

#data = [allSeqs[random.randrange(0,len(allSeqs))] for i in range(20000)]
data = allSeqs
train_set, test_set = train_test_split(data, test_size=0.1)
print "Training set size: " + str(len(train_set))
print "Test set size: " + str(len(test_set))

start = time.time()
trainingData = np.array([dataRead.getMatrixFromSeq(t) for t in train_set])
testingData = np.array([dataRead.getMatrixFromSeq(t) for t in test_set])
print "Conversion of test set in (in ms): " + str((time.time()-start)*1000)

# read in the model
learner = CRBM(9, 20, 0.001, 2)
learner.loadModel('../code/models/model_longRun_wholeData_good.pkl')

Training set size: 154147
Test set size: 17128
ERROR. LETTER N DOES NOT EXIST!
ERROR. LETTER N DOES NOT EXIST!
Conversion of test set in (in ms): 25589.1401768


In [49]:
learner.motifs.get_value()[0]

array([[[-1.51438737, -1.26050794, -1.2018342 , -1.41881883, -1.37554908,
         -0.94830269, -0.93635738, -1.43789268, -1.56744576],
        [-0.60659784, -1.22122097, -0.77071369, -1.24265921, -1.39364493,
         -1.36586833, -0.84354174, -0.90965325, -0.65799713],
        [-0.98905933, -1.30046213, -0.74781138, -0.92262971, -1.46803606,
         -1.04632795, -1.40989304, -0.45891264, -1.28915596],
        [-0.72574425, -0.79178959, -1.02306497, -1.51537621, -1.13826168,
         -1.06528139, -0.93671036, -1.1724515 , -0.88690335]]], dtype=float32)

## Time for visualizing things

### Some basic funcions to get motifs from the matrices

In [50]:
def getLetterToInt (num):
    if num == 0:
        return 'A'
    elif num == 1:
        return 'C'
    elif num == 2:
        return 'G'
    elif num == 3:
        return 'T'
    else:
        print 'ERROR: Num ' + str(num) + " not a valid char in DNA alphabet"
        return -1

def createMotifFromMatrix (matrix, alphabet=IUPAC.unambiguous_dna):
    assert matrix.shape[0] == 4
    
    # transform the matrix such that the log odds are taken away
    psm = matrix - np.log(0.25) # 0.25 if we treat all letters as equally probable
    psm = np.exp(psm)
    psm = psm / psm.sum(axis=1, keepdims=True)
    
    # make this matrix a valid motif
    counts = {}
    for row in range(4):
        counts[getLetterToInt(row)] = (psm[row]).tolist()
    motif = mot.Motif(alphabet=alphabet, instances=None, counts=counts)
    return motif
        
        

In [52]:
# first, get the motifs into single 2D matrices within a list
motifs = []
M = learner.motifs.get_value()
for i in range(0, M.shape[0], 2): # only add positive strands...
    motifs.append(M[i,0]) # second dim is 1, so just make it 2D
    

In [53]:
t = motifs[4]
mt = createMotifFromMatrix(t)
print mt.format('transfac')

P0      A      C      G      T
01 0.14889395236968994141 0.090371839702129364014 0.13606710731983184814 0.088590644299983978271      N
02 0.063327021896839141846 0.10257748514413833618 0.34385627508163452148 0.052730116993188858032      G
03 0.13056506216526031494 0.17629972100257873535 0.05597451329231262207 0.10194955021142959595      N
04 0.13926328718662261963 0.058216303586959838867 0.037931937724351882935 0.10312003642320632935      N
05 0.10609019547700881958 0.07831121981143951416 0.13704107701778411865 0.14290474355220794678      N
06 0.07411785423755645752 0.25607964396476745605 0.042066976428031921387 0.11731642484664916992      C
07 0.14179807901382446289 0.064862936735153198242 0.092090621590614318848 0.060077965259552001953      N
08 0.12704397737979888916 0.054472073912620544434 0.073619559407234191895 0.17715586721897125244      N
09 0.068900637328624725342 0.11880885809659957886 0.081351920962333679199 0.15615472197532653809      N
XX
//



In [54]:
def weblogo(motif, fname, file_format="png_print", version="2.8.2", **kwds): 
    from Bio._py3k import urlopen, urlencode, Request 
    frequencies = motif.format('transfac') 
    url = 'http://weblogo.threeplusone.com/create.cgi' 
    values = {'sequences': frequencies, 
                    'format': file_format.lower(), 
                    'stack_width': 'medium', 
                    'stack_per_line': '40', 
                    'alphabet': 'alphabet_dna', 
                    'ignore_lower_case': True, 
                    'unit_name': "bits", 
                    'first_index': '1', 
                    'logo_start': '1', 
                    'logo_end': str(motif.length), 
                    'composition': "comp_auto", 
                    'percentCG': '', 
                    'scale_width': True, 
                    'show_errorbars': True, 
                    'logo_title': '', 
                    'logo_label': '', 
                    'show_xaxis': True, 
                    'xaxis_label': '', 
                    'show_yaxis': True, 
                    'yaxis_label': '', 
                    'yaxis_scale': 'auto', 
                    'yaxis_tic_interval': '1.0', 
                    'show_ends': True, 
                    'show_fineprint': True, 
                    'color_scheme': 'color_auto', 
                    'symbols0': '', 
                    'symbols1': '', 
                    'symbols2': '', 
                    'symbols3': '', 
                    'symbols4': '', 
                    'color0': '', 
                    'color1': '', 
                    'color2': '', 
                    'color3': '', 
                    'color4': '', 
                    } 
    values.update(dict((k, "" if v is False else str(v)) for k, v in kwds.items()))
    data = urlencode(values).encode("utf-8")
    req = Request(url, data)
    response = urlopen(req)
    with open(fname, "wb") as f: 
        im = response.read()
        f.write(im)
    f.close()

In [55]:
weblogo(mt, 'test.png')
#freqs = mt.format('transfac')
#print freqs

In [56]:
count = 0
for m in motifs:
    motif = createMotifFromMatrix(m)
    weblogo(motif, 'motifs/learnedMotif_'+str(count)+'.png')
    count += 1
