# Visualization
In this notebook, we're going to explore what the model we have trained actually learned.
This involves some of the following things:
* **Verify that we actually get better while learning**
* **Look at the motifs we learn**
* What does the hidden layer tell us about the model

In [1]:
# some always important inputs
import sys
import os
import random
import time
import numpy as np
import cPickle
#import PIL

# the underlying convRBM implementation
sys.path.append(os.path.abspath('../code'))
from convRBM import CRBM
import getData as dataRead

# plotting and data handling
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split

# the biopython stuff
import Bio.SeqIO as sio
import Bio.motifs.matrix as mat
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio import motifs as mot

ERROR (theano.sandbox.cuda): Failed to compile cuda_ndarray.cu: libcublas.so.7.0: cannot open shared object file: No such file or directory
ERROR:theano.sandbox.cuda:Failed to compile cuda_ndarray.cu: libcublas.so.7.0: cannot open shared object file: No such file or directory


Couldn't import dot_parser, loading of dot files will not be possible.


## Read in the data and a previously trained model
This part of the notebook loads a convolutional RBM that was previously trained on the DHS data. Training may take a lot of time, so it is done once beforehand and the result is stored on disk; only once we have a trained model is it possible to visualize what the model has learnt.

In [23]:
# load the previously trained model from its stored file
MODEL_FILE = '../../models/trainedModel_2016_01_22_03_43.pkl.zip'
learner = CRBM(file_name=MODEL_FILE)

In [24]:
# show the first entry of the learned motif tensor
firstMotif = learner.motifs.get_value()[0]
print(firstMotif)

[[[ -42.84947586   71.13379669  159.7800293  -250.67669678  -31.48116493
    313.79095459 -280.55813599  -50.11504364  235.35887146  134.06730652
   -219.19674683]
  [ 457.17175293   59.71633911 -232.09127808 -329.79992676  353.35272217
   -297.59786987  -10.93707085  661.60986328 -132.41362    -235.93048096
   -163.76266479]
  [-163.91108704 -234.77958679 -132.28271484  661.8359375   -10.12977695
   -299.11599731  353.77526855 -329.71801758 -231.79968262   60.06472397
    457.32302856]
  [-219.21224976  135.13908386  234.99446106  -51.77979279 -281.13021851
    314.58258057  -32.34181976 -250.54194641  160.02111816   72.5296936
    -43.98494339]]]


### Some basic functions to get motifs from the matrices

In [42]:
def getLetterToInt (num):
    """Map a row index of a motif matrix to its DNA letter.

    Despite the name, the conversion goes from integer to letter:
    0 -> 'A', 1 -> 'C', 2 -> 'G', 3 -> 'T'.

    Parameters
    ----------
    num : int
        Row index to translate.

    Returns
    -------
    str or int
        The nucleotide letter for `num`. If `num` is not one of 0..3 an
        error message is printed and the sentinel -1 is returned.
    """
    for index, letter in enumerate('ACGT'):
        if num == index:
            return letter
    # A single-argument print(...) produces the same output under Python 2
    # (parenthesised expression) and Python 3 (function call).
    print('ERROR: Num ' + str(num) + " not a valid char in DNA alphabet")
    return -1

def createMotifFromMatrix (matrix, alphabet=IUPAC.unambiguous_dna):
    """Turn one log-odds weight matrix into a Biopython motif.

    Parameters
    ----------
    matrix : numpy array with 4 rows
        Row i holds the weights of letter getLetterToInt(i) (A, C, G, T),
        one column per motif position. Entries are interpreted as
        log(foreground / background).
    alphabet : Biopython alphabet
        Passed through to the Motif constructor.

    Returns
    -------
    Bio.motifs.Motif
        Motif whose counts are, for every position, a probability
        distribution over the four letters.
    """
    assert matrix.shape[0] == 4

    # matrix_ij = log(foreground/background)
    #   <=> log(foreground) = matrix_ij + log(background)
    # 0.25 if we treat all letters as equally probable. The constant cancels in
    # the normalization below but is kept to mirror the derivation.
    logForeground = np.asarray(matrix, dtype=np.float64) + np.log(0.25)

    # Subtract the per-position maximum before exponentiating. This does not
    # change the normalized result, but it keeps np.exp from overflowing on
    # large weights.
    logForeground = logForeground - logForeground.max(axis=0, keepdims=True)
    psm = np.exp(logForeground)

    # Normalize over the letter axis (axis 0) so that every position (column)
    # sums to 1. Normalizing over axis 1 would instead make each letter sum to
    # 1 across positions, which is not a per-position distribution.
    psm = psm / psm.sum(axis=0, keepdims=True)

    # make this matrix a valid motif
    counts = {}
    for row in range(4):
        counts[getLetterToInt(row)] = (psm[row]).tolist()
    motif = mot.Motif(alphabet=alphabet, instances=None, counts=counts)
    return motif


def weblogo(motif, file_format="png", version="2.8.2", **kwds):
    """Request a sequence logo for `motif` from the WebLogo web service.

    Unlike writing the logo to a file, this returns the open HTTP response
    so the caller can read the image data directly; the caller is
    responsible for closing it.

    Parameters
    ----------
    motif : motif object
        Must provide format('transfac') and a `length` attribute.
    file_format : str
        Image format requested from the server (sent lower-cased).
    version : str
        Accepted for signature compatibility; not used by this function.
    **kwds
        Overrides for any of the form fields listed in `values` below.

    Returns
    -------
    file-like object
        The response returned by urlopen for the POST request.
    """
    from Bio._py3k import urlopen, urlencode, Request
    frequencies = motif.format('transfac')
    url = 'http://weblogo.threeplusone.com/create.cgi'
    values = {'sequences': frequencies,
              'format': file_format.lower(),
              'stack_width': 'medium',
              'stack_per_line': '40',
              'alphabet': 'alphabet_dna',
              'ignore_lower_case': True,
              'unit_name': "bits",
              'first_index': '1',
              'logo_start': '1',
              'logo_end': str(motif.length),
              'composition': "comp_auto",
              'percentCG': '',
              'scale_width': True,
              'show_errorbars': False,
              'logo_title': '',
              'logo_label': '',
              'show_xaxis': False,
              'xaxis_label': '',
              'show_yaxis': False,
              'yaxis_label': '',
              'yaxis_scale': 'auto',
              'yaxis_tic_interval': '1.0',
              'show_ends': False,
              'show_fineprint': False,
              'color_scheme': 'color_auto',
              'symbols0': '',
              'symbols1': '',
              'symbols2': '',
              'symbols3': '',
              'symbols4': '',
              'color0': '',
              'color1': '',
              'color2': '',
              'color3': '',
              'color4': '',
              }
    values.update(kwds)
    # Encode every field the same way: a disabled option (False) is sent as an
    # empty string, everything else as its string form. Previously only the
    # keyword overrides were encoded like this, so the False defaults above
    # went out as the literal text "False".
    # NOTE(review): presumably the server treats any non-empty value as
    # "enabled" -- confirm against the WebLogo form handling.
    values = dict((k, "" if v is False else str(v)) for k, v in values.items())
    data = urlencode(values).encode("utf-8")
    req = Request(url, data)
    response = urlopen(req)
    return response


def getLogoListFrom4DMatrix(matrix):
    """Fetch one sequence-logo image per motif in `matrix`.

    Parameters
    ----------
    matrix : numpy array
        Indexed as matrix[motifNum, 0] to obtain the weight matrix of a
        single motif, which is handed to createMotifFromMatrix.

    Returns
    -------
    list
        One image array (as returned by plt.imread) per motif, in order.
    """
    images = []
    for motifNum in range(matrix.shape[0]):
        m = createMotifFromMatrix(matrix[motifNum,0])
        reader = weblogo(m)
        try:
            images.append(plt.imread(reader))
        finally:
            # One HTTP response is opened per motif; release it right away
            # instead of leaving the connection open.
            reader.close()
    return images

import math

def bestSplit(x):
    """Choose a near-square (rows, cols) grid with at least `x` cells.

    rows is sqrt(x) rounded to the nearest integer, cols is the smallest
    integer such that rows * cols >= x.

    Parameters
    ----------
    x : int
        Number of cells needed; must be at least 1.

    Returns
    -------
    tuple of (int, int)
        Grid dimensions as integers, so they can be used directly as
        subplot counts.

    Raises
    ------
    ValueError
        If `x` is smaller than 1 (no grid can be built).
    """
    if x < 1:
        raise ValueError("bestSplit needs at least one cell, got " + str(x))
    rows = int(round(math.sqrt(x)))
    # float() forces true division under Python 2 as well
    cols = int(math.ceil(x / float(rows)))
    return rows, cols

def getObserverIndex(observers=None):
    """Return the position of the first observer whose name contains "motif".

    Parameters
    ----------
    observers : sequence, optional
        Objects with a `name` string attribute. Defaults to the observers
        of the global `learner`.

    Returns
    -------
    int
        Index of the first observer whose lower-cased name contains
        "motif".

    Raises
    ------
    ValueError
        If no observer matches. Without this, None would be returned and
        the caller's list indexing would fail with an unrelated TypeError.
    """
    if observers is None:
        observers = learner.observers
    for index, obs in enumerate(observers):
        if "motif" in obs.name.lower():
            return index
    raise ValueError("no observer with 'motif' in its name was found")

In [44]:
# dump the hyper parameters, then the number of entries the motif observer recorded
learner.printHyperParams()
motifObserver = learner.observers[getObserverIndex()]
print(len(motifObserver.scores))

{'learning_rate': 1e-05,
 'motif_length': 11,
 'number_of_motifs': 15,
 'pooling_factor': 5}
1001


## Make a video from the motifs in which we have a subplot of all motifs per frame!
For that, we have to find out how we can simply get the image without writing it to disk first.
Then, we can use python multimedia capabilities for some nice plotting!

In [45]:
# Collect the logos of all motifs for every recorded entry of the motif
# observer, capped at numberOfEpochs entries.
observerIndex = getObserverIndex()
motifScores = learner.observers[observerIndex].scores
numberOfEpochs = 200
frames = min(len(motifScores), numberOfEpochs)

logosOverTime = []
for timeSlice in range(frames):
    allMotifsPerSlice = motifScores[timeSlice]
    logosOverTime.append(getLogoListFrom4DMatrix(allMotifsPerSlice))
    print("Got Logos for Time/Epoch " + str(timeSlice))

Got Logos for Time/Epoch 0
Got Logos for Time/Epoch 1
Got Logos for Time/Epoch 2
Got Logos for Time/Epoch 3
Got Logos for Time/Epoch 4
Got Logos for Time/Epoch 5
Got Logos for Time/Epoch 6
Got Logos for Time/Epoch 7
Got Logos for Time/Epoch 8
Got Logos for Time/Epoch 9
Got Logos for Time/Epoch 10
Got Logos for Time/Epoch 11
Got Logos for Time/Epoch 12
Got Logos for Time/Epoch 13
Got Logos for Time/Epoch 14
Got Logos for Time/Epoch 15
Got Logos for Time/Epoch 16
Got Logos for Time/Epoch 17
Got Logos for Time/Epoch 18
Got Logos for Time/Epoch 19
Got Logos for Time/Epoch 20
Got Logos for Time/Epoch 21
Got Logos for Time/Epoch 22
Got Logos for Time/Epoch 23
Got Logos for Time/Epoch 24
Got Logos for Time/Epoch 25
Got Logos for Time/Epoch 26
Got Logos for Time/Epoch 27
Got Logos for Time/Epoch 28
Got Logos for Time/Epoch 29
Got Logos for Time/Epoch 30
Got Logos for Time/Epoch 31
Got Logos for Time/Epoch 32
Got Logos for Time/Epoch 33
Got Logos for Time/Epoch 34
Got Logos for Time/Epoch 35
Go

In [51]:
from matplotlib import animation

# one large canvas without margins or gaps between the subplots
fig = plt.figure(figsize=(30,13))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0, wspace=0)

# caption near the top-right corner; printFrame() rewrites it for each frame
captionStyle = dict(verticalalignment='bottom',
                    horizontalalignment='right',
                    color='green', fontsize=30)
frame_text = fig.text(0.95, 0.95, 'Epoch: ' + str(0), **captionStyle)

allMotifsOverTime = learner.observers[observerIndex].scores
print(allMotifsOverTime[0].shape)

# grid dimensions for the subplots, one cell per motif
x, y = bestSplit(allMotifsOverTime[0].shape[0])
print('%s %s' % (x, y))

# filled by init() with the image objects that printFrame() updates
axesList = []

def init():
    """Draw the first frame: one subplot per motif showing its first logo.

    Safe to call more than once. The subplots and their image objects are
    created on the first call only; later calls reset the existing images
    to the first frame instead of adding more subplots and growing
    axesList.
    """
    print("in init")
    firstLogos = logosOverTime[0]
    if axesList:
        # already built -- just rewind to the first frame
        for im, logo in zip(axesList, firstLogos):
            im.set_data(logo)
    else:
        # subplot grid sizes must be integers; x and y may arrive as floats
        rows, cols = int(x), int(y)
        for i in range(allMotifsOverTime[0].shape[0]):
            ax = fig.add_subplot(rows, cols, i+1, xticks=[], yticks=[])
            # despite its name, axesList stores the image objects returned
            # by imshow, which is what printFrame() calls set_data on
            axesList.append(ax.imshow(firstLogos[i]))
    print(len(axesList))
        
def printFrame(frameNr):
    """Swap in the logos of entry `frameNr` and update the epoch caption."""
    motifCount = allMotifsOverTime[frameNr].shape[0]
    for idx in range(motifCount):
        # reuse the image objects created by init() instead of redrawing
        image = axesList[idx]
        image.set_data(logosOverTime[frameNr][idx])
    frame_text.set_text('Epoch: ' + str(frameNr))

# init() draws the first frame, printFrame() is then called once per frame index
anim = animation.FuncAnimation(fig, printFrame, init_func=init,
                               frames=frames, interval=200, repeat=True)

# write the animation to disk
# NOTE(review): presumably needs a movie writer such as ffmpeg installed -- confirm
anim.save('test.mp4', fps=10)
#plt.show()

(30, 1, 4, 11)
5.0 6.0
in init
30
