# Visualization
In this notebook, we're going to explore what the model we have trained actually learned.
This involves some of the following things:
* **Verify that we actually get better while learning**
* **Look at the motifs we learn**
* What does the hidden layer tell us about the model

In [1]:
# some always important inputs
import sys
import os
import random
import time
import numpy as np
import cPickle
#import PIL

# the underlying convRBM implementation
sys.path.append(os.path.abspath('../code'))
from convRBM import CRBM
import getData as dataRead

# plotting and data handling
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split

# the biopython stuff
import Bio.SeqIO as sio
import Bio.motifs.matrix as mat
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio import motifs as mot

ERROR (theano.sandbox.cuda): Failed to compile cuda_ndarray.cu: libcublas.so.7.0: cannot open shared object file: No such file or directory
ERROR:theano.sandbox.cuda:Failed to compile cuda_ndarray.cu: libcublas.so.7.0: cannot open shared object file: No such file or directory


Couldn't import dot_parser, loading of dot files will not be possible.


## Read in the data and a previously trained model
This part of the notebook loads a convolutional RBM that was previously trained on the DHS data. Training may take a lot of time, and only once a trained model is available is it possible to visualize what the model has learnt.

In [2]:
# Show the working directory first: the model path below is relative to it.
print(os.getcwd())

# Restore the previously trained model from its saved file.
learner = CRBM(
    file_name='../code/models/trainedModel_2016_01_19_18_42.pkl.zip'
)

/home/sasse/ConvolutionalRBM/notebooks


In [3]:
# Peek at the first entry of the learned motif tensor as a quick sanity check.
firstMotif = learner.motifs.get_value()[0]
print(firstMotif)

[[[ 0.04918326 -0.32120562 -1.10284591 -0.53361011  0.35052255  0.35646319
   -1.06514406  0.56772792  1.52398145]
  [ 0.7236135   1.78293872  1.56473255  0.00804297  1.43115532  0.6812247
    0.79523093  1.72713494 -0.81963223]
  [-0.07839292  1.42034292  0.04688607  0.99685663  1.03838348 -0.67912745
    1.00209916 -1.46065879  0.36329684]
  [ 1.30040765  0.00278098  1.17192209  1.41627991 -0.68514997  1.69636905
    1.56882477  1.87927997  1.01025581]]]


### Some basic functions to get motifs from the matrices

In [4]:
def getLetterToInt(num):
    """Map a row index to its DNA letter.

    Despite the name, the conversion goes from integer to letter:
    0 -> 'A', 1 -> 'C', 2 -> 'G', 3 -> 'T'.

    Any other value prints an error message and returns -1.
    """
    for index, letter in enumerate('ACGT'):
        if num == index:
            return letter
    # A single-argument print(...) call produces the same output on
    # Python 2 and Python 3, unlike the bare print statement.
    print('ERROR: Num ' + str(num) + " not a valid char in DNA alphabet")
    return -1

def createMotifFromMatrix (matrix, alphabet=IUPAC.unambiguous_dna):
    """Turn a weight matrix with one row per DNA letter into a Biopython motif.

    Parameters
    ----------
    matrix : numpy array whose first dimension is 4; row i holds the
        weights for the letter getLetterToInt(i), i.e. A, C, G, T.
        (Callers in this notebook pass 2D letter x position slices.)
    alphabet : alphabet handed to the motif constructor.

    Returns
    -------
    A ``mot.Motif`` whose counts are the per-position letter probabilities.
    """
    assert matrix.shape[0] == 4

    # The weights are treated as log odds, matrix_ij = log(foreground/background).
    # Adding log(background) and exponentiating recovers the (unnormalised)
    # foreground; 0.25 treats all four letters as equally probable.
    psm = np.exp(matrix + np.log(0.25))

    # Normalise over the letter axis (axis 0) so that, at every position,
    # the probabilities of A, C, G and T sum to one. Summing over axis 1
    # would instead normalise each letter across positions, which is not a
    # valid position-specific probability matrix.
    psm = psm / psm.sum(axis=0, keepdims=True)

    # make this matrix a valid motif: one list of per-position values per letter
    counts = {}
    for row in range(4):
        counts[getLetterToInt(row)] = psm[row].tolist()
    return mot.Motif(alphabet=alphabet, instances=None, counts=counts)


def weblogo(motif, fname, file_format="png_print", version="2.8.2", **kwds):
    """Request a sequence logo for ``motif`` from the WebLogo web service.

    The motif is serialised in TRANSFAC format and POSTed, together with the
    rendering options below, to the WebLogo ``create.cgi`` endpoint.

    Parameters
    ----------
    motif : object providing ``format('transfac')`` and ``length``.
    fname : accepted for interface compatibility; not used by this function.
    file_format : output format requested from the service (lower-cased).
    version : accepted for interface compatibility; not used by this function.
    **kwds : overrides / additions for the form fields listed in ``values``.

    Returns
    -------
    The open response object returned by ``urlopen``; the caller reads the
    image data from it.
    """
    from Bio._py3k import urlopen, urlencode, Request
    frequencies = motif.format('transfac')
    url = 'http://weblogo.threeplusone.com/create.cgi'
    values = {'sequences': frequencies,
              'format': file_format.lower(),
              'stack_width': 'medium',
              'stack_per_line': '40',
              'alphabet': 'alphabet_dna',
              'ignore_lower_case': True,
              'unit_name': "bits",
              'first_index': '1',
              'logo_start': '1',
              'logo_end': str(motif.length),
              'composition': "comp_auto",
              'percentCG': '',
              'scale_width': True,
              'show_errorbars': False,
              'logo_title': '',
              'logo_label': '',
              'show_xaxis': False,
              'xaxis_label': '',
              'show_yaxis': False,
              'yaxis_label': '',
              'yaxis_scale': 'auto',
              'yaxis_tic_interval': '1.0',
              'show_ends': False,
              'show_fineprint': False,
              'color_scheme': 'color_auto',
              'symbols0': '',
              'symbols1': '',
              'symbols2': '',
              'symbols3': '',
              'symbols4': '',
              'color0': '',
              'color1': '',
              'color2': '',
              'color3': '',
              'color4': '',
              }
    values.update(kwds)
    # A disabled option must be sent as an empty string; a literal False would
    # be url-encoded as the non-empty text "False". Apply the conversion to the
    # defaults above as well as to caller-supplied keywords so both behave alike.
    values = dict((k, "" if v is False else str(v)) for k, v in values.items())
    data = urlencode(values).encode("utf-8")
    req = Request(url, data)
    response = urlopen(req)
    return response


def getLogoListFrom4DMatrix(matrix):
    """Fetch one logo image for every motif along the first axis of ``matrix``.

    For each index k, ``matrix[k, 0]`` is converted into a motif, rendered via
    the WebLogo service and decoded with ``plt.imread``. The decoded images are
    returned as a list in index order.
    """
    def logoFor(weights):
        # One web request per motif; the response is decoded straight away.
        response = weblogo(createMotifFromMatrix(weights), 'sub')
        return plt.imread(response)

    return [logoFor(matrix[k, 0]) for k in range(matrix.shape[0])]

import math


def bestSplit(x):
    """Return a near-square ``(rows, cols)`` grid able to hold ``x`` panels.

    ``rows`` is the square root of ``x`` rounded to the nearest integer and
    ``cols`` is the smallest column count with ``rows * cols >= x``. Both are
    returned as ints so they can be used directly as subplot grid dimensions.
    ``x`` is expected to be a positive integer.
    """
    rows = int(round(math.sqrt(x)))
    # explicit float division: rows is an int, and int / int truncates on Python 2
    cols = int(math.ceil(x / float(rows)))
    return rows, cols

In [5]:
# Gather the learned motifs as a list of 2D matrices.
M = learner.motifs.get_value()
# Step by two to keep only the positive strands; indexing the second axis
# with 0 drops that dimension (it has size 1), leaving a 2D matrix per motif.
motifs = [M[i, 0] for i in range(0, M.shape[0], 2)]
    

### To save time and network data, create the images here and plot them later

In [19]:
# Fetch one rendered logo per motif up front, so the plotting cell below can
# be re-run without contacting the WebLogo service again.
images = []
for motif in motifs:
    logo = weblogo(createMotifFromMatrix(motif), 'sub')
    images.append(plt.imread(logo))

# bestSplit (and its math import) is already defined in the helper-function
# cell above; it is deliberately not redefined here, so the notebook has a
# single definition.

In [20]:
# Lay the pre-fetched logos out on a near-square grid, one subplot per motif.
x, y = bestSplit(len(motifs))
# add_subplot expects integer grid dimensions, but bestSplit may return floats.
x, y = int(x), int(y)

fig = plt.figure(figsize=(30,13))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0, wspace=0)
for count in range(1, len(motifs) + 1):
    # subplot positions are 1-based, the image list is 0-based
    ax = fig.add_subplot(x, y, count, xticks=[], yticks=[])
    ax.imshow(images[count - 1])

plt.show()

In [6]:
# Report the model's hyper-parameters, then the number of entries stored in
# the scores of the third observer (used further below as motifs over time).
learner.printHyperParams()
numScores = len(learner.observers[2].scores)
print(numScores)

{'learning_rate': 1e-05,
 'motif_length': 9,
 'number_of_motifs': 10,
 'pooling_factor': 2}
21


## Make a video from the motifs in which we have a subplot of all motifs per frame!
For that, we have to find out how we can simply get the image without writing it to disk first.
Then, we can use python multimedia capabilities for some nice plotting!

In [None]:
# Render the logos for every recorded score entry, i.e. one list of logo
# images per time slice. Each slice triggers one web request per motif.
logosOverTime = []
for timeSlice, allMotifsPerSlice in enumerate(learner.observers[2].scores):
    logosOverTime.append(getLogoListFrom4DMatrix(allMotifsPerSlice))
    print("Got Logos for Time/Epoch " + str(timeSlice))

In [8]:
from matplotlib import animation

fig = plt.figure(figsize=(30,13))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0, wspace=0)

allMotifsOverTime = learner.observers[2].scores
print(allMotifsOverTime[0].shape)

x, y = bestSplit(allMotifsOverTime[0].shape[0])
# add_subplot expects integer grid dimensions, but bestSplit may return floats.
x, y = int(x), int(y)
print("%s %s" % (x, y))

# One axes per grid slot, created on first use and reused on later frames.
frameAxes = {}

def printFrame(frameNr):
    """Draw the logos of time slice ``frameNr`` onto the shared figure.

    Each motif keeps its own subplot across frames. The subplot is cleared
    before the new logo is drawn, so images from earlier frames do not pile
    up while the animation repeats.
    """
    numMotifs = allMotifsOverTime[frameNr].shape[0]
    for motif in range(numMotifs):
        ax = frameAxes.get(motif)
        if ax is None:
            ax = fig.add_subplot(x, y, motif+1, xticks=[], yticks=[])
            frameAxes[motif] = ax
        else:
            # clear() also resets the ticks, so hide them again
            ax.clear()
            ax.set_xticks([])
            ax.set_yticks([])
        ax.imshow(logosOverTime[frameNr][motif])

anim = animation.FuncAnimation(fig,
                               printFrame,
                               init_func=None,
                               frames=len(allMotifsOverTime),
                               interval=100, repeat=True)
#anim.save('test.mp4', fps=30, extra_args=['-vcodec', 'libx264'])
plt.show()

(20, 1, 4, 9)
4.0 5.0
