In [17]:
#Double-mutation helper used by som_average_ungapped_split so that large alignments can be processed in chunks
def double_mutate_ungapped_split(X, ug_r, ug_c):
    """Enumerate every double mutant of one sequence for a block of position pairs.

    Parameters
    ----------
    X : ndarray
        Array whose shape unpacks as (num_summary, seqlen, 1, dims). Only the
        first entry along axis 0 is mutated; the function is written for a
        single sequence (leading axis of length 1).
    ug_r, ug_c : sequence of int
        Positions (indices into the seqlen axis) to mutate; ug_r indexes the
        first output axis and ug_c the second.

    Returns
    -------
    ndarray
        Float array of shape (len(ug_r), len(ug_c), dims*dims, seqlen, 1, dims).
        Entry [i, j, a*dims + b] is a copy of the input in which position
        ug_r[i] is set to channel `a` and position ug_c[j] to channel `b`.
        When both positions coincide the second assignment (channel `b`) wins.
    """
    _, seqlen, _, dims = X.shape
    n_pairs = dims * dims

    mutants = np.zeros((len(ug_r), len(ug_c), n_pairs, seqlen, 1, dims))

    for row, pos_r in enumerate(ug_r):
        for col, pos_c in enumerate(ug_c):
            for pair in range(n_pairs):
                # pair == first_nuc*dims + second_nuc, the layout callers reshape to (dims, dims)
                first_nuc, second_nuc = divmod(pair, dims)

                variant = X.copy()
                # clear the one-hot column, then switch on the requested channel
                variant[0, pos_r, 0, :] = 0.0
                variant[0, pos_r, 0, first_nuc] = 1.0
                variant[0, pos_c, 0, :] = 0.0
                variant[0, pos_c, 0, second_nuc] = 1.0

                mutants[row, col, pair, :] = variant

    return mutants

#Takes the whole list and returns it split into numsplit consecutive chunks (the same chunk list is returned twice: once for rows, once for columns)
def split_list(L, numsplit=4):
    """Cut L into `numsplit` consecutive slices.

    The first numsplit-1 slices each hold len(L)//numsplit items; the last
    slice takes everything that remains. The same list of slices is returned
    twice (as a 2-tuple) so the caller can unpack it as row and column chunks.
    """
    bounds = []
    for k in range(numsplit):
        bounds.append((len(L) // numsplit) * k)
    bounds.append(len(L))

    # pair each boundary with its successor to get [start, stop) windows
    pieces = [L[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
    return (pieces, pieces)

In [27]:
#Modified generic routine that allows the second-order mutagenesis (SoM) of a single sequence to be broken up into chunks
def som_average_ungapped_split(Xdict, ungapped_index, savepath, nntrainer, sess, split=4, progress='on', save=True, layer='output',
                         normalize=False):
    """Average second-order mutagenesis scores over a set of sequences, in chunks.

    For every sequence in Xdict, all double mutants over the positions in
    ungapped_index are generated block by block (the positions are cut into
    `split` row chunks and `split` column chunks), scored with
    nntrainer.get_activations, and assembled into one
    (idxlen, idxlen, dims, dims) grid. The grids are then averaged over
    sequences with np.nanmean.

    Parameters
    ----------
    Xdict : ndarray
        Sequences; shape unpacks as (num_summary, seqlen, 1, dims).
    ungapped_index : sequence of int
        Positions to mutate.
    savepath : str
        Destination passed to np.save when `save` is True.
    nntrainer, sess :
        Trainer object and session handed to nntrainer.get_activations.
    split : int
        Number of chunks per axis (forwarded to split_list).
    progress : {'on', 'short', other}
        'on' prints timing for every sequence, 'short' for every 100th;
        any other value prints no per-sequence progress.
    save : bool
        Write the averaged array to `savepath`.
    layer : str
        Layer whose activations are used as scores. The reshape below needs
        exactly one score per mutant.
    normalize : False or 'logodds'
        'logodds' converts each sequence's scores to
        log(score - min + 1e-7) - log(WT - min + 1e-7).

    Returns
    -------
    ndarray of shape (idxlen, idxlen, dims, dims)
    """

    num_summary, seqlen, _, dims = Xdict.shape

    starttime = time.time()

    idxlen = len(ungapped_index)

    #one full score grid per sequence; averaged over axis 0 at the end
    sum_mut2_scores = np.zeros((num_summary, idxlen, idxlen, dims, dims))

    #the chunking depends only on ungapped_index and split, so do it once
    idxs_r, idxs_c = split_list(ungapped_index, numsplit=split)

    for ii in range(num_summary):
        if progress == 'on':
            print (ii)

        epoch_starttime = time.time()

        #extract sequence
        X = np.expand_dims(Xdict[ii], axis=0)
        #Get WT score
        WT = {'inputs': X, 'targets': np.ones((X.shape[0], 1))}
        WT_score = nntrainer.get_activations(sess, WT, layer=layer)[0]

        #raw scores for this sequence, filled in block by block
        temp_scores = np.zeros((idxlen, idxlen, dims, dims))

        start_r = 0
        for r in idxs_r:
            start_c = 0
            for c in idxs_c:
                X_mutsecorder = double_mutate_ungapped_split(X, r, c)
                #reshape the 6D tensor into a 4D tensor that the model can test
                X_mutsecorder_reshape = np.reshape(X_mutsecorder, (len(r)*len(c)*dims*dims, seqlen, 1, dims))
                mutations = {'inputs': X_mutsecorder_reshape, 'targets': np.ones((X_mutsecorder_reshape.shape[0], 1))}
                #Get output activations for the mutations
                mut2_scores = nntrainer.get_activations(sess, mutations, layer=layer)

                #store the raw block; normalization happens after all blocks are in
                temp_scores[start_r:start_r+len(r), start_c:start_c+len(c), :, :] = mut2_scores.reshape(len(r),len(c),dims,dims)
                start_c = start_c+len(c)
            start_r = start_r+len(r)

        #logodds regime: the minimum must be taken over the whole sequence's
        #mutants. Taking it per block (as before) made the result depend on `split`.
        if normalize == 'logodds' and temp_scores.size:
            minscore = np.min(temp_scores)
            temp_scores = np.log(temp_scores - minscore + 1e-7) - np.log(WT_score-minscore+1e-7)

        sum_mut2_scores[ii] = temp_scores

        epoch_endtime = time.time()

        if progress == 'on':

            print ('Epoch duration =' + mf.sectotime(epoch_endtime -epoch_starttime))
            print ('Cumulative duration =' + mf.sectotime(epoch_endtime - starttime))
            print ()

        if progress == 'short':
            if ii%100 == 0:
                print (ii)
                #the single-sequence duration is scaled by 100 here, i.e. an
                #extrapolation to 100 sequences rather than a measured time
                print ('Epoch duration =' + mf.sectotime((epoch_endtime -epoch_starttime)*100))
                print ('Cumulative duration =' + mf.sectotime(epoch_endtime - starttime))
                print ()

    print ('----------------Summing complete----------------')

    #nanmean so that NaNs produced by the log of a non-positive value are ignored
    mean_mut2_scores = np.nanmean(sum_mut2_scores, axis=0)

    # Save the averaged array for future use
    if save == True:
        np.save(savepath, mean_mut2_scores)
        print ('Saving scores to ' + savepath)


    return (mean_mut2_scores)

In [28]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, sys, h5py
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import tensorflow as tf
import scipy

import sys
sys.path.append('../../..')
import mutagenesisfunctions as mf
import helper
from deepomics import neuralnetwork as nn
from deepomics import utils, fit, visualize, saliency

from Bio import AlignIO
import time as time
import pandas as pd
#---------------------------------------------------------------------------------------------------------------------------------
'''DEFINE ACTIONS'''
# Each pipeline stage is switched on by its command-line flag.
# SOMCALC is always on: it defaults to True, so '--somcalc' changes nothing.
TRAIN = '--train' in sys.argv
TEST = '--test' in sys.argv
WRITE = '--write' in sys.argv
FOM = '--fom' in sys.argv
SOMCALC = True
SOMVIS = '--somvis' in sys.argv

#---------------------------------------------------------------------------------------------------------------------------------
'''DEFINE LOOP'''

# experiment name; used as the sub-folder under the results directory
exp = 'trnasec'

# folder that receives the saved figures
img_folder = 'Images_mlp'
#---------------------------------------------------------------------------------------------------------------------------------

'''OPEN DATA'''

starttime = time.time()

#Open data from h5py
filename = 'trnasec_full.hdf5'
with h5py.File(filename, 'r') as dataset:
    X_data = np.array(dataset['X_data'])
    Y_data = np.array(dataset['Y_data'])

numdata, seqlen, _, dims = X_data.shape
dims = dims-1

#remove gaps from sequences: keep only the first `dims` channels
#(presumably the dropped last channel is the gap symbol -- confirm against the data writer)
ungapped = True
if ungapped:
    X_data = X_data[:, :, :, :dims]

# get validation and test set from training set
train_frac = 0.8
valid_frac = 0.1
test_frac = 1-train_frac-valid_frac
N = numdata

# Use a dedicated, seeded generator for the split. The notebook restores
# weights from a checkpoint, so an unseeded split would hand a different
# "test" set to the model on every kernel run (possibly containing examples
# it was trained on). A private RandomState leaves the global numpy RNG alone.
split_seed = 247
split_rng = np.random.RandomState(split_seed)

# The two halves of the data are shuffled and split separately.
# NOTE(review): the names imply rows [0, N//2) are positives and the rest
# negatives -- confirm against how the HDF5 file was written.
posidx = split_rng.permutation(np.arange(N//2))
negidx = split_rng.permutation(np.arange(N//2, N))
split_1 = int((N//2)*(1-valid_frac-test_frac))
split_2 = int((N//2)*(1-test_frac))
#shuffle = np.random.permutation(N)

trainidx = split_rng.permutation(np.concatenate([posidx[:split_1], negidx[:split_1]]))
valididx = split_rng.permutation(np.concatenate([posidx[split_1:split_2], negidx[split_1:split_2]]))
testidx = split_rng.permutation(np.concatenate([posidx[split_2:], negidx[split_2:]]))

#set up dictionaries
train = {'inputs': X_data[trainidx],
         'targets': Y_data[trainidx]}
valid = {'inputs': X_data[valididx],
         'targets': Y_data[valididx]}
test = {'inputs': X_data[testidx],
         'targets': Y_data[testidx]}

print ('Data extraction and dict construction completed in: ' + mf.sectotime(time.time() - starttime))

# Alignment file from which the consensus annotations are read.
simalign_file = 'trnasec_full.sto'
#Get the full secondary structure (SS) and sequence (SQ) consensus from the alignment file
SS = mf.getSSconsensus(simalign_file)
SQ = mf.getSQconsensus(simalign_file)

#Get the ungapped consensus and the indices of ungapped nucleotides.
#Only the second and third return values are kept; ugidx is the list of
#positions later passed to the second-order mutagenesis routine.
_, ugSS, ugidx = mf.rm_consensus_gaps(X_data, SS)
_, ugSQ, _ = mf.rm_consensus_gaps(X_data, SQ)


#Get the sequence and indices of the conserved base pairs
bpchars = ['(',')','<','>','{','}']
# NOTE(review): sig_bpchars is not used anywhere in this cell; sigbasepair is called with bpchars.
sig_bpchars = ['<','>']
bpidx, bpSS, nonbpidx = mf.sigbasepair(SS, bpchars)
numbp = len(bpidx)
# number of ungapped positions; the visualisation stage reshapes score arrays to (numug, numug, ...)
numug = len(ugidx)

#Get the bpug information (computed from ugidx, bpidx and SQ)
bpugSQ, bpugidx = mf.bpug(ugidx, bpidx, SQ)
#---------------------------------------------------------------------------------------------------------------------------------


'''SAVE PATHS AND PARAMETERS'''
# root folder for results; joined with `exp` further down to locate the model parameters
params_results = '../../results'

# architecture tag and trial name; together they form the model save name
# ('mlp_trnasec_full'), which is also used as the prefix of figure file names
modelarch = 'mlp'
trial = 'trnasec_full'
modelsavename = '%s_%s'%(modelarch, trial)



'''BUILD NEURAL NETWORK'''

def cnn_model(input_shape, output_shape):
    """Return the layer specifications and optimizer settings for the network.

    Despite the name, the layers listed here are an input layer followed by
    two dense layers (no convolutions).

    Parameters
    ----------
    input_shape : sequence
        Stored unchanged as the input layer's 'input_shape'.
    output_shape : sequence
        output_shape[1] gives the number of units in the final dense layer.

    Returns
    -------
    (model_layers, optimization)
        model_layers is a list of three layer dicts; optimization is a dict
        of optimizer settings.
    """
    input_layer = dict(layer='input', input_shape=input_shape)

    # hidden dense layer: batch norm, ReLU and 50% dropout
    hidden_layer = dict(layer='dense',
                        num_units=44,
                        norm='batch',
                        activation='relu',
                        dropout=0.5)

    # sigmoid read-out sized from the targets
    output_layer = dict(layer='dense',
                        num_units=output_shape[1],
                        activation='sigmoid')

    # optimization parameters
    optimization = dict(objective="binary",
                        optimizer="adam",
                        learning_rate=0.0003,
                        l2=1e-5)

    return [input_layer, hidden_layer, output_layer], optimization

# Start from an empty default graph so re-running this cell does not stack a
# second copy of the network on top of the first.
tf.reset_default_graph()

# get shapes of inputs and targets
# (the leading batch dimension is replaced by None so any batch size is accepted)
input_shape = list(train['inputs'].shape)
input_shape[0] = None
output_shape = train['targets'].shape

# load model parameters (layer specs and optimizer settings)
model_layers, optimization = cnn_model(input_shape, output_shape)

# build neural network class
nnmodel = nn.NeuralNet(seed=247)
nnmodel.build_layers(model_layers, optimization)

# compile neural trainer
# parameters live under <params_results>/<exp>/<modelsavename>
save_path = os.path.join(params_results, exp)
param_path = os.path.join(save_path, modelsavename)
nntrainer = nn.NeuralTrainer(nnmodel, save='best', file_path=param_path)



#---------------------------------------------------------------------------------------------------------------------------------

'''TRAIN '''
# Runs only when the '--train' flag was given.
if TRAIN:
  # initialize session
  sess = utils.initialize_session()

  #Train the model on the training set, validating on the validation set

  data = {'train': train, 'valid': valid}
  # patience equals num_epochs here
  # NOTE(review): if patience is an early-stopping window, it can never trigger
  # before the last epoch with these values -- confirm against fit.train_minibatch.
  fit.train_minibatch(sess, nntrainer, data,
                    batch_size=100,
                    num_epochs=100,
                    patience=100,
                    verbose=2,
                    shuffle=True,
                    save_all=False)


  # the training session is closed; the following stage opens a fresh one
  sess.close()

  #---------------------------------------------------------------------------------------------------------------------------------
'''TEST'''
# A session is opened unconditionally: the activation sort below and the
# mutagenesis stages need it even when the TEST flag is off.
sess = utils.initialize_session()
if TEST:

  # set best parameters
  nntrainer.set_best_parameters(sess)

  # test model
  loss, mean_vals, std_vals = nntrainer.test_model(sess, test, name='test')
  if WRITE:
    # one CSV row per run: experiment, architecture, trial, loss and the first three mean metrics
    metricsline = '%s,%s,%s,%s,%s,%s,%s'%(exp, modelarch, trial, loss, mean_vals[0], mean_vals[1], mean_vals[2])
    # context manager so the file is closed even if the write raises
    with open('test_metrics.csv', 'a') as fd:
      fd.write(metricsline+'\n')
'''SORT ACTIVATIONS'''
# Rank the test sequences by the first output column, highest activation first.
nntrainer.set_best_parameters(sess)
predictionsoutput = nntrainer.get_activations(sess, test, layer='output')
plot_index = np.argsort(predictionsoutput[:,0])[::-1]

#---------------------------------------------------------------------------------------------------------------------------------
'''FIRST ORDER MUTAGENESIS'''
# Runs only when the '--fom' flag was given: draws one first-order
# mutagenesis panel per selected test sequence and saves the figure.
if FOM:
  plots = 3
  num_plots = range(plots)
  fig = plt.figure(figsize=(15,plots*2+1))
  for ii in num_plots:

      # Sequences are taken from rank 10000 onwards in the sorted activations.
      # NOTE(review): this raises IndexError when the test set has fewer than
      # 10000 + plots sequences -- confirm the offset is intended.
      X = np.expand_dims(test['inputs'][plot_index[10000+ii]], axis=0)

      ax = fig.add_subplot(plots, 1, ii+1)
      mf.fom_saliency_mul(X, layer='dense_1_bias', alphabet='rna', nntrainer=nntrainer, sess=sess, ax =ax)
      # the file name does not depend on ii; it is simply reassigned on every pass
      fom_file = modelsavename + 'FoM' + '.png'
  fom_file = os.path.join(img_folder, fom_file)
  plt.savefig(fom_file)

  plt.close()
#---------------------------------------------------------------------------------------------------------------------------------
'''SECOND ORDER MUTAGENESIS'''

'''Som calc'''
# Always runs in this notebook because SOMCALC defaults to True.
if SOMCALC:
  # number of top-ranked test sequences to average over
  num_summary = 2 #np.min([500,len(test['inputs'])//2])
  print (num_summary)

  # The last field formats num_summary/1000 with '%.0f', so for num_summary = 2
  # the file name ends in 'so0k.npy' (see the "Saving scores to" line in the output).
  # There is also no separator between modelarch and trial in the name.
  arrayspath = 'Arrays/%s_%s%s_so%.0fk.npy'%(exp, modelarch, trial, num_summary/1000)
  # highest-scoring test sequences according to plot_index
  Xdict = test['inputs'][plot_index[:num_summary]]

  # scores are read from layer 'dense_1_bias'; the averaged array is also saved to arrayspath
  mean_mut2 = som_average_ungapped_split(Xdict, ugidx, arrayspath, nntrainer, sess, split=4, progress='short',
                                             save=True, layer='dense_1_bias')

# Runs only when the '--somvis' flag was given: loads the averaged
# second-order scores from disk and plots a position-by-position heatmap.
if SOMVIS:
  #Load the saved data
  # num_summary and the path pattern must match the values used when the array was saved
  num_summary = 2 #np.min([500,len(test['inputs'])//2])
  arrayspath = 'Arrays/%s_%s%s_so%.0fk.npy'%(exp, modelarch, trial, num_summary/1000)
  mean_mut2 = np.load(arrayspath)

  #Reshape into a holistic tensor organizing the mutations into 4*4
  # (the 4 is hard-coded here, while `dims` is used in the later reshape)
  meanhol_mut2 = mean_mut2.reshape(numug,numug,4,4)

  #Normalize
  # norm_meanhol_mut2 is only defined inside this branch, and the code below
  # depends on it, so normalize must stay True
  normalize = True
  if normalize:
      norm_meanhol_mut2 = mf.normalize_mut_hol(meanhol_mut2, nntrainer, sess, normfactor=1)

  #Let's try something weird
  # Build a 4x4 mask that is 1 on the anti-diagonal and 0 elsewhere: zip pairs
  # each i with the same j, so the entries set are [i, -(i+1)].
  # NOTE(review): presumably these are the complementary nucleotide pairs for
  # the channel order used in X_data -- confirm the channel order.
  bpfilter = np.ones((4,4))*0.
  for i,j in zip(range(4), range(4)):
      bpfilter[i, -(j+1)] = +1.

  # all-ones mask (not used below)
  nofilter = np.ones((4,4))

  # Mask the scores, sum the 4x4 block per position pair, then centre on the
  # mean and scale by the maximum.
  C = (norm_meanhol_mut2*bpfilter)
  C = np.sum((C).reshape(numug,numug,dims*dims), axis=2)
  C = C - np.mean(C)
  C = C/np.max(C)

  plt.figure(figsize=(8,6))
  sb.heatmap(C,vmin=None, cmap='Blues', linewidth=0.0)
  plt.title('Base Pair scores: %s %s %s'%(exp, modelarch, trial))

  # NOTE(review): the file name says 'train', but the calculation stage in this
  # notebook draws its sequences from test['inputs'].
  som_file = modelsavename + 'SoM_bpfilter_train' + '.png'
  som_file = os.path.join(img_folder, som_file)
  plt.savefig(som_file)
  plt.close()


Data extraction and dict construction completed in: 0.05s
loading model from:  ../../results/trnasec/mlp_trnasec_full_best.ckpt
INFO:tensorflow:Restoring parameters from ../../results/trnasec/mlp_trnasec_full_best.ckpt
2
22 22 4 4
22 22 4 4
22 22 4 4
22 25 4 4
22 22 4 4
22 22 4 4
22 22 4 4
22 25 4 4
22 22 4 4
22 22 4 4
22 22 4 4
22 25 4 4
25 22 4 4
25 22 4 4
25 22 4 4
25 25 4 4
0
Epoch duration =1min 38.269999999999996s
Cumulative duration =0.98s

22 22 4 4
22 22 4 4
22 22 4 4
22 25 4 4
22 22 4 4
22 22 4 4
22 22 4 4
22 25 4 4
22 22 4 4
22 22 4 4
22 22 4 4
22 25 4 4
25 22 4 4
25 22 4 4
25 22 4 4
25 25 4 4
----------------Summing complete----------------
Saving scores to Arrays/trnasec_mlptrnasec_full_so0k.npy


In [12]:
mean_mut2

array([[[[15.50099802, 15.68667603, 15.21925163, 16.57904863],
         [15.50099802, 15.68667603, 15.21925163, 16.57904863],
         [15.50099802, 15.68667603, 15.21925163, 16.57904863],
         [15.50099802, 15.68667603, 15.21925163, 16.57904863]],

        [[ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ]],

        [[ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ]],

        ...,

        [[ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,

In [23]:
len(ugidx)//4

22