In [2]:
import tensorflow as tf

import numpy as np
import requests as rq
import io, h5py, os

import matplotlib.pyplot as plt

In [3]:
data = rq.get('https://www.dropbox.com/s/c3umbo5y13sqcfp/synthetic_dataset.h5?raw=true')
data.raise_for_status()

with h5py.File(io.BytesIO(data.content), 'r') as dataset:
    x_train = np.array(dataset['X_train']).astype(np.float32).transpose([0, 2, 1])
    y_train = np.array(dataset['Y_train']).astype(np.float32)
    x_valid = np.array(dataset['X_valid']).astype(np.float32).transpose([0, 2, 1])
    y_valid = np.array(dataset['Y_valid']).astype(np.int32)
    x_test = np.array(dataset['X_test']).astype(np.float32).transpose([0, 2, 1])
    y_test = np.array(dataset['Y_test']).astype(np.int32)

In [4]:
directories = [i[0] for i in os.walk('models')][1:]
models = [i[2] for i in os.walk('models')][1:]

In [5]:
def get_ppms(model):
    layer = 3        # activation layer for 1st convolutional layer
    threshold = 0.5  # threshold for significant activations
    window = 20      # window size of alignment 

    # get feature maps of 1st convolutional layer after activation
    intermediate = tf.keras.Model(inputs=model.inputs, outputs=model.layers[layer].output)
    fmap = intermediate.predict(x_test)
    num_filters = fmap.shape[-1]

    # set the left and right window sizes
    window_left = int(window/2)
    window_right = window - window_left

    N, L, A = x_test.shape

    ppms = []
    for filter_index in range(num_filters):

        # find regions above threshold
        coords = np.where(fmap[:,:,filter_index] > np.max(fmap[:,:,filter_index])*threshold)
        x, y = coords

        # sort score
        index = np.argsort(fmap[x,y,filter_index])[::-1]
        data_index = x[index].astype(int)
        pos_index = y[index].astype(int)

        # make a sequence alignment centered about each activation (above threshold)
        seq_align = []
        for i in range(len(pos_index)):

            # determine position of window about each filter activation
            start_window = pos_index[i] - window_left
            end_window = pos_index[i] + window_right

            # check to make sure positions are valid
            if (start_window > 0) & (end_window < L):
                seq = x_test[data_index[i], start_window:end_window, :]
                seq_align.append(seq)

        # calculate position probability matrix

        ppm = np.mean(seq_align, axis=0)

        # splice positions with uniform probability
        if len(np.where(ppm > 0.4)) > 1:
            x, y = np.where(ppm > 0.4)
            if x.shape[0] > 0:
                low = max(x[0]-1, 0)
                high = min(x[-1]+1, ppm.shape[0])
                ppm = ppm[low:high]

        ppms.append(ppm)
    ppms = np.array(ppms)
    
    return ppms

In [6]:
for i in range(len(directories)):
    print(i / len(directories) * 100)
    for j in range(len(models[i])):
        model = tf.keras.models.load_model(h5py.File(os.path.join(directories[i], models[i][j]), 'r'))
        ppms = get_ppms(model)
        for k in range(ppms.shape[0]):
            
            model_path = os.path.join('motifs', directories[i][directories[i].index("\\")+1:], models[i][j][:models[i][j].index('.h5')])
            if not os.path.exists(model_path):
                os.makedirs(model_path)
            
            model_path = os.path.join(model_path, f'filter-{k+1}.txt')
            open(model_path, 'w+').close()
            
            motif = ppms[i]
            out = []

            out.append("MEME version 4\n\n")

            out.append("ALPHABET= ACGT\n\n")

            out.append("strands: + -\n\n")

            out.append("Background letter frequencies\n")
            out.append("A 0.25 C 0.25 G 0.25 T 0.25\n\n")

            out.append(f"MOTIF filter-{k+1}\n")
            out.append(f"letter-probability matrix: alength= 4 w= {ppms[k].shape[0]}\n")

            for l in motif:
                out.append("%.4f %.4f %.4f %.4f\n" % (l[0], l[1], l[2], l[3]))

            with open(model_path, 'w') as file:
                file.writelines(out)

0.0
(32, 20, 4)


  ppms = np.array(ppms)


(32,)
8.333333333333332
(32,)


KeyboardInterrupt: 