### Preprocessing

In [3]:
import json

import numpy as np
import pandas as pd

df = pd.read_parquet('Metal_all_20180116.snappy.parquet')

# Keep only rows whose ligand is 'ZN' and whose interactingChains value is 1,
# then extract the sequence and fingerprint columns as NumPy arrays.
zn_single_chain = (df['ligandId'] == 'ZN') & (df['interactingChains'] == 1)
seqs = np.array(df.loc[zn_single_chain, 'sequence'])
target = np.array(df.loc[zn_single_chain, 'fingerprint'])

# The frame and the mask are not needed once the two arrays exist.
del df, zn_single_chain

# Drop every sequence containing 'X' or 'U', together with the target entry
# at the same position, so both arrays stay aligned.
rows_to_delete = [row for row, seq in enumerate(seqs) if 'X' in seq or 'U' in seq]
seqs = np.delete(seqs, rows_to_delete, 0)
target = np.delete(target, rows_to_delete)

# Note: the number printed is the count of remaining sequences.
print("Sequence length is " + str(len(seqs)))

Sequence length is 22784


In [4]:
# Filepaths
# Directory prefixes, relative to the working directory. Each one ends with "/"
# because file names are appended by plain string concatenation (as done with
# dict_path when the JSON dictionaries are loaded).
# Directory the JSON translation dictionaries are read from.
dict_path = "./dictionaries/"
# The next three are not referenced in the visible cells; presumably output
# locations for saved models, training histories and figures — TODO confirm.
model_path = "./models/"
hist_path = "./histories/"
fig_path = "./figs/"

In [5]:
# Load the two JSON translation dictionaries from dict_path.
# `json` must be imported before this cell runs (see the import cell at the top).
# The names are bound only if loading succeeds, so a missing or unreadable file
# cannot leave an empty dictionary behind for later cells to use silently.
with open(dict_path + "seq_n_gram_to_vec_dict_w_UX", 'r') as fp:
    seqs_dict = json.load(fp)

with open(dict_path + "seqs_dict_onehot", 'r') as fp:
    seqs_dict_onehot = json.load(fp)

#### Data Generator

In [6]:
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

# Loader
def find_notebook(fullname, path=None):
    """find a notebook, given its fully qualified name and an optional path

    This turns "foo.bar" into "foo/bar.ipynb"
    and tries turning "Foo_Bar" into "Foo Bar" if Foo_Bar
    does not exist.

    Parameters
    ----------
    fullname : str
        Dotted module name; only the part after the last dot is used as the
        notebook file name.
    path : list of str, optional
        Directories to search, in order. When empty or None, the current
        working directory is searched.

    Returns
    -------
    str or None
        Path of the first matching notebook, or None when nothing matches.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']
    # Exact file name first, then the variant that lets
    # "import Notebook_Name" find "Notebook Name.ipynb".
    # Only the file name is rewritten: underscores in the directory part
    # must be left alone, otherwise "my_pkg/Foo Bar.ipynb" is never found.
    candidates = (name + ".ipynb", name.replace("_", " ") + ".ipynb")
    for d in path:
        for filename in candidates:
            nb_path = os.path.join(d, filename)
            if os.path.isfile(nb_path):
                return nb_path
    return None
        
class NotebookLoader(object):
    """Module Loader for Jupyter Notebooks

    Runs every code cell of a notebook inside a fresh module object so that
    the notebook's top-level names become attributes of the imported module.
    """
    def __init__(self, path=None):
        # The shell's input transformer is used to turn cell source into
        # executable Python before it is run.
        self.shell = InteractiveShell.instance()
        # Search directories handed to find_notebook().
        self.path = path

    def load_module(self, fullname):
        """import a notebook as a module

        Parameters
        ----------
        fullname : str
            Fully qualified module name being imported.

        Returns
        -------
        module
            The new module, also registered in sys.modules under ``fullname``.

        Raises
        ------
        ImportError
            If no notebook file can be located for ``fullname``.
        Exception
            Anything raised by a notebook cell propagates to the caller; the
            sys.modules entry is rolled back first.
        """
        path = find_notebook(fullname, self.path)
        if path is None:
            # Fail with an import error rather than letting io.open(None) raise.
            raise ImportError("No notebook found for module %r" % fullname)

        print ("importing Jupyter notebook from %s" % path)

        # load the notebook object (second argument: nbformat version to read as)
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = read(f, 4)

        # create the module and add it to sys.modules before running any cell
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        # Remember what was registered before so a failed load can be undone.
        previous = sys.modules.get(fullname)
        sys.modules[fullname] = mod

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(cell.source)
                    # run the code in the module
                    exec(code, mod.__dict__)
        except BaseException:
            # Do not leave a half-initialised module importable: restore the
            # previous entry, or remove the one added above.
            if previous is None:
                sys.modules.pop(fullname, None)
            else:
                sys.modules[fullname] = previous
            raise
        finally:
            self.shell.user_ns = save_user_ns
        return mod

# Finder
class NotebookFinder(object):
    """Module finder that locates Jupyter Notebooks

    Keeps one NotebookLoader per search path so that repeated imports from the
    same location share a loader.
    """
    def __init__(self):
        # Maps a search-path key to the NotebookLoader created for it.
        self.loaders = {}

    def find_module(self, fullname, path=None):
        """Return a loader for ``fullname`` if a matching notebook exists.

        Returns None when find_notebook() locates nothing, so that other
        finders on sys.meta_path get their turn.
        """
        if not find_notebook(fullname, path):
            return None

        # lists aren't hashable, so a non-empty path is joined into one string
        cache_key = os.path.sep.join(path) if path else path

        loader = self.loaders.get(cache_key)
        if loader is None:
            loader = self.loaders[cache_key] = NotebookLoader(path)
        return loader
    
# Status message only: nothing is registered by this cell. The finder is
# appended to sys.meta_path in the following cell.
print ('Hooking notebook finder...')

Hooking notebook finder...


In [7]:
# Register the notebook importer. Any NotebookFinder left over from an earlier
# run of this cell is dropped first, so re-running the cell (or redefining the
# class) never stacks duplicate finders on sys.meta_path.
sys.meta_path[:] = [finder for finder in sys.meta_path
                    if type(finder).__name__ != 'NotebookFinder']
sys.meta_path.append(NotebookFinder())
print ('Ready!')

Ready!


In [8]:
import DataGenerator

importing Jupyter notebook from DataGenerator.ipynb


Using TensorFlow backend.


In [9]:
# Keyword arguments later unpacked into DataGenerator.OneHotGenerator.
kwargs = dict(
    sequences=seqs,
    labels=target,
    translator=seqs_dict_onehot,
    batch_size=5,
    input_shape=(706, 20),
    label_shape=(706, 1),
    shuffle=False,
)

In [10]:
# Show the first four target entries.
print(target[:4])

[array([ 50, 101, 368, 369]) array([326, 330, 411]) array([101, 128, 131])
 array([ 89, 118, 177])]


In [11]:
# Instantiate the generator class from the imported DataGenerator notebook,
# unpacking the settings collected in `kwargs` as keyword arguments.
D = DataGenerator.OneHotGenerator(**kwargs)

In [12]:
# Fetch batch 0 from the generator and unpack it into inputs and labels.
X, y = D[0]

X_shape (5, 706, 20)
y_shape (5, 706, 1)


In [22]:
# Visual check, one value per line: the first row of the first sample in X,
# the first character of the first sequence, and the translator entry for 'T'.
print(X[0, 0], seqs[0][0], seqs_dict_onehot['T'], sep='\n')

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.
  0.  0.]
T
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


### Input

In [None]:
class myInput():
    """Wraps a data generator and exposes the parameters it reports.

    Parameters
    ----------
    generator : object
        Any object providing a ``get_params()`` method. Its result is read
        once, at construction time.
    """
    # The constructor must be spelled __init__ (double underscores); with the
    # single-underscore spelling Python never calls it and myInput(generator)
    # fails.
    def __init__(self, generator):
        self.generator = generator
        self.params = generator.get_params()

    def get_params(self):
        """Return the parameters captured from the generator at construction."""
        return self.params

### Model

In [None]:
# Settings handed to the model wrapper.
# NOTE(review): `Adam` and `myCallback` are not defined in the visible cells;
# they must be imported/defined before this cell runs.
model_params = {'num_of_inputs' : 1,
                'output_dim' : 706,
                'optimizer' : Adam(),
                'callback' : myCallback,  # the comma here was missing (SyntaxError)
                'fold_size' : 500,
                'epoch_size' : 10,
                'loss' : 'binary_crossentropy'}

In [None]:
class myModel():
    """Skeleton wrapper tying an input description to a user-supplied model.

    Parameters
    ----------
    myInput : object
        Input wrapper providing ``get_params()``; its result is stored as
        ``self.config``.
    userModel : object
        Stored unchanged as ``self.userModel``.
    model_params : dict, optional
        Stored as ``self.model_param``. A new empty dict is used when omitted,
        so instances never share one default dictionary.
    """
    # Spelled __init__ so that Python actually runs it on construction.
    def __init__(self, myInput, userModel, model_params=None):
        # The input wrapper in this notebook exposes get_params(), and the
        # method has to be called; storing an uncalled, non-existent
        # `get_config` attribute would fail.
        self.config = myInput.get_params()
        self.model_param = {} if model_params is None else model_params
        self.userModel = userModel

    def create_model(self):
        """Build the model. Not implemented yet.

        Raises
        ------
        NotImplementedError
            Always, until model construction is written or a subclass
            overrides this method.
        """
        raise NotImplementedError("create_model() has not been implemented yet")

    def train(self):
        """Create the model; currently this is the only step performed."""
        self.create_model()

### Evaluation