In [1]:
import numpy as np
np.random.seed(123)

import pandas as pd
from math import sqrt, ceil

import h5py

from sklearn.utils import shuffle
import tensorflow as tf

from tfbio.data import Featurizer, make_grid, rotate
import net_3 as net ## custom network for predicting ic50

import os.path

import matplotlib as mpl
mpl.use('agg')

import seaborn as sns
sns.set_style('white')
sns.set_context('paper')
sns.set_color_codes()
color = {'training': 'b', 'validation': 'g', 'test': 'r'}


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
input_dir = 'dataset/train_valid_test_data/exp3/'
datasets = ['training', 'validation', 'test']

batch_size = 20
conv_channels=[64, 128, 256]
dense_sizes=[1000, 500, 200]
conv_patch=5
pool_patch=2
lmbda=0.001
learning_rate=1e-5
to_keep = 10

splitted_datasets = ['training1', 'training2','validation1','validation2', 'test1','test2']
protein_list = ['andro','estro']

In [10]:
featurizer = Featurizer()

print('\n---- FEATURES ----\n')
print('atomic properties:', featurizer.FEATURE_NAMES)

columns = {name: i for i, name in enumerate(featurizer.FEATURE_NAMES)}

ids = {}
toxicity = {}
coords = {}
features = {}

splitted_ids = {}
splitted_toxicity = {}
splitted_coords = {}
splitted_features = {}

for dictionary in [ids, toxicity, coords, features]:
    for dataset_name in datasets:
        dictionary[dataset_name] = []
        
for dictionary in [splitted_ids, splitted_toxicity, splitted_coords, splitted_features]:
    for splitted_dataset_name in splitted_datasets:
        dictionary[splitted_dataset_name] = []

for dataset_name in datasets:
    dataset_path = os.path.join(input_dir, '%s_set.hdf' % dataset_name)
    with h5py.File(dataset_path, 'r') as f:
        for pdb_id in f: #pdb_id  >>> androgenSDF0
            dataset = f[pdb_id]
            for i in range(len(protein_list)):
                if protein_list[i] in pdb_id : 
                    splitted_coords[dataset_name + str(i+1)].append(dataset[:, :3])
                    splitted_features[dataset_name + str(i+1)].append(dataset[:, 3:])
                    splitted_toxicity[dataset_name + str(i+1)].append(dataset.attrs['toxicity'])
                    splitted_ids[dataset_name + str(i+1)].append(pdb_id) 
    
for k in splitted_ids.keys():
    splitted_ids[k] = np.array(splitted_ids[k])
    splitted_toxicity[k] = np.reshape(splitted_toxicity[k], (-1, 1))
        




---- FEATURES ----

atomic properties: ['B', 'C', 'N', 'O', 'P', 'S', 'Se', 'halogen', 'metal', 'hyb', 'heavyvalence', 'heterovalence', 'partialcharge', 'molcode', 'hydrophobic', 'aromatic', 'acceptor', 'donor', 'ring']


In [20]:
len(splitted_ids['test2'])

10

In [11]:
x1, x2 = [], []
protein_list = ['andro','estro']
for i in range(len(ids['training'])):
    for prot in protein_list : 
        if np.str.find(ids['training'][i],prot) == 0:

        
count

IndentationError: expected an indented block (<ipython-input-11-cd76b4f606d5>, line 8)

In [None]:
tmp = np.str.find(ids['training'][10],'androgen')
tmp

In [None]:
# ids / toxicity / coords / features
def task_split(dataset_name, protein_list) :
    x1, x2 = [], []
    for i, in range(len(ids[dataset_name])):
        for prot in protein_list: 
            if
        
    

In [None]:
charges = []
for feature_data in features['training']:
    charges.append(feature_data[..., columns['partialcharge']])

charges = np.concatenate([c.flatten() for c in charges])

m = charges.mean()
std = charges.std()
print('charges: mean=%s, sd=%s' % (m, std))
print('use sd as scaling factor')

In [None]:
def get_batch(dataset_name, indices, rotation=0):
    global coords, features, std
    x = []
    for i, idx in enumerate(indices):
        coords_idx = rotate(coords[dataset_name][idx], rotation)
        features_idx = features[dataset_name][idx]
        x.append(make_grid(coords_idx, features_idx,
                 grid_resolution=1.0,
                 max_dist=10.0))
    x = np.vstack(x)
    x[..., columns['partialcharge']] /= std
    return x

In [None]:
print('\n---- DATA ----\n')

tmp = get_batch('training', range(min(50, len(features['training']))))
print(tmp.shape)

In [None]:
assert ((tmp[:, :, :, :, columns['molcode']] == 0.0).any()
        and (tmp[:, :, :, :, columns['molcode']] == 1.0).any()
        and (tmp[:, :, :, :, columns['molcode']] == -1.0).any()).all()

idx1 = [[i[0]] for i in np.where(tmp[:, :, :, :, columns['molcode']] == 1.0)]
idx2 = [[i[0]] for i in np.where(tmp[:, :, :, :, columns['molcode']] == -1.0)]

print('\nexamples:')
for mtype, mol in [['ligand', tmp[idx1]], ['protein', tmp[idx2]]]:
    print(' ', mtype)
    for name, num in columns.items():
        print('  ', name, mol[0, num])
    print('')

In [None]:
t_baseline = ((toxicity['training'] - toxicity['training'].mean()) ** 2.0).mean()
v_baseline = ((toxicity['validation'] - toxicity['training'].mean()) ** 2.0).mean()
print('baseline mse: training=%s, validation=%s' % (t_baseline, v_baseline))

In [None]:
# NET PARAMS

ds_sizes = {dataset: len(toxicity[dataset]) for dataset in datasets}
_, isize, *_, in_chnls = get_batch('training', [0]).shape
osize = 1

for set_name, set_size in ds_sizes.items():
    print('%s %s samples' % (set_size, set_name))

num_batches = {dataset: ceil(size / batch_size)
               for dataset, size in ds_sizes.items()}

print(num_batches)
# == ... == # 


In [None]:
ds_sizes['training']

In [None]:
graph = net.make_SB_network(isize=isize, in_chnls=in_chnls, osize=osize,
                                  conv_patch=5,
                                  pool_patch=2,
                                  conv_channels=[64, 128, 256],
                                  dense_sizes=[1000, 500, 200],
                                  lmbda=0.001,
                                  learning_rate=1e-5)

#train_writer = tf.summary.FileWriter(os.path.join(logdir, 'training_set'),
#                                     graph, flush_secs=1)
#val_writer = tf.summary.FileWriter(os.path.join(logdir, 'validation_set'),
#                                   flush_secs=1)

net_summaries, training_summaries = net.make_summaries_SB(graph)

x = graph.get_tensor_by_name('input/structure:0')
y = graph.get_tensor_by_name('output/prediction:0')
t = graph.get_tensor_by_name('input/toxicity:0')
keep_prob = graph.get_tensor_by_name('fully_connected/keep_prob:0')
train = graph.get_tensor_by_name('training/train:0')  

#train1 = graph.get_tensor_by_name('training/train1:0')  
#train2 = graph.get_tensor_by_name('training/train2:0')  
# graph.get_tensor_by_name => bring tensors from a certain variable scope by using name
# this code is in the vriable_scope('training') in net_3.py 
#    >>> train = optimizer.minimize(cost, global_step=global_step,name='train')
#
mse = graph.get_tensor_by_name('training/mse:0')
#mse2 = graph.get_tensor_by_name('training/mse2:0')
feature_importance = graph.get_tensor_by_name('net_properties/feature_importance:0')
global_step = graph.get_tensor_by_name('training/global_step:0')


In [None]:
x

In [None]:
convs = '_'.join((str(i) for i in conv_channels))
fcs = '_'.join((str(i) for i in dense_sizes))

with graph.as_default():
    saver = tf.train.Saver(max_to_keep=to_keep)


    
def batches(set_name):
    """Batch generator, yields slice indices"""
    global num_batches, args, ds_sizes 
    # num_batches = how many batches in each dataset(train, valid, test)
    # ds_sizes = dataset_sizes 
    for b in range(num_batches[set_name]):
        bi = b * batch_size # one batch mul batch_size 
        bj = (b + 1) * batch_size 
        if b == num_batches[set_name] - 1:
            bj = ds_sizes[set_name] # maybe only remainer set
        yield bi, bj

err = float('inf')

train_sample = min(batch_size, len(features['training']))
val_sample = min(batch_size, len(features['validation']))



In [None]:
def task_split(dataset_name, indices, rotation=0):
    global coords, 

In [22]:
err = float('inf')
err

inf