## Preprocessing

In [1]:
user = 'Zihan'
model_name = 'CNN_Type'

In [2]:
import sys
import json

print ("Initializing global variables...", end=' ')
sys.stdout.flush()

# Filepaths
dict_path = './' + user + '/dictionaries/'
model_path = './' + user + '/models/'
hist_path = './' + user + '/histories/'
fig_path = './' + user + '/figs/'

# shared result file
output_file = './results.txt'

print ("Done")
print ("  Filepath set to " + user + "'s directory")

##################################################

print ("Importing modules...", end=' ')
import modules_type
print ("Done")

##################################################

print ("Reading data from disk...", end=' ')
sys.stdout.flush()

import numpy as np
import pandas as pd

df = pd.read_parquet('./Tian/Metal_all_20180116.snappy.parquet')

# Extract zinc-binded, single-chained protein sequences
df_zn = df.loc[df['ligandId'] == 'ZN']
df_zn_single = df_zn.loc[df_zn['interactingChains'] == 1]
seqs = np.array(df.sequence)
target = np.array(df.ligandId)
# del df,df_zn,df_zn_single

# Remove seqs containing 'U' and 'X'

duplicate_dict = {}

rows_to_delete = []
for i in range(seqs.shape[0]):
    if 'X' in seqs[i] \
    or 'U' in seqs[i] \
    or '3CO' in target[i]\
    or '3NI' in target[i] \
    or 'FE2'in target[i] \
    or 'CU1'in target[i]\
    or 'MN3' in target[i]:
        rows_to_delete.append(i)   
    elif seqs[i] not in duplicate_dict.keys():
        duplicate_dict[seqs[i]] = target[i]
    else:
        if target[i] != duplicate_dict[seqs[i]]:
            rows_to_delete.append(i)
#             print ('Deleting', i)
    
        
seqs = np.delete(seqs, rows_to_delete, 0)
target = np.delete(target, rows_to_delete)

print (set(target))

label_dict ={}
for i, j in enumerate(set(df.ligandId)):
    label_dict[j] = i

for i in range(target.shape[0]):
    target[i] = [label_dict[target[i]]]


    

print ("Done")

##################################################

print ("Loading dictionaries...", end=' ')
sys.stdout.flush()

# FOFE
vocab_dic_fofe = {}
with open(dict_path + "vocab_dict_fofe", 'r') as fp:
        vocab_dic_fofe = json.load(fp)

print ("Done")

##################################################

print ("Performing cross validation split...", end=' ')
ratio = 0.9
split = int(ratio*len(seqs))
train_seqs, val_seqs = seqs[:split], seqs[split:]
train_label, val_label = target[:split], target[split:]
print ("Done")
print ("  Ratio :", ratio)
print ("  Train_range :", 0, "-", split-1)
print ("  Val_range :", split, "-", len(seqs)-1)

Initializing global variables... Done
  Filepath set to Zihan's directory
Importing modules... 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Done
Reading data from disk... {'NI', 'CA', 'ZN', 'MG', 'MN', 'CO', 'FE', 'CU'}
Done
Loading dictionaries... Done
Performing cross validation split... Done
  Ratio : 0.9
  Train_range : 0 - 52391
  Val_range : 52392 - 58213


In [3]:
df.groupby('ligandId').count()

Unnamed: 0_level_0,structureChainId,fingerprint,groupNumber,sequence,interactingChains,clusterNumber30,clusterNumber40,clusterNumber50,clusterNumber70,clusterNumber90,clusterNumber95,clusterNumber100
ligandId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3CO,46,46,46,46,46,46,46,46,46,46,46,46
3NI,34,34,34,34,34,34,34,34,34,34,34,34
CA,19236,19236,19236,19236,19236,19219,19219,19219,19219,19219,19219,19219
CO,988,988,988,988,988,988,988,988,988,988,988,988
CU,2582,2582,2582,2582,2582,2582,2582,2582,2582,2582,2582,2582
CU1,337,337,337,337,337,337,337,337,337,337,337,337
FE,3359,3359,3359,3359,3359,3356,3356,3356,3356,3356,3356,3356
FE2,1649,1649,1649,1649,1649,1649,1649,1649,1649,1649,1649,1649
MG,17633,17633,17633,17633,17633,9079,9079,9079,9079,9079,9079,9079
MN,5771,5771,5771,5771,5771,5771,5771,5771,5771,5771,5771,5771


## Data Generator

- <font color=blue>FOFE Encoding</font>

In [4]:
train_args = {'sequences': train_seqs,
              'labels': train_label,
              'translator': vocab_dic_fofe}
val_args = {'sequences': val_seqs,
            'labels': val_label,
            'translator': vocab_dic_fofe}
common_args = {'batch_size': 100,
               'input_shape': (800,),
               'label_shape': (13, ),
               'shuffle': True}

train_gen_0 = modules_type.FOFEGenerator(**train_args, **common_args)
val_gen_0 = modules_type.FOFEGenerator(**val_args, **common_args)

## Data Generator

- <font color=blue>FOFE Encoding</font>

In [None]:
train_args = {'sequences': train_seqs,
              'labels': train_label,
              'translator': vocab_dic_fofe}
val_args = {'sequences': val_seqs,
            'labels': val_label,
            'translator': vocab_dic_fofe}
common_args = {'batch_size': 100,
               'input_shape': (800,),
               'label_shape': (706, ),
               'shuffle': True}

train_gen_1 = modules.FOFEGenerator(**train_args, **common_args)
val_gen_1 = modules.FOFEGenerator(**val_args, **common_args)

In [5]:
# train_gen = modules.GeneratorArray([train_gen_0,train_gen_1])
# val_gen = modules.GeneratorArray([val_gen_0,val_gen_1])

train_gen = train_gen_0
val_gen = val_gen_0

## Model
- <font color=blue>CNN</font>

In [None]:
# ProtVec:100, One-hot:20, blosum62:20, property:7
dimension = 800
cutoff = 706

import tensorflow as tf
import time
import matplotlib.pyplot as plt
% matplotlib inline
np.random.seed(2017) 
from keras.models import Sequential, Model
from keras.layers.convolutional import Convolution2D, MaxPooling2D, Convolution1D, MaxPooling1D, AveragePooling2D
from keras.layers import Activation, Flatten, Dense, Dropout, Reshape, Embedding, Input
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.optimizers import SGD
import numpy as np
import keras
from keras.models import Model, load_model
from keras.optimizers import Adam, SGD, RMSprop
# Visualization
from keras.utils import plot_model

input_shape = (dimension,)

input_0 = Input(shape=input_shape, dtype='float32')
input_0_reshape = Reshape((1,dimension,1), input_shape=(dimension,))(input_0)
conv2d_3 = Convolution2D(2, 1, 3, border_mode='same')(input_0_reshape)
conv2d_5 = Convolution2D(2, 1, 5, border_mode='same')(input_0_reshape)
conv2d_7 = Convolution2D(2, 1, 7, border_mode='same')(input_0_reshape)

x = keras.layers.concatenate([conv2d_3,conv2d_5,conv2d_7])
x = Activation('relu')(x)
x = Flatten()(x)
x = Dense(cutoff, activation='relu')(x)
output_0 = Dense(cutoff, activation='softmax')(x)
#output_0_reshape = Reshape((cutoff,1), input_shape=(cutoff,))(output_0)

#model = Model(inputs=input_0, outputs=output_0_reshape)
model = Model(inputs=input_0, outputs=output_0)                              
# end of the MODEL

sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [6]:
# ProtVec:100, One-hot:20, blosum62:20, property:7
dimension = 800
cutoff = 13

import tensorflow as tf
import time
import matplotlib.pyplot as plt
% matplotlib inline
np.random.seed(2017) 
from keras.models import Sequential, Model
from keras.layers.convolutional import Convolution2D, MaxPooling2D, Convolution1D, MaxPooling1D, AveragePooling2D
from keras.layers import Activation, Flatten, Dense, Dropout, Reshape, Embedding, Input
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.optimizers import SGD
import numpy as np
import keras
from keras.models import Model, load_model
from keras.optimizers import Adam, SGD, RMSprop
# Visualization
from keras.utils import plot_model

input_shape = (dimension,)

input_0 = Input(shape=input_shape, dtype='float32')
input_0_reshape = Reshape((1,dimension,1), input_shape=(dimension,))(input_0)
conv2d_3 = Convolution2D(2, 1, 3, border_mode='same')(input_0_reshape)
conv2d_5 = Convolution2D(2, 1, 5, border_mode='same')(input_0_reshape)
conv2d_7 = Convolution2D(2, 1, 7, border_mode='same')(input_0_reshape)

x = keras.layers.concatenate([conv2d_3,conv2d_5,conv2d_7])
x = Activation('relu')(x)
x = Flatten()(x)
x = Dense(cutoff, activation='relu')(x)
output_0 = Dense(cutoff, activation='softmax')(x)
#output_0_reshape = Reshape((cutoff,1), input_shape=(cutoff,))(output_0)

#model = Model(inputs=input_0, outputs=output_0_reshape)
model = Model(inputs=input_0, outputs=output_0)                              
# end of the MODEL

sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 800)          0                                            
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 1, 800, 1)    0           input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 1, 800, 2)    8           reshape_1[0][0]                  
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 1, 800, 2)    12          reshape_1[0][0]                  
__________________________________________________________________________________________________
conv2d_3 (



## Model
- <font color=blue>CNN for 2 inputs</font>

In [None]:
dimension_0 = 800
dimension_1 = 800
cutoff = 706

import tensorflow as tf
import time
import matplotlib.pyplot as plt
% matplotlib inline
np.random.seed(2017) 
from keras.models import Sequential, Model
from keras.layers.convolutional import Convolution2D, MaxPooling2D, Convolution1D, MaxPooling1D, AveragePooling2D
from keras.layers import Activation, Flatten, Dense, Dropout, Reshape, Embedding, Input
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.optimizers import SGD
import numpy as np
import keras
from keras.models import Model, load_model
from keras.optimizers import Adam, SGD, RMSprop
# Visualization
from keras.utils import plot_model

input_shape_0 = (dimension_0,)
input_shape_1 = (dimension_1,)

input_0 = Input(shape=input_shape_0, dtype='float32')
input_0_reshape = Reshape((1,dimension_0,1), input_shape=input_shape_0)(input_0)
conv2d_3_0 = Convolution2D(2, 1, 3, border_mode='same')(input_0_reshape)
conv2d_5_0 = Convolution2D(2, 1, 5, border_mode='same')(input_0_reshape)
conv2d_7_0 = Convolution2D(2, 1, 7, border_mode='same')(input_0_reshape)

input_1 = Input(shape=input_shape_1, dtype='float32')
input_1_reshape = Reshape((1,dimension_1,1), input_shape=input_shape_1)(input_1)
conv2d_3_1 = Convolution2D(2, 1, 3, border_mode='same')(input_1_reshape)
conv2d_5_1 = Convolution2D(2, 1, 5, border_mode='same')(input_1_reshape)
conv2d_7_1 = Convolution2D(2, 1, 7, border_mode='same')(input_1_reshape)

x_0 = keras.layers.concatenate([conv2d_3_0,conv2d_5_0,conv2d_7_0])
x_0 = Activation('relu')(x_0)
x_0 = Flatten()(x_0)

x_1 = keras.layers.concatenate([conv2d_3_1,conv2d_5_1,conv2d_7_1])
x_1 = Activation('relu')(x_1)
x_1 = Flatten()(x_1)

x = keras.layers.concatenate([x_0,x_1])

x = Dense(cutoff, activation='relu')(x)
output_0 = Dense(cutoff, activation='softmax')(x)
#output_0_reshape = Reshape((cutoff,1), input_shape=(cutoff,))(output_0)

#model = Model(inputs=input_0, outputs=output_0_reshape)
model = Model(inputs=[input_0, input_1], outputs=output_0)                              
# end of the MODEL

sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

- <font color=blue>Threshold: mean+2.33*std</font>

In [7]:
def threshold_func(y_in):
    factor = 2.33
    y_out = np.zeros_like(y_in)
    for i in range(y_in.shape[0]):
        th= np.mean(y_in[i]) + factor * np.std(y_in[i])
        y_out[i] = (y_in[i] > th)
    return y_out

- <font color=blue>Metric: F1 score</font>

In [8]:
cb = modules_type.F1_history(threshold_func)

model_args = {'model': model, 
              'generators': [train_gen, val_gen], 
              'callbacks': [], 
              'post_train_args': {'user': user, 
                                  'model': model_name, 
                                  'result': output_file, 
                                  'fig_path': fig_path}}

trainer = modules_type.Trainer(**model_args)

Callback initialized.
Assigning validation generator... Done
Matching input shape... Done
Matching output shape... Done
Trainer initialized.


In [9]:
import warnings; 
warnings.simplefilter('ignore')
trainer.start(epoch=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3
[End of Training]


In [10]:
t_x, t_y = train_gen[0]
l_y = model.predict(t_x)

In [11]:
print (l_y[:5], '\n',t_y[:5])

[[6.09167827e-09 9.09513442e-09 3.66510839e-08 1.10804317e-08
  1.52015307e-08 5.09445963e-04 1.36015657e-07 2.47830786e-02
  8.45331885e-03 2.28189947e-07 5.88219464e-08 3.87029053e-04
  9.65866685e-01]
 [2.41992439e-07 1.09250792e-07 3.76685598e-08 2.91896232e-08
  5.25051364e-05 1.01996772e-02 5.99686629e-08 9.69351947e-01
  1.83554087e-02 2.00836174e-03 3.03060084e-08 3.17130180e-05
  8.05222644e-09]
 [3.68735371e-08 2.74345990e-09 1.73538570e-08 4.22703970e-08
  9.95452344e-01 9.44792419e-07 3.61924735e-09 4.53086477e-03
  1.46561997e-05 1.47053981e-07 1.30827347e-14 8.00327655e-07
  1.31141221e-07]
 [4.09762855e-08 1.11462235e-07 1.28745319e-06 9.55433137e-08
  9.00949362e-06 1.73812467e-07 5.18801535e-08 5.45893215e-07
  6.39968434e-09 5.56822750e-04 1.23840789e-06 6.79595047e-08
  9.99430478e-01]
 [2.28100112e-06 1.11327324e-06 3.74978708e-06 2.16658532e-06
  1.32206731e-06 3.24144028e-02 5.60346825e-06 4.23243403e-01
  1.89379424e-01 8.87193065e-03 5.64841294e-08 3.45241129e-0

In [17]:
label_dict

{'3CO': 8,
 '3NI': 11,
 'CA': 1,
 'CO': 7,
 'CU': 0,
 'CU1': 12,
 'FE': 5,
 'FE2': 10,
 'MG': 4,
 'MN': 3,
 'MN3': 9,
 'NI': 6,
 'ZN': 2}