## Preprocessing

In [1]:
# Run identifiers; both are forwarded to the Trainer inside 'post_train_args'.
user = 'PART1'
model_name = 'TYPE'

In [39]:
import sys
import json

# Progress message first; flush so it is visible before the rest of the cell runs.
print ("Initializing global variables...", end=' ')
sys.stdout.flush()

# Output locations. Figures, saved models and training history all share the
# same log directory; the results file lives inside it as well.
fig_path = './logs/'
model_path = fig_path
hist_path = fig_path
output_file = './logs/results.txt'

# Directory holding the encoding dictionaries.
dict_path = './dictionaries/'

print ("Done")
print ("  Filepath set to ./logs/")

##################################################

print ("Importing modules...", end=' ')
# Project-local module; later cells use its FOFEGenerator and Trainer.
import modules_type
print ("Done")

##################################################

print ("Reading data from disk...", end=' ')
sys.stdout.flush()

import numpy as np
import pandas as pd

# Load the table and pull the columns used downstream into standalone NumPy
# arrays. np.array() copies, so re-encoding `target` later does not touch df.
df = pd.read_parquet('./Metal_all_20180601.parquet')
seqs = np.array(df['sequence'])
target = np.array(df['ligandId'])
cluster_numbers = np.array(df['clusterNumber90'])

# Show the distinct ligand identifiers present in the data.
print (set(target))

# Map each ligand identifier to an integer class index.
# The identifiers are sorted before numbering: iterating a bare set of strings
# follows hash order, which changes between interpreter sessions (string hash
# randomization), so an unsorted mapping would not line up with weights saved
# by an earlier run of this notebook.
label_dict = {ligand: idx for idx, ligand in enumerate(sorted(set(df.ligandId)))}

# Replace every entry of `target` in place with a one-element list holding its
# class index.
for i in range(target.shape[0]):
    target[i] = [label_dict[target[i]]]

print ("Done")

##################################################

print ("Loading dictionaries...", end=' ')
sys.stdout.flush()

# FOFE vocabulary, parsed from the JSON file "vocab_dict_fofe" under dict_path.
# The name is bound to an empty dict before the file is opened.
vocab_dic_fofe = {}
with open(dict_path + "vocab_dict_fofe", 'r') as vocab_handle:
    vocab_dic_fofe = json.loads(vocab_handle.read())

print ("Done")

##################################################

print ("Performing cross validation split...", end=' ')
# Positional hold-out split: the first `ratio` fraction of the rows goes to
# training and the remainder to validation. Rows are not shuffled here.
ratio = 0.9
split = int(len(seqs) * ratio)
train_seqs = seqs[:split]
val_seqs = seqs[split:]
train_label = target[:split]
val_label = target[split:]
print ("Done")
print ("  Ratio :", ratio)
print ("  Train_range :", 0, "-", split - 1)
print ("  Val_range :", split, "-", len(seqs) - 1)

Initializing global variables... Done
  Filepath set to ./logs/
Importing modules... Done
Reading data from disk... {'NI', 'CU', 'ZN', 'FE', 'CO', 'CA', 'MG', 'MN'}
Done
Loading dictionaries... Done
Performing cross validation split... Done
  Ratio : 0.9
  Train_range : 0 - 52385
  Val_range : 52386 - 58206


In [3]:
# Per-ligand count of non-null entries in every column (shows the class balance).
df.groupby('ligandId').count()

Unnamed: 0_level_0,structureChainId,fingerprint,groupNumber,sequence,interactingChains,clusterNumber30,clusterNumber40,clusterNumber50,clusterNumber70,clusterNumber90,clusterNumber95,clusterNumber100
ligandId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CA,17606,17606,17606,17606,17606,17606,17606,17606,17606,17606,17606,17606
CO,759,759,759,759,759,759,759,759,759,759,759,759
CU,2143,2143,2143,2143,2143,2143,2143,2143,2143,2143,2143,2143
FE,2989,2989,2989,2989,2989,2989,2989,2989,2989,2989,2989,2989
MG,8336,8336,8336,8336,8336,8336,8336,8336,8336,8336,8336,8336
MN,4688,4688,4688,4688,4688,4688,4688,4688,4688,4688,4688,4688
NI,1073,1073,1073,1073,1073,1073,1073,1073,1073,1073,1073,1073
ZN,20613,20613,20613,20613,20613,20613,20613,20613,20613,20613,20613,20613


## Data Generator

- <font color=blue>FOFE Encoding</font>

In [4]:
# Keyword arguments for the FOFE generators. The training and validation
# generators differ only in the sequences/labels they receive; everything in
# common_args is shared by both.
train_args = dict(sequences=train_seqs,
                  labels=train_label,
                  translator=vocab_dic_fofe)
val_args = dict(sequences=val_seqs,
                labels=val_label,
                translator=vocab_dic_fofe)
common_args = dict(batch_size=100,
                   input_shape=(800,),
                   label_shape=(13, ),
                   shuffle=True)

train_gen = modules_type.FOFEGenerator(**train_args, **common_args)
val_gen = modules_type.FOFEGenerator(**val_args, **common_args)

## Model
- <font color=blue>CNN</font>

In [5]:
# ProtVec:100, One-hot:20, blosum62:20, property:7
dimension = 800
cutoff = 13

import tensorflow as tf
import time
import matplotlib.pyplot as plt
% matplotlib inline
np.random.seed(2017) 
from keras.models import Sequential, Model
from keras.layers.convolutional import Convolution2D, MaxPooling2D, Convolution1D, MaxPooling1D, AveragePooling2D
from keras.layers import Activation, Flatten, Dense, Dropout, Reshape, Embedding, Input
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.optimizers import SGD
import numpy as np
import keras
from keras.models import Model, load_model
from keras.optimizers import Adam, SGD, RMSprop
# Visualization
from keras.utils import plot_model

input_shape = (dimension,)

# The flat input vector is reshaped to a 1 x dimension x 1 tensor and fed to
# three parallel convolutions (2 filters each) with kernel widths 3, 5 and 7.
# 'same' padding keeps the three outputs the same size so they can be
# concatenated.
input_0 = Input(shape=input_shape, dtype='float32')
input_0_reshape = Reshape((1,dimension,1), input_shape=(dimension,))(input_0)
# Keras 2 call form: kernel size given as a tuple, padding= instead of the
# Keras 1 border_mode= keyword.
conv2d_3 = Convolution2D(2, (1, 3), padding='same')(input_0_reshape)
conv2d_5 = Convolution2D(2, (1, 5), padding='same')(input_0_reshape)
conv2d_7 = Convolution2D(2, (1, 7), padding='same')(input_0_reshape)

# Merge the branches, apply ReLU, flatten, then a `cutoff`-wide hidden layer
# followed by a `cutoff`-way softmax.
x = keras.layers.concatenate([conv2d_3,conv2d_5,conv2d_7])
x = Activation('relu')(x)
x = Flatten()(x)
x = Dense(cutoff, activation='relu')(x)
output_0 = Dense(cutoff, activation='softmax')(x)

model = Model(inputs=input_0, outputs=output_0)
# end of the MODEL

# Plain SGD with momentum; no learning-rate decay and no Nesterov update.
sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

# model.summary()



In [6]:
# Everything handed to the Trainer: the model, the [train, validation]
# generators, an empty callback list, and the metadata passed on as
# post_train_args.
model_args = dict(
    model=model,
    generators=[train_gen, val_gen],
    callbacks=[],
    post_train_args=dict(
        user=user,
        model=model_name,
        result=output_file,
        fig_path=fig_path,
    ),
)

trainer = modules_type.Trainer(**model_args)

Assigning validation generator... Done
Matching input shape... Done
Matching output shape... Done
Trainer initialized.


In [None]:
# Install a blanket 'ignore' filter for Python warnings, then start the trainer
# with epoch=15.
import warnings
warnings.simplefilter('ignore')
trainer.start(epoch=15)

In [8]:
# Persist the network in two parts.
# 1) Architecture, serialized to JSON text.
model_json = model.to_json()
with open("type.json", "w") as arch_file:
    arch_file.write(model_json)
# 2) Weights, written to an HDF5 file.
model.save_weights("type.h5")
print("Saved model to disk")

Saved model to disk


In [12]:
# later...
from keras.models import model_from_json
# Rebuild the architecture from its JSON description. The context manager
# closes the file even if reading raises.
with open('type.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
# Load the saved weights into the rebuilt model.
model.load_weights("type.h5")
print("Loaded model from disk")

Loaded model from disk


In [21]:
# Rebind train_args / common_args / train_gen to cover the whole data set:
# every sequence and label, batch_size 1, shuffling switched off.
train_args = dict(sequences=seqs,
                  labels=target,
                  translator=vocab_dic_fofe)
common_args = dict(batch_size=1,
                   input_shape=(800,),
                   label_shape=(13, ),
                   shuffle=False)
train_gen = modules_type.FOFEGenerator(**train_args, **common_args)

In [22]:
# Walk the generator batch by batch, keeping the model output and the
# generator-supplied label for each batch.
metal_predictions = []
Y = []
for batch_idx in range(len(train_gen)):
    batch_x, batch_y = train_gen[batch_idx]
    metal_predictions.append(model.predict(batch_x))
    Y.append(batch_y)
    

In [23]:
# Invert label_dict so class indices map back to ligand identifiers.
inv_label_dict = {v: k for k, v in label_dict.items()}


def _decode_first_rows(batches):
    """Return the ligand identifier for the first row of every batch.

    The class index is the position of the row's largest value; np.argmax
    returns the first such position when several entries tie.
    """
    return [inv_label_dict[int(np.argmax(batch[0]))] for batch in batches]


l2 = _decode_first_rows(Y)                   # labels supplied by the generator
l1 = _decode_first_rows(metal_predictions)   # labels predicted by the model

In [24]:
# Fraction of rows where the predicted ligand (l1) equals the reference ligand
# (l2); `c` is the number of mismatches.
c = sum(1 for k in range(len(l1)) if l1[k] != l2[k])
print ((len(l1)-c) / len(l1))

0.8708746370711427


In [40]:
# Attach the decoded predictions to the frame as an object-dtype column.
df['metalPrediction'] = np.array(l1, dtype='O')

In [38]:
# Inspect the dtype of the prediction column.
df['metalPrediction'].dtype

dtype('O')

In [32]:
# Preview only the first rows rather than rendering the whole frame.
df.head(10)

Unnamed: 0,structureChainId,ligandId,fingerprint,groupNumber,sequence,interactingChains,clusterNumber30,clusterNumber40,clusterNumber50,clusterNumber70,clusterNumber90,clusterNumber95,clusterNumber100,metalPrediction
0,1A0O.A,MN,"[11, 55, 57]","[13, 57, 59]",ADKELKFLVVDDFSTMRRIVRNLLKELGFNNVEEAEDGVDALNKLQ...,1,115.0,386.0,404.0,371.0,325.0,313.0,1621.0,MN
1,1A25.A,CA,"[107, 110, 111, 113]","[248, 251, 252, 254]",GSPGISGGGGGILDSMERRGRIYIQAHIDREVLIVVVRDAKNLVPM...,2,632.0,5212.0,24455.0,28660.0,32138.0,33630.0,44596.0,CA
2,1A25.A,CA,"[45, 46, 105, 107, 113]","[186, 187, 246, 248, 254]",GSPGISGGGGGILDSMERRGRIYIQAHIDREVLIVVVRDAKNLVPM...,2,632.0,5212.0,24455.0,28660.0,32138.0,33630.0,44596.0,CA
3,1A25.A,CA,"[46, 52, 105, 106, 107]","[187, 193, 246, 247, 248]",GSPGISGGGGGILDSMERRGRIYIQAHIDREVLIVVVRDAKNLVPM...,1,632.0,5212.0,24455.0,28660.0,32138.0,33630.0,44596.0,CA
4,1AJD.B,MG,"[50, 154, 321]","[51, 155, 322]",TPEMPVLENRAAQGDITAPGGARRLTGDQTAALRDSLSDKPAKNII...,1,584.0,592.0,555.0,515.0,485.0,444.0,8044.0,CO
7,1ALN.A,ZN,"[101, 128, 131]","[102, 129, 132]",MHPRFQTAFAQLADNLQSALEPILADKYFPALLTGEQVSSLKSATG...,1,4526.0,4989.0,13948.0,15950.0,17149.0,17499.0,19782.0,ZN
8,1AS6.C,CU,"[97, 138, 147, 152]","[95, 136, 145, 150]",QGAVRKATAAEIAALPRQKVELVDPPFVHAHSQVAEGGPKVVEFTM...,1,181.0,198.0,182.0,158.0,305.0,292.0,548.0,CU
9,1AVM.B,FE,"[26, 74, 160, 164]","[27, 75, 161, 165]",AVYTLPELPYDYSALEPYISGEIMELHHDKHHKAYVDGANTALDKL...,1,131.0,129.0,145.0,5796.0,5939.0,5934.0,5230.0,FE
10,1BWV.E,MG,"[209, 211, 212]","[201, 203, 204]",MSQSIEEKSVQERTRIKNSRYESGVIPYAKMGYWNPDYQVKDTDVL...,1,92.0,124.0,131.0,2225.0,4400.0,4370.0,4635.0,ZN
11,1CG2.A,ZN,"[89, 118, 177]","[112, 141, 200]",ALAQKRDNVLFQAATDEQPAVIKTLEKLVNIETGTGDAEGIAAAGN...,1,10402.0,12008.0,13323.0,15154.0,16224.0,16540.0,18316.0,ZN


In [45]:
# NOTE(review): this writes JSON (orient='index'), not Parquet, even though the
# file name ends in ".parquet" — confirm whether the extension is intentional.
df.to_json('Metal_all_20180601_predicted.parquet', orient='index')

In [46]:
# Read the exported file back with the JSON reader (the file holds JSON despite
# its ".parquet" name).
df2 = pd.read_json('Metal_all_20180601_predicted.parquet', orient='index')

In [47]:
# Preview only the first rows of the re-loaded frame rather than rendering all of it.
df2.head(10)

Unnamed: 0,clusterNumber100,clusterNumber30,clusterNumber40,clusterNumber50,clusterNumber70,clusterNumber90,clusterNumber95,fingerprint,groupNumber,interactingChains,ligandId,metalPrediction,sequence,structureChainId
0,1621,115,386,404,371,325,313,"[11, 55, 57]","[13, 57, 59]",1,MN,MN,ADKELKFLVVDDFSTMRRIVRNLLKELGFNNVEEAEDGVDALNKLQ...,1A0O.A
1,44596,632,5212,24455,28660,32138,33630,"[107, 110, 111, 113]","[248, 251, 252, 254]",2,CA,CA,GSPGISGGGGGILDSMERRGRIYIQAHIDREVLIVVVRDAKNLVPM...,1A25.A
10,4635,92,124,131,2225,4400,4370,"[209, 211, 212]","[201, 203, 204]",1,MG,ZN,MSQSIEEKSVQERTRIKNSRYESGVIPYAKMGYWNPDYQVKDTDVL...,1BWV.E
10000,3254,2681,2872,2922,2954,2988,2944,"[233, 237, 260]","[234, 238, 261]",1,MG,MG,MSLNITGIQSDWKVEKIEFAKLTGERARSAGANGRIGVHGKSCTVD...,3OPS.A
10001,938,44,32,25,1322,1291,1260,"[83, 86, 88]","[84, 87, 89]",1,MG,MG,SNANNLVPTVIEKTAGGERAFDIYSRLLKERIVFLNGEVNDHSANL...,3P2L.C
10002,187,197,185,169,142,152,147,"[9, 11, 13, 18]","[10, 12, 14, 19]",1,CA,CA,ADTIVAVELDTYPNTDIGDPSYPHIGIDIKSVRSKKTAKWNMQNGK...,3QLQ.B
10004,28289,13708,16027,17984,20783,22764,23547,"[11, 13, 151]","[12, 14, 152]",1,MG,MG,MSLSEIKHWVFDMDGTLTIAVHDFAAIREALSIPAEDDILTHLAAL...,3R09.A
10005,37,6,4,14,21,34,33,"[51, 53, 56, 61]","[70, 72, 75, 80]",1,CA,CA,IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGI...,3RXQ.A
10006,2758,1470,1499,1489,1913,3471,3416,"[255, 258, 266, 272, 323]","[256, 259, 267, 273, 324]",1,CA,CA,MSDKDSKNTPQVPEKLGLSRRGFLGASAVTGAAVAATALGGAVMTR...,3SBR.D
10007,4429,130,122,100,3632,3685,3634,"[52, 84, 86, 88]","[158, 190, 192, 194]",1,CA,CA,NPRWEQTHLTYRIENYTPDLPRADVDHAIEKAFQLWSNVTPLTFTK...,3SHI.G


In [26]:
# First ten ligands decoded from the model's predictions.
print (l1[:10])

['MN', 'CA', 'CA', 'CA', 'CO', 'ZN', 'CU', 'FE', 'ZN', 'ZN']


In [27]:
# First ten ligands decoded from the generator's labels, for comparison.
print (l2[:10])

['MN', 'CA', 'CA', 'CA', 'MG', 'ZN', 'CU', 'FE', 'MG', 'ZN']


In [None]:
# Row clean-up. A row is marked for deletion when any of these holds:
#   * its sequence contains 'X' or 'U';
#   * its target contains one of '3CO', '3NI', 'FE2', 'CU1', 'MN3';
#   * its cluster number is NaN;
#   * its sequence was already recorded with a different target (the
#     occurrence recorded first is the one that stays).
# The index of every marked row is echoed as it is found.

duplicate_dict = {}
rows_to_delete = []
for i in range(seqs.shape[0]):
    drop = (
        'X' in seqs[i]
        or 'U' in seqs[i]
        or any(code in target[i] for code in ('3CO', '3NI', 'FE2', 'CU1', 'MN3'))
        or np.isnan(cluster_numbers[i])
    )
    if not drop:
        if seqs[i] in duplicate_dict:
            # Seen before: drop only when the target disagrees with the recorded one.
            drop = target[i] != duplicate_dict[seqs[i]]
        else:
            # First sighting of this sequence: remember its target.
            duplicate_dict[seqs[i]] = target[i]
    if drop:
        rows_to_delete.append(i)
        print (i, end=',')

# Disabled: applying the same deletion to df and writing it back to disk.
# df = df.drop(df.index[rows_to_delete])
# df.to_parquet('Metal_all_20180601.parquet')
seqs = np.delete(seqs, rows_to_delete, 0)
target = np.delete(target, rows_to_delete)
cluster_numbers = np.delete(cluster_numbers, rows_to_delete)