### Preprocessing

In [None]:
# Run configuration: `user` and `model_name` are passed along with the
# results after training; `metal` selects which rows of the dataset are used.
user = 'userName'
metal = 'ZN'

# The model name is the metal symbol followed by "_Net".
model_name = "{}_Net".format(metal)

In [None]:
import sys

# Announce the stage and flush immediately so the message is visible
# before the rest of the cell runs.
print("Initializing global variables...", end=' ', flush=True)

# shared result file
output_file = './logs/results.txt'

# Training history, saved models and figures all live in the log directory.
hist_path = './logs/'
model_path = './logs/'
fig_path = './logs/'

# Directory holding the encoding dictionaries.
dict_path = './dictionaries/'

print("Done")
print("  Filepath set to ./logs/")


##################################################

# Load the project-local helper module; later cells use its FOFEGenerator,
# F1_history and Trainer. The progress message is flushed first so it is
# shown while the import is still running, like the other stages of this cell.
print("Importing modules...", end=' ', flush=True)
import modules
print("Done")

##################################################

# Load the full dataset from disk, then keep only the rows whose predicted
# metal matches the configured `metal`.
print("Reading data from disk...", end=' ')
sys.stdout.flush()

import numpy as np
import pandas as pd

df = pd.read_parquet('./datasets/Metal_all_20180601_predicted.parquet')


# Extract the metal
df_metal = df.loc[df['metalPrediction'] == metal]

# Fail fast on an unknown or misspelled metal: an empty selection would
# otherwise silently produce empty training and validation arrays.
if df_metal.empty:
    raise ValueError("No rows with metalPrediction == " + repr(metal))

# Plain numpy arrays of the three columns used downstream.
seqs = np.array(df_metal.sequence)
target = np.array(df_metal.fingerprint)
cluster_numbers = np.array(df_metal.clusterNumber90)

print("Done [" + metal + "]")

##################################################

import json
from proteinSequenceEncoder import property_encoder, blosum62_encoder 
# The 21 one-letter residue codes (including 'X'), in alphabetical order.
AMINO_ACIDS21 = list('ACDEFGHIKLMNPQRSTVWXY')

##################################################

# Hold-out split by position: the first `ratio` fraction of the samples (in
# their existing order, no shuffling) is used for training, the rest for
# validation.
print("Performing cross validation split...", end=' ')
ratio = 0.9
split = int(ratio * len(seqs))

# Training portion: everything before the split index.
train_seqs, train_label, train_cluster = seqs[:split], target[:split], cluster_numbers[:split]
# Validation portion: everything from the split index onwards.
val_seqs, val_label, val_cluster = seqs[split:], target[split:], cluster_numbers[split:]

print("Done")
print("  Ratio : {}".format(ratio))
print("  Train_range : 0 - {}".format(split - 1))
print("  Val_range : {} - {}".format(split, len(seqs) - 1))

### Data Generator
Choose one of the encoding methods below, then adjust the generator's `input_shape` and `label_shape` so that they match the model's input and output dimensions.

- <font color=blue>FOFE Encoding</font>

In [None]:
# Vocabulary dictionary (JSON on disk) handed to both generators as their
# `translator`.
vocab_dic_fofe = {}
with open(dict_path + "vocab_dict_fofe", 'r') as vocab_file:
    vocab_dic_fofe = json.load(vocab_file)

# Arguments that differ between the training and validation generators.
train_args = dict(sequences=train_seqs,
                  labels=train_label,
                  translator=vocab_dic_fofe)
val_args = dict(sequences=val_seqs,
                labels=val_label,
                translator=vocab_dic_fofe)

# Arguments shared by both generators.
common_args = dict(batch_size=100,
                   input_shape=(800,),
                   label_shape=(706,),
                   shuffle=True)

train_gen = modules.FOFEGenerator(**train_args, **common_args)
val_gen = modules.FOFEGenerator(**val_args, **common_args)

- <font color=blue>One-hot Encoding</font>

- <font color=blue>ProtVec Encoding</font>

- <font color=blue>Property Encoding</font>

- <font color=blue>Blosum62 Encoding</font>

### Model
- <font color=blue>CNN</font>

In [None]:
# ProtVec:100, One-hot:20, blosum62:20, property:7
dimension = 800
cutoff = 706

import tensorflow as tf
import time
import matplotlib.pyplot as plt
% matplotlib inline
np.random.seed(2017) 
from keras.models import Sequential, Model
from keras.layers.convolutional import Conv2D, MaxPooling2D, Convolution1D, MaxPooling1D, AveragePooling2D
from keras.layers import Activation, Flatten, Dense, Dropout, Reshape, Embedding, Input
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.optimizers import SGD
import numpy as np
import keras
from keras.models import Model, load_model
from keras.optimizers import Adam, SGD, RMSprop
# Visualization
from keras.utils import plot_model

input_shape = (dimension,)

# Flat input vector, reshaped to (1, dimension, 1) for the 2-D convolutions.
input_0 = Input(shape=input_shape, dtype='float32')
input_0_reshape = Reshape((1, dimension, 1), input_shape=(dimension,))(input_0)

# Three parallel 8-filter convolutions over the same input, with kernel
# widths 3, 5 and 7 (created in that order).
conv2d_3, conv2d_5, conv2d_7 = [
    Conv2D(8, (1, kernel_width), padding='same')(input_0_reshape)
    for kernel_width in (3, 5, 7)
]

# Concatenate the branches, then ReLU -> Flatten -> Dense(relu).
x = keras.layers.concatenate([conv2d_3, conv2d_5, conv2d_7])
for hidden_layer in (Activation('relu'),
                     Flatten(),
                     Dense(cutoff, activation='relu')):
    x = hidden_layer(x)

# Output layer: `cutoff` units with softmax activation.
output_0 = Dense(cutoff, activation='softmax')(x)

model = Model(inputs=input_0, outputs=output_0)
# end of the MODEL

# Plain SGD with momentum; no learning-rate decay, no Nesterov update.
sgd = SGD(lr=0.08, momentum=0.9, decay=0, nesterov=False)
model.compile(optimizer=sgd,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# model.summary()

- <font color=blue>Threshold: mean + factor * std</font>

In [None]:
factor = 2.33


def threshold_func(y_in, factor=factor):
    """Binarize each row of `y_in` against its own mean + factor * std.

    Args:
        y_in: numpy array whose first axis indexes samples; every `y_in[i]`
            is thresholded independently of the other rows.
        factor: how many standard deviations above the row mean an entry
            must exceed to be set. Defaults to the notebook-level `factor`
            (captured when this cell runs), so the value reported with the
            results and the value actually applied stay in sync.

    Returns:
        An array with the shape and dtype of `y_in`, holding 1 where the
        entry is strictly greater than its row's threshold and 0 elsewhere.
    """
    y_out = np.zeros_like(y_in)
    for i, row in enumerate(y_in):
        th = np.mean(row) + factor * np.std(row)
        # Strict comparison: entries equal to the threshold stay 0.
        y_out[i] = row > th
    return y_out

- <font color=blue>Metric: F1 score</font>

In [None]:
# F1 callback built around threshold_func (presumably used to binarize the
# predictions before scoring -- see modules.F1_history).
cb = modules.F1_history(threshold_func)

# Everything the Trainer needs: the compiled model, the [train, validation]
# generators, the callbacks, and the metadata passed as `post_train_args`.
model_args = {'model': model,
              'generators': [train_gen, val_gen],
              'callbacks': [cb],
              'post_train_args': {'user': user,
                                  'model': model_name,
                                  'result': output_file,
                                  'fig_path': fig_path,
                                  # Bare class name (e.g. 'SGD'), taken from the
                                  # type itself so it does not depend on which
                                  # module path Keras defines the optimizer in.
                                  'optimizer': type(model.optimizer).__name__,
                                  'optimizer_config': model.optimizer.get_config(),
                                  'loss': model.loss,
                                  'factor': factor}}

trainer = modules.Trainer(**model_args)

In [None]:
# Suppress every warning from this point on.
import warnings
warnings.simplefilter(action='ignore')

# Run the training for 10 epochs.
trainer.start(epoch=10)

In [None]:
import os
from keras.models import model_from_json

# Create the output directory if needed, so that a missing ./models/ folder
# cannot make the save fail after training has already finished.
model_dir = "./models/"
os.makedirs(model_dir, exist_ok=True)

# serialize model to JSON
model_json = model.to_json()
with open(model_dir + metal + ".json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(model_dir + metal + ".h5")
print("Saved " + metal +" model to disk")