In [4]:
import keras
import numpy as np
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Permute, Dense, SeparableConv1D, Flatten, Dropout, SpatialDropout1D

In [5]:
# Convolutional feature extractor, described as data:
# (filters, number of stacked separable convs, max-pool width) per stage.
conv_stages = [
    (64, 2, 2),
    (128, 2, 2),
    (256, 2, 4),
    (512, 2, 4),
    (1024, 1, 8),
]

model = Sequential()

# Only the very first layer declares the input shape; cleared after first use.
input_kwargs = {'input_shape': (2000, 31)}
for n, conv_count, pool_width in conv_stages:
    for _ in range(conv_count):
        model.add(SeparableConv1D(n, 3, activation='relu', **input_kwargs))
        input_kwargs = {}
    model.add(MaxPooling1D(pool_width))
    model.add(SpatialDropout1D(0.2))

model.add(Flatten())

# Fully connected head: each hidden Dense layer is followed by dropout.
for units in (512, 512, 512, 128):
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.2))

# 25-way softmax output.
model.add(Dense(25, activation='softmax'))

In [6]:
# Print the layer-by-layer table (output shapes and parameter counts) of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
separable_conv1d_10 (Separab (None, 1998, 64)          2141      
_________________________________________________________________
separable_conv1d_11 (Separab (None, 1996, 64)          4352      
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 998, 64)           0         
_________________________________________________________________
spatial_dropout1d_6 (Spatial (None, 998, 64)           0         
_________________________________________________________________
separable_conv1d_12 (Separab (None, 996, 128)          8512      
_________________________________________________________________
separable_conv1d_13 (Separab (None, 994, 128)          16896     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 497, 128)          0         
__________

In [7]:
from seq2mat import DataGenerator, seq_class

In [8]:
# Preview the first rows of the seq_class table imported from seq2mat.
seq_class.head()

Unnamed: 0_level_0,sequence,classification,label
structureId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
177L,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,HYDROLASE,15
178L,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,HYDROLASE,15
1A0I,VNIKTNPFKAVSFVESAIKKALDNAGYLIAEIKYDGVRGNICVDNT...,LIGASE,6
1A16,SEISRQEFQRRRQALVEQMQPGSAALIFAAPEVTRSADSEYPYRQN...,HYDROLASE/HYDROLASE INHIBITOR,20
1A17,RDEPPADGALKRAEELKTQANDYFKAKDYENAIKFYSQAIELNPSN...,HYDROLASE,15


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 10% of the structure IDs as the test set.
# random_state pins the shuffle so the held-out set is the same on every
# Restart & Run All; without it the test split changes on each run.
# NOTE: y_train holds the labels for the whole train_val split (train +
# validation), not only for the later `train` subset.
train_val, test, y_train, y_test = train_test_split(
    seq_class.index,
    seq_class['label'],
    test_size=0.1,
    random_state=42,
)

In [11]:
# Number of IDs in the combined train + validation split.
train_val.shape

(34155,)

In [12]:
# Number of IDs held out for testing.
test.shape

(3796,)

In [13]:
# Peek at the first held-out ID.
test[0]

'3VHR'

In [14]:
# Carve 20% of train_val off as the validation set.
# random_state pins the shuffle so train/validation membership is the same
# on every run and results are comparable between runs.
train, validation = train_test_split(train_val, test_size=0.2, random_state=42)

In [15]:
# Size of the training subset left after the validation split.
train.shape

(27324,)

In [16]:
# Labels returned alongside train_val by the first split; despite the name
# they cover train + validation together.
y_train

structureId
5UFT    13
2ML1    24
1RZ5    16
4P8B    12
1IEE    15
        ..
1W06    16
4XN8    15
3X31     5
3D2I    22
2W7T     6
Name: label, Length: 34155, dtype: int64

In [17]:
# Settings shared by every DataGenerator built below.
# NOTE(review): dim is (2000, 31) with n_channels=1 — confirm against
# DataGenerator that the batches it yields match each model's input shape.
params = dict(
    batch_size=32,
    dim=(2000, 31),
    n_channels=1,
    n_classes=25,
    shuffle=True,
)

# One generator per ID split; all of them look labels up in seq_class['label'].
training_generator, validation_generator, train_val_generator = (
    DataGenerator(ids, seq_class['label'], **params)
    for ids in (train, validation, train_val)
)

In [18]:
# Configure training: categorical cross-entropy loss, RMSprop optimizer,
# accuracy reported as the metric.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)





In [19]:
from keras import backend as K
# List the GPU devices visible to the backend.
# NOTE(review): _get_available_gpus is a private (underscore-prefixed) helper of
# the TensorFlow backend module; it may not exist in other Keras versions.
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [20]:
from sklearn.utils import class_weight

# Balanced per-class weights to counter the label imbalance.
# NOTE(review): y_train holds the labels of the whole train_val split, so the
# weights reflect train + validation, not only the `train` subset.
classes = np.unique(y_train)
# Keyword arguments: recent scikit-learn releases make `classes` and `y`
# keyword-only, so the positional call fails there. Keywords work on old
# releases as well.
weights = class_weight.compute_class_weight(class_weight='balanced',
                                            classes=classes,
                                            y=y_train)
# Key the weights by the actual class labels rather than by position, so the
# mapping stays correct even if the labels are not exactly 0..k-1.
class_weights = {int(label): weight for label, weight in zip(classes, weights)}
print(class_weights)
print(seq_class[seq_class['classification']=='HYDROLASE'])

{0: 2.737875751503006, 1: 1.1900696864111497, 2: 3.1698375870069606, 3: 2.6374517374517374, 4: 2.710714285714286, 5: 1.2058252427184466, 6: 1.8638472032742155, 7: 3.0563758389261744, 8: 1.918820224719101, 9: 2.1929373996789727, 10: 3.1772093023255814, 11: 2.042152466367713, 12: 1.2165627782724844, 13: 2.151496062992126, 14: 2.1685714285714286, 15: 0.1537301676606279, 16: 0.34569838056680163, 17: 3.4155, 18: 1.2408719346049046, 19: 3.3733333333333335, 20: 1.6361676646706587, 21: 2.18592, 22: 0.24532231998563475, 23: 1.0334341906202724, 24: 1.658009708737864}
                                                      sequence classification  \
structureId                                                                     
177L         MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...      HYDROLASE   
178L         MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...      HYDROLASE   
1A17         RDEPPADGALKRAEELKTQANDYFKAKDYENAIKFYSQAIELNPSN...      HYDROLASE   
1A2Q         AQSVPYGVSQIKAPALHS

In [None]:
from keras.callbacks import ModelCheckpoint   

# Train `model`; the checkpoint callback keeps only the best weights
# (save_best_only=True) in prot_6.model.best.hdf5.
checkpointer = ModelCheckpoint(
    filepath='prot_6.model.best.hdf5',
    save_best_only=True,
    verbose=1,
)

# Options for the generator-driven training run, including the per-class
# loss weights.
fit_options = dict(
    generator=training_generator,
    validation_data=validation_generator,
    epochs=5,
    callbacks=[checkpointer],
    class_weight=class_weights,
    workers=4,
    use_multiprocessing=True,
    verbose=1,
)
hist = model.fit_generator(**fit_options)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/5

In [15]:
from models import prot_3

# Configure prot_3 with the same loss / optimizer / metric settings, then
# print its layer table.
compile_options = {
    'loss': 'categorical_crossentropy',
    'optimizer': 'rmsprop',
    'metrics': ['accuracy'],
}
prot_3.compile(**compile_options)
prot_3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 1998, 1, 64)       6016      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 499, 1, 64)        0         
_________________________________________________________________
permute_1 (Permute)          (None, 499, 64, 1)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 497, 1, 128)       24704     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 124, 1, 128)       0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 124, 1, 128)       0         
_________________________________________________________________
permute_2 (Permute)          (None, 124, 128, 1)       0         
__________

In [18]:
from keras.callbacks import ModelCheckpoint   

# Train prot_3; the checkpoint callback keeps only the best weights
# (save_best_only=True) in prot_3.model.best.hdf5.
# NOTE(review): unlike the other training cell, no class_weight is passed here.
checkpointer = ModelCheckpoint(
    filepath='prot_3.model.best.hdf5',
    save_best_only=True,
    verbose=1,
)

fit_options = dict(
    generator=training_generator,
    validation_data=validation_generator,
    epochs=5,
    callbacks=[checkpointer],
    workers=4,
    use_multiprocessing=True,
    verbose=1,
)
hist = prot_3.fit_generator(**fit_options)

Epoch 1/5
199/853 [=====>........................] - ETA: 8:08 - loss: 2.6362 - acc: 0.2888

KeyboardInterrupt: 

In [1]:
from seq2mat import seq_class

In [3]:
import numpy as np

In [4]:
# Positions that would sort the sequences by length (argsort of the string lengths).
indexes = np.argsort(seq_class['sequence'].str.len())

In [6]:
# Keep entries 10..49 of the sort order.
index = indexes[10:50]

In [9]:
# Longest sequence among the selected entries.
# `index` holds integer positions (from argsort), while seq_class is labelled
# by string IDs, so select explicitly by position with .iloc. Plain
# Series[...] with integer keys only worked through pandas' deprecated
# positional fallback.
seq_class['sequence'].iloc[np.asarray(index)].str.len().max()

12