In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D, Dropout, concatenate
from keras.preprocessing import text as keras_text, sequence as keras_seq
from keras.callbacks import EarlyStopping, ModelCheckpoint
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split


Using TensorFlow backend.


In [2]:


# Network / preprocessing parameters.
# maxlen: fixed length that every tokenised pattern is padded or truncated to
# (it is the `maxlen` argument given to pad_sequences further down).
maxlen = 900
# max_features: not referenced by any cell visible in this notebook --
# presumably a vocabulary-size cap; TODO confirm it is still needed.
max_features = 64



In [3]:
# Load the structure strings. `names=` supplies the column label, so pandas
# does not take a header row from the file.
pat = pd.read_csv(
    'string.csv',
    names=["pattern"],
)
pat.head(2)

Unnamed: 0,pattern
0,'CCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHCCCCC...
1,'CCCHHHHHHHHHHHHHHHHHHCHHHHHHHHHHHHHHHHHCHHHHH...


In [4]:
# Load the per-sequence class id together with the file each sequence came
# from; column labels are supplied here rather than read from the file.
class_file_name = pd.read_csv(
    'id_label.csv',
    names=["class", "file_name"],
)
class_file_name.head(2)

Unnamed: 0,class,file_name
0,1,'1ab3A.secstr'
1,1,'1abvA.secstr'


In [5]:
# Assemble the modelling frame: one row per sequence holding its structure
# string and its class id.
data = pd.DataFrame()
# Every raw pattern is stored with one wrapping character at each end (a
# leading quote is visible in the preview of `pat`), so the first and last
# characters are dropped.
data['pattern'] = pat['pattern'].map(lambda raw: raw[1:-1])
data['class_no'] = class_file_name['class']

In [6]:
# Sanity check: show the last rows of the assembled frame.
data.tail()

Unnamed: 0,pattern,class_no
546,CCCCEEEEEEEECCCCCCCCCCEEEEEECCCCCEEECCCHHHHHHH...,4
547,CCCCCCCCCCCCCCCCEEEEEECHHHCCCEECCCCCCCCCCCCCEE...,4
548,CEEEEEECCHHHHHHHCCCCCEEEEECCCCCCCCEEEEEEECCCCC...,4
549,CCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHHHHCCEEEEEEEC...,4
550,CCCHHHHHHHHHCCCCCCCCCCCHHHHHHHHHHCCCCCCCCCEEEE...,4


In [7]:
# Features are the structure strings, targets are the class ids.
X, Y = data['pattern'], data['class_no']

In [8]:
# Hold out 15% of the sequences as a test set. The seed is fixed so that
# Restart & Run All reproduces the same partition -- and therefore the same
# tokenizer vocabulary and downstream results -- on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.15, random_state=2017)

In [9]:
# Distinct class ids present in the training split.
list(set(Y_train))
#print(Y_train)

[1, 2, 3, 4]

In [10]:
%%time
list_sentences_train = X_train.values
list_classes = list(set(Y_train))
y = Y_train.values
y_val=[val-1 for val in y]
list_sentences_test = X_test.values

CPU times: user 678 µs, sys: 0 ns, total: 678 µs
Wall time: 686 µs


In [19]:
# Length (in characters) of the first raw training pattern, before padding.
len(list_sentences_train[0])
#print(y)

457

In [12]:
#print(st)
#print(list_sentences_train.values)

In [23]:
# Character-level tokenizer: every character of a structure string becomes
# one token. The vocabulary is learned from the training split only.
tokenizer = keras_text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(list_sentences_train)

# Map both splits to integer sequences using that training vocabulary.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Bring every sequence to exactly `maxlen` tokens so the splits form
# rectangular arrays.
X_t = keras_seq.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = keras_seq.pad_sequences(list_tokenized_test, maxlen=maxlen)


In [24]:
# Sanity check: a padded training row should have exactly `maxlen` entries.
len(X_t[0])

900

In [25]:
from keras.utils import to_categorical

# One-hot encode the 0-based class ids. Note that this rebinds `y`, which
# until now held the raw class ids taken from Y_train.
y = to_categorical(y_val)
#test_classes = to_categorical(test_class_list)
print(y)

[[0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 ...
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]]


In [15]:
def build_model(conv_layers = 2, 
                dilation_rates = (0, 2, 4, 8, 16), 
                embed_size = 256,
                n_classes = 4):
    """Build and compile the multi-branch dilated 1-D CNN classifier.

    The token ids are embedded, then fed to one convolutional branch per entry
    of ``dilation_rates``. Each branch stacks ``conv_layers`` Conv1D layers
    (16, 32, 64, ... filters), is global-max-pooled, and the pooled branches
    are concatenated before a small dense softmax head.

    Parameters
    ----------
    conv_layers : int
        Number of stacked Conv1D layers in every branch.
    dilation_rates : iterable of int
        One branch is created per value. A positive value gives kernel-size-3
        convolutions with that dilation; ``0`` gives a kernel-size-1 branch.
        Values must be unique because they are used in the layer names.
    embed_size : int
        Output dimension of the embedding layer.
    n_classes : int
        Width of the final softmax layer.

    Returns
    -------
    keras.models.Model
        Model compiled with categorical cross-entropy, Adam and accuracy.

    Notes
    -----
    Reads the module-level ``tokenizer`` to size the embedding, so the
    tokenizer must be fitted before this function is called.
    """
    inp = Input(shape=(None, ))
    # +1 leaves room for index 0 in addition to the tokenizer's own indices.
    x = Embedding(input_dim = len(tokenizer.word_counts)+1, 
                  output_dim = embed_size)(inp)
    prefilt_x = Dropout(0.25)(x)
    out_conv = []
    # dilation rate lets us use ngrams and skip grams to process
    for dilation_rate in dilation_rates:
        x = prefilt_x
        # Honour `conv_layers` (previously this depth was hard-coded to 2).
        for i in range(conv_layers):
            n_filters = 16 * 2**i
            if dilation_rate > 0:
                x = Conv1D(n_filters,
                           kernel_size = 3,
                           dilation_rate = dilation_rate,
                           activation = 'relu',
                           name = 'ngram_{}_cnn_{}'.format(dilation_rate, i)
                           )(x)
            else:
                x = Conv1D(n_filters,
                           kernel_size = 1,
                           activation = 'relu',
                           name = 'word_fcl_{}'.format(i))(x)
        out_conv.append(Dropout(0.5)(GlobalMaxPool1D()(x)))
    # `concatenate` needs at least two tensors; a single branch is used as-is.
    if len(out_conv) == 1:
        x = out_conv[0]
    else:
        x = concatenate(out_conv, axis = -1)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(n_classes, activation='softmax')(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

# Build the network with its default hyper-parameters and print the layer table.
model = build_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    1024        input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, None, 256)    0           embedding_1[0][0]                
__________________________________________________________________________________________________
word_fcl_0 (Conv1D)             (None, None, 16)     4112        dropout_1[0][0]                  
__________________________________________________________________________________________________
ngram_2_cn

In [16]:
from sklearn.model_selection import train_test_split

# Carve a 20% validation set out of the padded training data. The seed is
# fixed so the partition is the same on every run. The *_test names produced
# here are what the training cell passes as `validation_data`.
X_t_train, X_t_test, y_train, y_test = train_test_split(
    X_t, y, test_size=0.2, random_state=2017)

print('Training:', X_t_train.shape)
print('Testing:', X_t_test.shape)
print(y.shape)

Training: (374, 900)
Testing: (94, 900)
(468, 4)


In [18]:
batch_size = 4  # small mini-batches; the training split printed above has only 374 rows
epochs = 10     # matches the 10-epoch run recorded in this cell's output

file_path = "best_weights.h5"
# Keep only the weights of the epoch with the lowest validation loss so far.
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# NOTE: patience (20) is larger than `epochs`, so early stopping cannot
# trigger unless `epochs` is raised above it.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

callbacks_list = [checkpoint, early]
model.fit(X_t_train, y_train, 
          validation_data=(X_t_test, y_test),
          batch_size=batch_size, 
          epochs=epochs, 
          shuffle = True,
          callbacks=callbacks_list)

Train on 374 samples, validate on 94 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.99417, saving model to best_weights.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.99417 to 0.94105, saving model to best_weights.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.94105 to 0.88466, saving model to best_weights.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.88466
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.88466
Epoch 6/10

Epoch 00006: val_loss improved from 0.88466 to 0.83280, saving model to best_weights.h5
Epoch 7/10

Epoch 00007: val_loss improved from 0.83280 to 0.81509, saving model to best_weights.h5
Epoch 8/10

Epoch 00008: val_loss improved from 0.81509 to 0.78729, saving model to best_weights.h5
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.78729
Epoch 10/10

Epoch 00010: val_loss improved from 0.78729 to 0.75230, saving model to best_weights.h5


<keras.callbacks.History at 0x7fcdcfc9d160>