In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB


np.random.seed(42)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, LSTM
from keras import utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping


 
%matplotlib inline

In [2]:
trainmain_df = pd.read_csv('./data3/train_data_bagged.csv')
testmain_df = pd.read_csv('./data3/test_data_bagged.csv')

In [3]:
trainmain_df.rename({'0':'sequence','label':'label'},axis=1,inplace=True)

In [4]:
trainmain_df.columns

Index(['sequence', 'label'], dtype='object')

In [5]:
X = trainmain_df['sequence'].values
y = trainmain_df['label'].values

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

In [7]:
X_train.shape

(18080,)

In [8]:
vocab_size = 256
#embedding_dim = 64
max_length = 128
trunc_type = 'post'
padding_type = 'post'

In [9]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

In [26]:
print(tokenizer.word_counts)

OrderedDict([('taac', 20135), ('aaca', 36949), ('acat', 24968), ('catt', 30500), ('attt', 66115), ('tttg', 42523), ('ttgt', 33439), ('tgtg', 26526), ('gtgt', 21895), ('tgtt', 34663), ('gttt', 37178), ('tttt', 81761), ('ttta', 51826), ('ttaa', 48205), ('taat', 37860), ('aatt', 56673), ('attg', 29852), ('tgtc', 14889), ('gtca', 15501), ('tcat', 20009), ('catg', 13692), ('atgc', 18411), ('tgcc', 19351), ('gccc', 12202), ('cccc', 9165), ('ccct', 9608), ('cctt', 14110), ('cttt', 30882), ('ttag', 18982), ('tagt', 18093), ('agtt', 29511), ('ttac', 19692), ('tacg', 11206), ('acgc', 13473), ('cgct', 16741), ('gcta', 14939), ('ctag', 9564), ('taga', 15509), ('agaa', 29142), ('gaaa', 43554), ('aaac', 39579), ('aact', 27259), ('actt', 25623), ('ctta', 18837), ('tagc', 14779), ('agct', 22629), ('ctac', 11078), ('tact', 16794), ('cttc', 16654), ('ttcc', 21251), ('tcct', 13490), ('ccta', 9463), ('tagg', 8619), ('aggt', 11820), ('ggtc', 10114), ('gtcg', 13625), ('tcga', 24610), ('cgag', 15051), ('gaga

In [10]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [11]:
train_padded = pad_sequences(X_train, padding=padding_type, truncating=trunc_type)

In [12]:
test_padded = pad_sequences(X_test, padding=padding_type, truncating=trunc_type)

In [13]:
early_stop = EarlyStopping(monitor='val_loss', patience= 10, min_delta= 0.01) 

In [14]:
from keras import regularizers
from keras.layers import Bidirectional

In [18]:
model = Sequential()

# input layer
model.add(Dense(128,  activation='relu'))
#model.add(Dropout(0.05))

# hidden layer
model.add(Dense(64, activation='relu'))  #,kernel_regularizer= regularizers.l2(0.05)
#model.add(Dropout(0.05))

model.add(Dense(64, activation='relu'))  #,kernel_regularizer= regularizers.l2(0.05)
#model.add(Dropout(0.05))

model.add(Dense(64, activation='relu'))  #,kernel_regularizer= regularizers.l2(0.05)
#model.add(Dropout(0.05))

model.add(Dense(64, activation='relu'))  #,kernel_regularizer= regularizers.l2(0.05)
#model.add(Dropout(0.05))


#model.add(layers.Conv1D(128, 5, activation='relu'))
#model.add(layers.GlobalMaxPooling1D())


# hidden layer
model.add(Dense(32, activation='relu'))

# output layer
model.add(Dense(units=1,activation='sigmoid'))

# Compile model
model.compile(loss='binary_crossentropy', metrics=['acc'])

In [21]:
model.fit(train_padded, y_train, validation_data=(test_padded,y_test), epochs=100, batch_size=20, callbacks=[early_stop])

##batch size = 128
##, callbacks=[early_stop]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


<tensorflow.python.keras.callbacks.History at 0x7f8b35330d90>