In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB


# Pin NumPy's global random generator so NumPy-driven randomness repeats between runs.
# NOTE(review): only NumPy is seeded here; confirm whether the deep-learning backend needs its own seed.
np.random.seed(seed=42)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, LSTM
from keras import utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping


 
%matplotlib inline

In [2]:
# Read the training and test tables from their CSV files (paths are relative to the notebook).
train_csv = './data3/train_data_bagged.csv'
test_csv = './data3/test_data_bagged.csv'

trainmain_df = pd.read_csv(train_csv)
testmain_df = pd.read_csv(test_csv)

In [3]:
# Rename the column that was read in under the header '0' to 'sequence'.
# Only the column that actually changes is listed ('label' already has its
# final name), and the frame is rebound instead of mutated in place so the
# cell gives the same result when re-run.
# NOTE(review): testmain_df is not renamed in the visible cells; confirm
# whether it needs the same treatment.
trainmain_df = trainmain_df.rename(columns={'0': 'sequence'})

In [72]:
# Display the column labels to confirm the rename took effect.
trainmain_df.columns

Index(['sequence', 'label'], dtype='object')

In [103]:
# Pull the underlying arrays out of the frame: inputs from 'sequence', targets from 'label'.
X, y = (trainmain_df[column].values for column in ('sequence', 'label'))

In [104]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 100}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [105]:
# Shape of the training inputs produced by the split.
X_train.shape

(18080,)

In [106]:
# Settings for tokenising and padding the text sequences.
vocab_size = 300                     # handed to Tokenizer(num_words=...)
max_length = 128                     # NOTE(review): presumably the target padded length in tokens; confirm where it is consumed
# embedding_dim = 128                # disabled
trunc_type = padding_type = 'post'   # both truncation and padding act on the end of a sequence

In [107]:
# Fit the tokenizer on the training texts only, so the held-out split does not
# influence the learned word index. num_words is taken from vocab_size.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts=X_train)

In [108]:
# Per-token occurrence counts collected while fitting the tokenizer.
token_counts = tokenizer.word_counts
print(token_counts)

OrderedDict([('act', 76841), ('ctt', 84077), ('ttg', 112078), ('tgg', 77453), ('ggc', 61752), ('gct', 77710), ('ttc', 101474), ('tca', 83023), ('cac', 71125), ('acc', 54646), ('ccc', 44074), ('ccg', 51456), ('cga', 75328), ('gaa', 107742), ('aaa', 237871), ('aag', 97035), ('agg', 47810), ('gga', 62020), ('gac', 49072), ('aca', 99267), ('cat', 82651), ('atc', 81618), ('cag', 79056), ('agc', 82261), ('gca', 89744), ('acg', 53222), ('cgc', 64536), ('gcc', 65381), ('cgg', 50786), ('caa', 120283), ('aat', 165373), ('ata', 131144), ('tat', 126483), ('atg', 80344), ('tga', 82191), ('cct', 46955), ('cta', 57555), ('tac', 62370), ('cca', 81140), ('taa', 135813), ('aac', 101523), ('gat', 79158), ('agt', 86506), ('gta', 63864), ('cgt', 52497), ('ggt', 52181), ('gtt', 97290), ('tta', 128840), ('ctc', 57184), ('tcg', 74853), ('tag', 57028), ('gtg', 78076), ('ggg', 38009), ('gag', 62303), ('aga', 78809), ('tgc', 88205), ('gcg', 63802), ('att', 160740), ('ttt', 216888), ('ctg', 75790), ('tgt', 98435)

In [92]:
# Encode both splits with the fitted tokenizer.
# Caution: X_train / X_test are rebound from the raw texts to the encoded
# sequences here, so re-running this cell without re-running the split would
# feed already-encoded data back into the tokenizer.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [93]:
# Pad / truncate both splits to one fixed width (max_length tokens).
# Without maxlen, pad_sequences sizes each array to the longest sequence it is
# handed, so the training and validation matrices could come out with
# different widths, and truncating=trunc_type would have nothing to act on.
train_padded = pad_sequences(X_train, maxlen=max_length, padding=padding_type, truncating=trunc_type)
test_padded = pad_sequences(X_test, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [95]:
# Halt training once val_loss has gone 10 epochs without improving by at least 0.01.
early_stop = EarlyStopping(
    monitor='val_loss',
    min_delta=0.01,
    patience=10,
)

In [96]:
from keras import regularizers
from keras.layers import Bidirectional

In [97]:
# Row count of the padded training matrix.
len(train_padded)

18080

In [98]:
# Fully connected binary classifier fed with the padded token-id matrix.
# NOTE(review): the Dense stack receives the integer token ids directly (an
# Embedding front-end had been drafted here but was disabled). Treating ids as
# numeric magnitudes is probably not intended; confirm, and consider restoring
# an embedding layer.
model = Sequential([
    # input-facing layer
    Dense(3000, activation='relu'),
    Dropout(0.04),

    # hidden layers with L2 weight penalties
    Dense(1500, activation='relu', kernel_regularizer=regularizers.l2(0.02)),
    Dropout(0.04),
    Dense(1500, activation='relu', kernel_regularizer=regularizers.l2(0.02)),
    Dropout(0.04),
    # NOTE(review): this penalty (0.4) is 20x the other hidden layers; confirm it is deliberate.
    Dense(1500, activation='relu', kernel_regularizer=regularizers.l2(0.4)),
    Dropout(0.04),

    # output layer: one sigmoid unit for the binary label
    Dense(units=1, activation='sigmoid'),
])

# Name the optimizer explicitly instead of relying on compile()'s implicit
# default: 'rmsprop' is what tf.keras falls back to when it is omitted, and
# older standalone Keras releases require the argument outright.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

In [99]:
# Train for at most 100 epochs in mini-batches of 100, evaluating on the
# held-out split after every epoch; early_stop may end the run sooner.
fit_options = dict(
    validation_data=(test_padded, y_test),
    epochs=100,
    batch_size=100,
    callbacks=[early_stop],
)
history = model.fit(train_padded, y_train, **fit_options)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
 10/181 [>.............................] - ETA: 35s - loss: 0.9485 - acc: 0.5211

KeyboardInterrupt: 