In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# sklearn libraries
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# import keras models
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.utils import to_categorical
from keras.layers import Bidirectional, BatchNormalization
from keras.callbacks import EarlyStopping 
from keras import regularizers

%matplotlib inline

In [51]:
def getseq(data):
    """Return every second entry of ``data[0]``, starting at position 1.

    The values of ``data[0]`` are materialised in iteration order and the
    entries at odd positions (1, 3, 5, ...) are returned as a list; the
    entries at even positions (0, 2, 4, ...) are dropped.

    NOTE(review): presumably the input files alternate header and sequence
    lines, so this keeps the sequence lines - confirm against the data.
    """
    column_values = list(data[0])
    return column_values[1::2]

In [52]:
# Read the promoter and non-promoter sequence files. header=None makes pandas
# treat the first line as data and label the columns 0, 1, ...
prom_raw = pd.read_csv('./data/PromoterSequence.txt', header=None)
nonprom_raw = pd.read_csv('./data/NonPromoterSequence.txt', header=None)

In [53]:
# getseq keeps the entries at odd positions (1, 3, 5, ...) of column 0 of each raw frame.
# NOTE(review): presumably these are the sequence lines that follow each header line - confirm against the files.
prom = getseq(prom_raw)
nonprom = getseq(nonprom_raw)

In [54]:
# Promoter sequences as a two-column frame: 'sequence' plus a constant label of 1.
prom_df = (
    pd.DataFrame(prom)
    .assign(label=1)
    .rename(columns={0: 'sequence'})
)

In [55]:
# Non-promoter sequences as a two-column frame: 'sequence' plus a constant label of 0.
nonprom_df = (
    pd.DataFrame(nonprom)
    .assign(label=0)
    .rename(columns={0: 'sequence'})
)

In [56]:
#df = pd.concat([prom_df,nonprom_df], axis=0, ignore_index=True)

In [57]:
# Pull the promoter sequences out of the frame as a plain Python list.
sequence_promo = prom_df['sequence'].tolist()
# Length of the first promoter sequence.
len(sequence_promo[0])

301

In [58]:
#[row for row in df.]
# Slide a width-4 window (step 1) over every promoter sequence, producing
# len(seq) - 3 overlapping 4-character words per sequence.
words_promo = [
    [seq[start:start + 4] for start in range(len(seq) - 3)]
    for seq in sequence_promo
]

In [59]:
# Number of promoter sequences that were split into 4-character words.
len(words_promo)

11300

In [60]:
# Pull the non-promoter sequences out of the frame as a plain Python list.
sequence_nonprom = nonprom_df['sequence'].tolist()

In [61]:
# Slide a width-4 window (step 1) over every non-promoter sequence, producing
# len(seq) - 3 overlapping 4-character words per sequence.
words_nonprom = [
    [seq[start:start + 4] for start in range(len(seq) - 3)]
    for seq in sequence_nonprom
]

In [62]:
# Number of non-promoter sequences that were split into 4-character words.
len(words_nonprom)

11300

In [63]:
# Collapse each list of 4-character words into one space-separated string,
# replacing the entries of words_promo in place.
words_promo[:] = [" ".join(kmers) for kmers in words_promo]


In [64]:
# Collapse each list of 4-character words into one space-separated string,
# replacing the entries of words_nonprom in place.
words_nonprom[:] = [" ".join(kmers) for kmers in words_nonprom]


In [65]:
# Wrap the space-joined promoter strings in a DataFrame (one column, named 0).
df_promo = pd.DataFrame(words_promo)

In [66]:
# Wrap the space-joined non-promoter strings in a DataFrame (one column, named 0).
df_non_promo = pd.DataFrame(words_nonprom)

In [67]:
# Write the promoter word strings to promo.csv without index or header row.
# This runs before the label column is added, so the file holds the text only.
df_promo.to_csv('promo.csv', index=False, header=False)

In [68]:
# Write the non-promoter word strings to non_promo.csv without index or header row.
# This runs before the label column is added, so the file holds the text only.
df_non_promo.to_csv('non_promo.csv', index=False, header=False)

In [69]:
# Attach class labels: 1 for promoter rows, 0 for non-promoter rows.
df_promo['label'] = 1
df_non_promo['label'] = 0

In [70]:
# Stack promoter rows on top of non-promoter rows with a fresh 0..n-1 index,
# and rename the text column (originally named 0) to 'seq'.
df = pd.concat([df_promo, df_non_promo], ignore_index=True).rename(columns={0: 'seq'})
df

Unnamed: 0,seq,label
0,TTAA TAAT AATT ATTT TTTG TTGT TGTC GTCC TCCT C...,1
1,ATAG TAGC AGCT GCTC CTCA TCAA CAAA AAAT AATT A...,1
2,AAGC AGCT GCTT CTTC TTCC TCCC CCCT CCTT CTTT T...,1
3,TATG ATGT TGTA GTAG TAGA AGAA GAAT AATC ATCT T...,1
4,ACAT CATA ATAT TATT ATTA TTAC TACT ACTG CTGC T...,1
...,...,...
22595,TGGT GGTA GTAA TAAA AAAA AAAA AAAA AAAT AATT A...,0
22596,AGTG GTGC TGCA GCAA CAAC AACT ACTG CTGG TGGA G...,0
22597,GCAT CATG ATGG TGGA GGAT GATT ATTT TTTC TTCA T...,0
22598,GTGA TGAC GACC ACCA CCAG CAGG AGGT GGTT GTTT T...,0


In [71]:
# Features are the space-separated word strings; targets are the 0/1 labels.
X = df['seq']
y = df['label']

# NOTE(review): this is not the last expression of the cell and is not printed,
# so the class-balance result is computed and then discarded.
y.value_counts(normalize=True)

# Stratified 60/40 train/test split with a fixed random seed.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.4,
                                                    stratify=y,
                                                    random_state=42)


In [72]:
# Show the original row labels carried by the training split.
X_train.index

Int64Index([15264, 17447,  5776, 19496,  4296,  3583, 12580, 13513, 11797,
            11977,
            ...
            10207,  2929, 22554,  8710, 10021,  6952, 15961,  4601, 17193,
            10303],
           dtype='int64', length=13560)

In [73]:
# Tokenizer / padding / embedding settings shared by the cells below.
# vocab_size is passed as num_words to the Tokenizer and as the Embedding input size.
vocab_size=300
# embedding_dim is the output size of the Embedding layer.
embedding_dim=150
# max_length is the fixed length every token sequence is padded or truncated to.
max_length=200
# Both truncation and padding are applied at the end of a sequence.
trunc_type='post' 
padding_type='post' 
# Placeholder token handed to the Tokenizer as its out-of-vocabulary token.
oov_tok='<OOV>' 

In [74]:
# Character length of every training string (separating spaces included),
# followed by the largest of those lengths. This counts characters, not words.
A = X_train.map(len).tolist()
max(A)

1489

In [75]:
# Build the word index from the training texts only, so the test split does not
# influence the vocabulary. num_words and oov_token come from the settings cell.
tokenizer=Tokenizer(num_words=vocab_size,oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

In [76]:
# Per-word occurrence counts collected by fit_on_texts.
print(tokenizer.word_counts)

OrderedDict([('aaat', 52818), ('aatt', 42392), ('atta', 28123), ('ttag', 14096), ('tagt', 13486), ('agta', 14056), ('gtaa', 16428), ('taat', 28323), ('aatc', 20909), ('atcg', 17802), ('tcga', 18471), ('cgaa', 18938), ('gaag', 14211), ('aagt', 22209), ('aata', 37726), ('atag', 13459), ('tagc', 11193), ('agct', 17042), ('gctg', 18458), ('ctga', 12796), ('tgag', 11527), ('gaga', 13146), ('agaa', 21870), ('gaaa', 32995), ('aaaa', 68008), ('aaac', 29593), ('aacg', 14579), ('acga', 12335), ('gtat', 15208), ('tata', 28661), ('ataa', 34459), ('taac', 15156), ('aaag', 26808), ('aagc', 18135), ('tgac', 10520), ('gacc', 7548), ('acca', 14790), ('ccac', 13212), ('cacc', 11160), ('accc', 7888), ('ccca', 12160), ('ccat', 13741), ('cata', 15459), ('atac', 15329), ('taca', 16873), ('acaa', 26437), ('caaa', 34026), ('atcc', 11651), ('tccc', 9057), ('cccc', 7046), ('ccct', 7319), ('cctc', 7597), ('ctcg', 9766), ('tcgc', 14664), ('cgca', 13913), ('gcaa', 21764), ('caag', 15496), ('aaga', 19885), ('aact',

In [77]:
# Number of distinct words seen in the training texts.
len(tokenizer.word_counts)

256

In [78]:
#total words we need = 260-40 = 220 (top unique features a.k.a. vocab_size)

In [79]:
# List every word that occurs fewer than 12000 times in the training texts,
# then show how many such words there are.
rare_kmers = {
    kmer: count
    for kmer, count in tokenizer.word_counts.items()
    if count < 12000
}
for kmer, count in rare_kmers.items():
    print(kmer, count)
len(rare_kmers)

tagc 11193
tgag 11527
tgac 10520
gacc 7548
cacc 11160
accc 7888
atcc 11651
tccc 9057
cccc 7046
ccct 7319
cctc 7597
ctcg 9766
tccg 10311
ccga 11245
cgag 11112
gagt 11914
agtc 10898
gtca 11479
ccgg 7516
cggc 11631
ggct 10693
accg 9254
ccgc 11555
cgac 9481
acct 8853
ctcc 9686
gccc 9258
tagg 6438
aggg 6328
gggc 7810
tcac 11672
taga 11597
gtct 8494
tctc 11935
ggga 8292
cgga 10961
gcta 11333
ggag 11086
gcct 9182
ccta 7164
tcgg 10992
gcgt 10170
cgtg 9820
cgcg 9597
agac 8941
gact 9532
tcta 11662
tacg 8308
gtac 8357
catg 10291
cctt 10725
aggc 9019
gacg 8258
tgtc 11097
ctca 10922
gaca 11320
ctac 8219
tacc 8979
gtcc 7406
tcct 10078
cagg 8815
cctg 9940
tggg 10549
gggt 6781
ggtt 11221
cggg 6004
gggg 5481
ggtc 7659
gtcg 10225
tcgt 11897
agga 11679
ggat 11311
gcgg 11280
ggta 9085
acgt 9062
cgta 8093
ctag 7262
gtag 7855
gagg 8447
ggcg 11209
actc 10748
ggac 7734
gatc 10956
cacg 8626
acgg 8303
cgtc 7956
acgc 10042
cccg 6688
ccgt 8015
aggt 8774
ggtg 11201
ggcc 10044
cggt 9505


93

In [80]:
# Convert each training string into a list of integer token ids.
train_seq=tokenizer.texts_to_sequences(X_train)

In [81]:
# Inspect the training text whose original row label is 2000.
X_train[2000]

'AACC ACCG CCGG CGGT GGTT GTTG TTGC TGCT GCTA CTAA TAAA AAAC AACC ACCG CCGT CGTT GTTT TTTC TTCA TCAG CAGC AGCA GCAA CAAT AATG ATGT TGTT GTTA TTAT TATT ATTT TTTA TTAA TAAC AACA ACAG CAGG AGGG GGGT GGTT GTTT TTTG TTGA TGAG GAGT AGTG GTGC TGCT GCTC CTCG TCGC CGCT GCTA CTAA TAAA AAAT AATG ATGG TGGA GGAC GACT ACTT CTTA TTAC TACG ACGA CGAA GAAA AAAT AATG ATGA TGAG GAGT AGTA GTAT TATG ATGG TGGA GGAA GAAT AATT ATTT TTTT TTTC TTCG TCGT CGTG GTGC TGCA GCAG CAGC AGCC GCCC CCCA CCAG CAGT AGTC GTCA TCAT CATG ATGG TGGT GGTC GTCA TCAC CACG ACGC CGCT GCTG CTGC TGCC GCCT CCTC CTCT TCTC CTCT TCTG CTGA TGAA GAAA AAAA AAAG AAGC AGCC GCCG CCGC CGCA GCAA CAAG AAGA AGAA GAAA AAAA AAAT AATA ATAA TAAA AAAA AAAA AAAT AATA ATAA TAAA AAAT AATA ATAT TATA ATAT TATC ATCT TCTT CTTT TTTC TTCT TCTA CTAG TAGC AGCT GCTA CTAT TATT ATTA TTAT TATT ATTA TTAT TATT ATTG TTGT TGTT GTTG TTGT TGTT GTTA TTAT TATT ATTA TTAT TATT ATTA TTAA TAAC AACA ACAA CAAA AAAA AAAA AAAT AATG ATGT TGTC GTCG TCGG CGGA GGAA GAAA AAAC AACA ACAG CAGT

In [82]:
# Token count of the encoded sequence at list position 2000.
# This index is positional, whereas X_train[2000] is label-based, so the two
# need not refer to the same row.
len(train_seq[2000])

298

In [83]:
# Pad or truncate every training sequence to exactly max_length token ids.
train_padded=pad_sequences(train_seq,maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [84]:
# Check the padded length of one training row.
len(train_padded[10000])

200

In [85]:
# Encode and pad the test texts with the tokenizer that was fitted on the training texts.
test_seq=tokenizer.texts_to_sequences(X_test)
test_padded=pad_sequences(test_seq,maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [86]:
# Map labels to consecutive integers: the encoder is fitted on the training
# labels and then reused unchanged for the test labels.
le=LabelEncoder()
y_train=le.fit_transform(y_train)
y_test=le.transform(y_test)

In [87]:
# One-hot encode the integer labels into 2 columns, matching the 2-unit softmax output layer.
# Re-running this cell without re-running the previous one would one-hot encode the already encoded arrays.
y_train=to_categorical(y_train,2)
y_test=to_categorical(y_test,2)



In [88]:
# Stop training once val_loss has failed to improve by at least 0.01 for 7
# epochs in a row, and restore the weights from the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience= 7, min_delta= 0.01, restore_best_weights=True) 

In [99]:
# model = Sequential()



# model.add(Conv1D(filters = 32,kernel_size = 4, padding ='same', strides=1, activation = 'relu',input_shape = (max_length,1)))
# model.add(MaxPooling1D(pool_size=2))

# model.add(Conv1D(filters = 32,kernel_size = 4, padding ='same', strides=1, activation = 'relu'))
# model.add(MaxPooling1D(pool_size=2))

# model.add(Dense(128, activation='relu'))
# model.add(Dense(128, activation='relu'))
# model.add(Dense(64, activation='relu'))

# model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.4))


# model.add(Dense(32,activation='relu'))

# model.add(Dense(1, activation='sigmoid'))

# model = Sequential()

# model.add(Embedding(vocab_size, embedding_dim, input_length = max_length)) 
# # embedding dim - model will take care of embedding, creating 64 dimension vector to compare all words
# model.add(Bidirectional(LSTM(16,return_sequences=True))) ## LSTM - by default activation is tanH and recurrent activation = sigmoid
# model.add(Bidirectional(LSTM(8)))

# model.add(Dense(32,activation='relu'))

# model.add(Dense(2, activation='softmax'))

# Embedding -> bidirectional LSTM (8 units per direction) -> dropout -> 2-way softmax.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(8)),
    Dropout(0.2),
    Dense(2, activation='softmax'),
])

In [100]:
# The network ends in a 2-unit softmax and the targets are one-hot encoded
# (to_categorical), so categorical cross-entropy is the matching loss.
# binary_crossentropy is intended for a single sigmoid output per label.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [101]:
# Add a trailing axis of size 1: (samples, max_length) -> (samples, max_length, 1).
# NOTE(review): the fit call in this notebook passes train_padded / test_padded,
# not these reshaped copies. They look like leftovers from the commented-out
# Conv1D model - confirm before removing.
train_padded2 = train_padded.reshape(train_padded.shape[0], max_length,1)
test_padded2 = test_padded.reshape(test_padded.shape[0], max_length,1)

In [102]:
# Train for up to 50 epochs in batches of 128, with early stopping on val_loss.
# NOTE(review): the test split is passed as validation_data, so early stopping
# and best-weight restoration are driven by the test set. Any score later
# reported on that same split is therefore optimistic; consider carving a
# separate validation set out of the training data.
history=model.fit(train_padded,y_train,validation_data=(test_padded,y_test),batch_size=128, epochs=50, callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50


In [None]:
## 0.9133 and 0.8588

In [None]:
# Load the external sample file to be scored by the trained model.
testmain_df = pd.read_csv('data/testSam.csv')


In [None]:
# Raw sequences from the sample file that is to be scored.
sequence = list(testmain_df['Sequence'])

# Split every *sample* sequence into overlapping 4-character words and join
# them with spaces, mirroring the preprocessing applied to the training data.
# The original loop iterated over sequence_promo (the training promoters), so
# the model was scoring the wrong data and the number of predictions did not
# correspond to the rows of testmain_df.
words_test_promo = [
    " ".join(row[i:i + 4] for i in range(len(row) - 3))
    for row in sequence
]

# One row per sample sequence, in the same order as testmain_df.
# Naming the column directly keeps 'seq' present even when there are no rows.
testmain_df_plus = pd.DataFrame({'seq': words_test_promo})
testmain_df_plus['seq']

In [None]:
# Encode the sample texts with the tokenizer fitted on the training data. The
# refit below stays disabled so the word-to-id mapping matches what the model
# was trained on.
#tokenizer.fit_on_texts(testmain_df_plus['seq'])
testmain_seq=tokenizer.texts_to_sequences(testmain_df_plus['seq'])
# Pad or truncate to the same fixed length used for training.
testmain_padded=pad_sequences(testmain_seq,maxlen=max_length, padding=padding_type, truncating=trunc_type)
testmain_padded

In [None]:
# Score the padded sample sequences with the trained model and show how many rows were scored.
y_pred=model.predict(testmain_padded)
len(y_pred)

In [None]:
# Display the raw model outputs (one row of 2 softmax values per input).
y_pred

In [None]:
# Turn each pair of softmax outputs into a hard label: class 0 when the first
# output is at least 0.5, otherwise class 1.
y_pred_plus = [0 if scores[0] >= 0.5 else 1 for scores in y_pred]
y_pred_plus

In [None]:
# Attach the hard labels to the sample frame. y_pred_plus must have exactly one
# entry per row of testmain_df, otherwise pandas raises a length-mismatch error.
testmain_df['predictions']=y_pred_plus

In [None]:
# Save the sample frame, including the new 'predictions' column, without the index.
testmain_df.to_csv('./data/testSam_RNN.csv',index=False)

In [None]:
# NOTE(review): `test_data` is not defined in any visible cell of this notebook,
# so this line raises NameError on a fresh kernel unless it is defined elsewhere.
# It also writes to the same path the sample input was read from
# (data/testSam.csv), which would overwrite the input file - confirm the intent
# or remove this cell.
test_data.to_csv('./data/testSam.csv',index=False)