In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

def parse_fasta(file):
    """Read a FASTA file into a dict mapping each header line to its sequence.

    Parameters
    ----------
    file : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    dict
        Keys are the complete header lines (including the leading ``>``);
        values are that record's sequence lines joined into a single string.
        When a header occurs more than once, the last record replaces the
        earlier one.

    Raises
    ------
    ValueError
        If sequence data appears before the first header line.
    """
    # Collect sequence lines per header and join once at the end, instead of
    # growing a string with repeated concatenation.
    chunks = {}
    curr = None
    with open(file, "r") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.rstrip("\r\n")
            if line.startswith(">"):
                curr = line
                chunks[curr] = []  # a repeated header restarts its record
                continue
            if not line.strip():
                # Blank lines carry no sequence data (and may precede the first header).
                continue
            if curr is None:
                raise ValueError(
                    "{}: line {}: sequence data found before any '>' header".format(file, lineno)
                )
            chunks[curr].append(line)
    return {header: "".join(parts) for header, parts in chunks.items()}

# Parse the two FASTA inputs, then pool every sequence (type 1 first, type 2 after).
type_1_fa, type_2_fa = 'dengue_1.fa', 'dengue_2.fa'
type_1, type_2 = (parse_fasta(fasta_path) for fasta_path in (type_1_fa, type_2_fa))

all_sequences = [*type_1.values(), *type_2.values()]

In [30]:
# Record counts: type 1, type 2, then the pooled list.
for collection in (type_1, type_2, all_sequences):
    print(len(collection))

2203
1702
3905


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Count single characters to find out which symbols occur in the sequences.
count_vect = CountVectorizer(analyzer='char')
X = count_vect.fit_transform(all_sequences)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    chars = count_vect.get_feature_names_out().tolist()
else:
    chars = count_vect.get_feature_names()

In [4]:
# Show the single-character vocabulary; the output contains letters beyond
# a/c/g/t (presumably IUPAC ambiguity codes -- TODO confirm).
chars

['a', 'b', 'c', 'd', 'g', 'h', 'k', 'm', 'n', 'r', 's', 't', 'v', 'w', 'y']

In [5]:
# Character n-gram features (3- to 5-grams, per ngram_range) with TF-IDF weighting
from sklearn.feature_extraction.text import TfidfTransformer

count_vect = CountVectorizer(analyzer='char', ngram_range=(3, 5))
X = count_vect.fit_transform(all_sequences)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    chars = count_vect.get_feature_names_out().tolist()
else:
    chars = count_vect.get_feature_names()
five_gram = X.toarray()  # dense raw counts, kept for inspection in later cells

# Fit and apply TF-IDF directly on the sparse count matrix; only the final
# result is densified, avoiding a dense -> sparse -> dense round trip.
# NOTE(review): the IDF weights are fitted on every sequence, including those
# later held out by train_test_split -- consider fitting on the training rows only.
tf_transformer = TfidfTransformer(use_idf=True).fit(X)  # enable inverse-document-frequency reweighting
five_gram_tf = tf_transformer.transform(X).toarray()

In [12]:
# Summarise printed arrays once they exceed this many elements.
PRINT_THRESHOLD = 10
np.set_printoptions(threshold=PRINT_THRESHOLD)

In [10]:
# Dimensions of the dense TF-IDF matrix: (rows, columns).
n_rows, n_cols = five_gram_tf.shape
print((n_rows, n_cols))

(3905, 6739)


In [6]:
# Display the n-gram vocabulary produced by the vectorizer.
# NOTE(review): this prints every feature name; a slice such as chars[:20]
# would keep the output short.
chars

['aaa',
 'aaaa',
 'aaaaa',
 'aaaab',
 'aaaac',
 'aaaad',
 'aaaag',
 'aaaah',
 'aaaak',
 'aaaam',
 'aaaan',
 'aaaar',
 'aaaas',
 'aaaat',
 'aaaaw',
 'aaaay',
 'aaab',
 'aaaba',
 'aaac',
 'aaaca',
 'aaacc',
 'aaacg',
 'aaacn',
 'aaacr',
 'aaact',
 'aaacy',
 'aaad',
 'aaadc',
 'aaag',
 'aaaga',
 'aaagc',
 'aaagg',
 'aaagk',
 'aaagn',
 'aaagr',
 'aaagt',
 'aaagy',
 'aaah',
 'aaaha',
 'aaahc',
 'aaak',
 'aaaka',
 'aaakc',
 'aaakt',
 'aaam',
 'aaama',
 'aaamc',
 'aaamg',
 'aaan',
 'aaanc',
 'aaang',
 'aaant',
 'aaar',
 'aaara',
 'aaarc',
 'aaarg',
 'aaarr',
 'aaart',
 'aaary',
 'aaas',
 'aaasa',
 'aaasc',
 'aaat',
 'aaata',
 'aaatc',
 'aaatg',
 'aaatm',
 'aaatn',
 'aaatr',
 'aaatt',
 'aaaty',
 'aaaw',
 'aaawc',
 'aaawt',
 'aaay',
 'aaaya',
 'aaayc',
 'aaayg',
 'aaayt',
 'aab',
 'aaba',
 'aabaa',
 'aac',
 'aaca',
 'aacaa',
 'aacac',
 'aacag',
 'aacak',
 'aacam',
 'aacan',
 'aacar',
 'aacat',
 'aacay',
 'aacc',
 'aacca',
 'aaccc',
 'aaccg',
 'aacck',
 'aaccn',
 'aaccr',
 'aacct',
 'aaccw',
 'a

In [11]:
# Number of parsed records per input file.
for records in (type_1, type_2):
    print(len(records))

2203
1702


In [23]:
# Number of n-gram features with a zero count in the first sequence
# (vectorised count instead of iterating the array with the builtin sum).
np.count_nonzero(five_gram[0] == 0)

5408

In [24]:
# Number of zero TF-IDF weights in the first sequence
# (vectorised count instead of iterating the array with the builtin sum).
np.count_nonzero(five_gram_tf[0] == 0)

5408

In [21]:
# Largest TF-IDF weight in row 8 (NumPy reduction instead of the builtin max).
five_gram_tf[8].max()

0.21198649481042328

In [38]:
# Class labels aligned with all_sequences: 0 for each type-1 record, then 1 for each type-2 record.
labels = np.repeat([0, 1], [len(type_1), len(type_2)])

In [39]:
# Total number of labels.
labels.shape[0]

3905

In [35]:
from sklearn.model_selection import train_test_split

In [42]:
# Hold out 30% of the samples for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    five_gram_tf,
    labels,
    test_size=0.3,
    shuffle=True,
    random_state=46,
)
# Append a trailing length-1 axis so each sample has shape (n_features, 1).
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]
for split_array in (X_train, y_train, X_test, y_test):
    print(split_array.shape)

(2733, 6739, 1)
(2733,)
(1172, 6739, 1)
(1172,)


In [41]:
# Peek at the shuffled training labels (0 = type 1, 1 = type 2).
y_train

array([0, 1, 1, ..., 0, 1, 1])

In [43]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPooling2D,MaxPooling1D

In [49]:
def deep_cnn(conv_filters=(8, 16, 32, 64, 128), dense_units=(256, 128, 64),
             kernel_size=5, pool_size=2, dropout_rate=0.4):
    """Build an (uncompiled) 1-D convolutional binary classifier.

    The network is a stack of convolutional stages, a flatten, a stack of
    dense stages and a single sigmoid output unit. Called with no arguments
    it builds five conv stages (8..128 filters) and three dense stages
    (256, 128, 64 units).

    Parameters
    ----------
    conv_filters : sequence of int
        Filter count for each stage of Conv1D -> BatchNormalization -> ReLU
        -> MaxPooling1D.
    dense_units : sequence of int
        Width of each stage of Dense(ReLU) -> Dropout -> BatchNormalization.
    kernel_size : int
        Kernel size shared by every Conv1D layer.
    pool_size : int
        Pool size shared by every MaxPooling1D layer.
    dropout_rate : float
        Dropout rate used after every hidden dense layer.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The assembled model. No input shape is declared here, so presumably
        the weights are created on first use -- TODO confirm.
    """
    model = Sequential()

    # Convolutional feature extractor: one identical stage per filter count.
    for n_filters in conv_filters:
        model.add(Conv1D(filters=n_filters, kernel_size=kernel_size, padding='same', activation=None))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(MaxPooling1D(pool_size=pool_size))

    model.add(Flatten())

    # Fully connected head: one identical stage per layer width.
    for n_units in dense_units:
        model.add(Dense(n_units, activation='relu'))
        model.add(Dropout(dropout_rate))
        model.add(BatchNormalization())

    # Single logit followed by a sigmoid for the binary output.
    model.add(Dense(1, activation=None))
    model.add(Activation('sigmoid'))
    return model

In [50]:
# Build a fresh network and configure it for binary classification.
metrics = ['acc']
model = deep_cnn()
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=metrics,
)

In [51]:
# Train for 20 epochs, reserving 20% of the training samples for validation.
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=64,
    shuffle=True,
    validation_split=0.2,
)

Train on 2186 samples, validate on 547 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1ab03409a48>

In [48]:
# Print the layer-by-layer summary (layer types, output shapes, parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              multiple                  48        
_________________________________________________________________
batch_normalization (BatchNo multiple                  32        
_________________________________________________________________
activation (Activation)      multiple                  0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) multiple                  0         
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  656       
_________________________________________________________________
batch_normalization_1 (Batch multiple                  64        
_________________________________________________________________
activation_1 (Activation)    multiple                  0

In [53]:
# Sigmoid outputs of the model for the first 100 test samples.
# NOTE(review): every score displayed for this cell is above 0.9 although
# y_test contains both classes -- check whether the model has collapsed to a
# single class before trusting the reported accuracy.
model.predict(X_test[:100])

array([[0.9764712 ],
       [0.9586891 ],
       [0.95392287],
       [0.9753574 ],
       [0.95199347],
       [0.99687874],
       [0.9371993 ],
       [0.99867547],
       [0.9961858 ],
       [0.934059  ],
       [0.98495626],
       [0.9972213 ],
       [0.9961424 ],
       [0.99667025],
       [0.9776434 ],
       [0.9855871 ],
       [0.9958124 ],
       [0.9611974 ],
       [0.96472   ],
       [0.9960613 ],
       [0.9985901 ],
       [0.9983041 ],
       [0.9650824 ],
       [0.9964684 ],
       [0.9977435 ],
       [0.9794482 ],
       [0.99688303],
       [0.9963449 ],
       [0.9740546 ],
       [0.96482474],
       [0.9498757 ],
       [0.9386309 ],
       [0.99623656],
       [0.96985173],
       [0.95050514],
       [0.97305757],
       [0.9985273 ],
       [0.9964488 ],
       [0.9780093 ],
       [0.98245966],
       [0.9588425 ],
       [0.995592  ],
       [0.9797398 ],
       [0.96747226],
       [0.99638   ],
       [0.95584303],
       [0.9379532 ],
       [0.939

In [54]:
# True labels of the held-out test samples, for comparison with the predictions.
y_test

array([0, 0, 0, ..., 1, 1, 0])