In [1]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

def parse_fasta(file):
    """Parse a FASTA file into a ``{header: sequence}`` dictionary.

    Parameters
    ----------
    file : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    dict
        Maps every header line (kept verbatim, including the leading ``>``)
        to its sequence, with wrapped sequence lines joined together. Keys
        appear in file order. If a header occurs more than once, the last
        record with that header wins.

    Raises
    ------
    ValueError
        If sequence data appears before the first header line.
    """
    genomes = {}
    header = None
    chunks = []  # sequence lines of the record currently being read
    with open(file, "r") as f:
        for line in f:
            line = line.rstrip('\n')
            if line.startswith(">"):
                # A new record starts: store the one collected so far.
                if header is not None:
                    genomes[header] = ''.join(chunks)
                header = line
                chunks = []
            elif line:
                if header is None:
                    raise ValueError(
                        "sequence data found before the first '>' header in %r" % (file,)
                    )
                # Collect pieces and join once instead of re-copying the
                # growing string on every line.
                chunks.append(line)
    if header is not None:
        genomes[header] = ''.join(chunks)
    return genomes

# Paths of the two FASTA inputs.
type_1_fa, type_2_fa = 'dengue_1.fa', 'dengue_2.fa'

# Header -> sequence dictionaries, one per input file.
type_1 = parse_fasta(type_1_fa)
type_2 = parse_fasta(type_2_fa)

# Every type_1 sequence first, followed by every type_2 sequence.
all_sequences = [*type_1.values(), *type_2.values()]

In [6]:
# Class labels aligned with all_sequences: 0 per type_1 record, then 1 per type_2 record.
labels = np.array([0] * len(type_1) + [1] * len(type_2))
len(labels)

3905

In [2]:
# Sanity check: number of records per input file and the combined total.
print(len(type_1))
print(len(type_2))
print(len(all_sequences))

2203
1702
3905


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Single-character counts, used to inspect which symbols occur in the sequences.
count_vect = CountVectorizer(analyzer='char')
X = count_vect.fit_transform(all_sequences)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back on older releases. list() keeps `chars` a plain list.
try:
    chars = list(count_vect.get_feature_names_out())
except AttributeError:
    chars = count_vect.get_feature_names()

In [10]:
# Display the distinct characters found by the character-level vectorizer.
chars

['a', 'b', 'c', 'd', 'g', 'h', 'k', 'm', 'n', 'r', 's', 't', 'v', 'w', 'y']

In [115]:
# Character n-gram features (n = 4, 5 and 6), re-weighted with TF-IDF.
# The `five_gram*` names are historical; the matrices hold all three n-gram sizes.
from sklearn.feature_extraction.text import TfidfTransformer
count_vect = CountVectorizer(analyzer='char',ngram_range=(4,6))
X = count_vect.fit_transform(all_sequences)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back on older releases. list() keeps `chars` a plain list.
try:
    chars = list(count_vect.get_feature_names_out())
except AttributeError:
    chars = count_vect.get_feature_names()
# Dense raw counts, kept because later cells index into them directly.
five_gram = X.toarray()
# Fit and apply TF-IDF on the sparse counts: the transformer works on sparse data
# internally, so handing it the dense copy only costs an extra conversion.
tf_transformer = TfidfTransformer(use_idf=True).fit(X) #Enable inverse-document-frequency reweighting
five_gram_tf = tf_transformer.transform(X).toarray()

In [12]:
#np.set_printoptions(threshold=10)

In [21]:
# NOTE(review): in np.set_printoptions a value of None appears to mean "leave this
# option unchanged", so this call is likely a no-op rather than a reset — confirm,
# and pass an explicit number (1000 is the documented default) if a reset is intended.
np.set_printoptions(threshold=None)

In [116]:
# Dimensions of the dense TF-IDF matrix: (number of sequences, number of n-gram features).
print(five_gram_tf.shape)

(3905, 16621)


In [114]:
#all_sequences[2201]

In [49]:
# Raw count stored for sequence row 2200 in feature column 105.
five_gram[2200, 105]

8

In [15]:
# TF-IDF feature vector of row 2203 (the first type_2 sequence when type_1 holds
# 2203 records, since all_sequences lists type_1 before type_2).
five_gram_tf[2203]

array([0.22607261, 0.07762393, 0.02606351, ..., 0.        , 0.        ,
       0.        ])

In [33]:
# N-gram string that feature column 105 corresponds to.
chars[105]

'aacgc'

In [11]:
# Number of records loaded from each input file.
print(len(type_1))
print(len(type_2))

2203
1702


In [23]:
# Number of n-gram features with a zero raw count in the first sequence.
sum(five_gram[0] == 0)

5408

In [24]:
# Number of n-gram features with a zero TF-IDF weight in the first sequence.
sum(five_gram_tf[0] == 0)

5408

In [21]:
# Largest TF-IDF weight in row 8.
max(five_gram_tf[8])

0.21198649481042328

In [93]:
from sklearn.model_selection import train_test_split

In [118]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
# NOTE(review): five_gram_tf was TF-IDF-fitted on every sample before this split, so
# the IDF weights have already seen the test rows — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    five_gram_tf,
    labels,
    test_size=0.3,
    random_state=46,
    shuffle=True,
)
# Trailing channel axis, currently disabled (the arrays stay 2-D):
#X_train = X_train[:,:,np.newaxis]
#X_test = X_test[:,:,np.newaxis]
print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep='\n')

(2733, 16621)
(2733,)
(1172, 16621)
(1172,)


In [41]:
# Peek at the training labels (0 = type_1, 1 = type_2).
y_train

array([0, 1, 1, ..., 0, 1, 1])

In [97]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPooling2D,MaxPooling1D

# K-MER (3-5) WITH CNN

In [95]:
def deep_cnn(conv_filters=(8, 16, 32, 64, 128), dense_units=(256, 128, 64),
             kernel_size=5, pool_size=2, dropout_rate=0.4):
    """Build an uncompiled 1-D convolutional binary classifier.

    The network is a stack of convolutional stages
    (Conv1D -> BatchNormalization -> ReLU -> MaxPooling1D), followed by
    Flatten, a stack of dense stages (Dense+ReLU -> Dropout ->
    BatchNormalization) and a single sigmoid output unit. With the default
    arguments the layers are exactly those of the original hand-written
    version.

    Parameters
    ----------
    conv_filters : iterable of int
        Number of filters of each convolutional stage, in order.
    dense_units : iterable of int
        Width of each hidden dense stage, in order.
    kernel_size : int
        Kernel size shared by all Conv1D layers.
    pool_size : int
        Pool size shared by all MaxPooling1D layers.
    dropout_rate : float
        Dropout rate applied after every hidden dense layer.

    Returns
    -------
    Sequential
        The model, not yet compiled. No input shape is fixed here; the
        Conv1D layers require 3-D input (samples, steps, channels).
    """
    model = Sequential()

    # Convolutional feature extractor; 'same' padding keeps the length, so
    # only the pooling layers shorten the sequence axis.
    for n_filters in conv_filters:
        model.add(Conv1D(filters=n_filters,kernel_size=kernel_size,padding='same',activation=None))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(MaxPooling1D(pool_size=pool_size))

    model.add(Flatten())

    # Fully connected head.
    for n_units in dense_units:
        model.add(Dense(n_units,activation='relu'))
        model.add(Dropout(dropout_rate))
        model.add(BatchNormalization())

    # One output unit squashed to a probability.
    model.add(Dense(1,activation=None))
    model.add(Activation('sigmoid'))
    return model

In [50]:
# Build the CNN and configure it for binary classification, tracking accuracy.
metrics = ['acc']
model = deep_cnn()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)

In [51]:
# Conv1D needs 3-D input (samples, steps, channels), but the k-mer split leaves
# X_train 2-D. np.atleast_3d appends the channel axis to a 2-D array and leaves an
# array that is already 3-D untouched, so the cell works in either case.
# The last 20% of the training rows are used for validation.
model.fit(np.atleast_3d(X_train),y_train,batch_size=64,epochs=20,validation_split=0.2,shuffle=True)

Train on 2186 samples, validate on 547 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1ab03409a48>

In [48]:
# Print the layer-by-layer summary of the current model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              multiple                  48        
_________________________________________________________________
batch_normalization (BatchNo multiple                  32        
_________________________________________________________________
activation (Activation)      multiple                  0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) multiple                  0         
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  656       
_________________________________________________________________
batch_normalization_1 (Batch multiple                  64        
_________________________________________________________________
activation_1 (Activation)    multiple                  0

In [53]:
# Predicted probabilities for the first 100 test rows. The convolutional model needs
# 3-D input, so add the channel axis if the k-mer features are still 2-D.
model.predict(np.atleast_3d(X_test[:100]))

array([[0.9764712 ],
       [0.9586891 ],
       [0.95392287],
       [0.9753574 ],
       [0.95199347],
       [0.99687874],
       [0.9371993 ],
       [0.99867547],
       [0.9961858 ],
       [0.934059  ],
       [0.98495626],
       [0.9972213 ],
       [0.9961424 ],
       [0.99667025],
       [0.9776434 ],
       [0.9855871 ],
       [0.9958124 ],
       [0.9611974 ],
       [0.96472   ],
       [0.9960613 ],
       [0.9985901 ],
       [0.9983041 ],
       [0.9650824 ],
       [0.9964684 ],
       [0.9977435 ],
       [0.9794482 ],
       [0.99688303],
       [0.9963449 ],
       [0.9740546 ],
       [0.96482474],
       [0.9498757 ],
       [0.9386309 ],
       [0.99623656],
       [0.96985173],
       [0.95050514],
       [0.97305757],
       [0.9985273 ],
       [0.9964488 ],
       [0.9780093 ],
       [0.98245966],
       [0.9588425 ],
       [0.995592  ],
       [0.9797398 ],
       [0.96747226],
       [0.99638   ],
       [0.95584303],
       [0.9379532 ],
       [0.939

# K-MER (4-6) WITH SIMPLE DENSE LAYERS

In [119]:
def simple_MLP(hidden_units=(2048, 1024, 512, 256, 128, 64), dropout_rate=0.4):
    """Build an uncompiled fully connected binary classifier.

    Every hidden stage is Dense+ReLU -> Dropout -> BatchNormalization; the
    output is a single sigmoid unit. With the default arguments the layers
    are exactly those of the original hand-written version.

    Parameters
    ----------
    hidden_units : iterable of int
        Width of each hidden dense stage, in order.
    dropout_rate : float
        Dropout rate applied after every hidden dense layer.

    Returns
    -------
    Sequential
        The model, not yet compiled and without a fixed input shape.
    """
    model = Sequential()

    for n_units in hidden_units:
        model.add(Dense(n_units,activation='relu'))
        model.add(Dropout(dropout_rate))
        model.add(BatchNormalization())

    # One output unit squashed to a probability.
    model.add(Dense(1,activation=None))
    model.add(Activation('sigmoid'))
    return model

In [120]:
# Build the dense network and configure it for binary classification, tracking accuracy.
metrics = ['acc']
model = simple_MLP()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)


In [121]:
# Train for 20 epochs in batches of 64; validation_split=0.2 holds out 20% of the
# training rows for validation.
model.fit(X_train,y_train,batch_size=64,epochs=20,validation_split=0.2,shuffle=True)

Train on 2186 samples, validate on 547 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1d0c7868088>

In [127]:
# Print the layer-by-layer summary of the current model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              multiple                  34041856  
_________________________________________________________________
dropout_3 (Dropout)          multiple                  0         
_________________________________________________________________
batch_normalization_8 (Batch multiple                  8192      
_________________________________________________________________
dense_5 (Dense)              multiple                  2098176   
_________________________________________________________________
dropout_4 (Dropout)          multiple                  0         
_________________________________________________________________
batch_normalization_9 (Batch multiple                  4096      
_________________________________________________________________
dense_6 (Dense)              multiple                 

In [122]:
# Predicted probabilities (sigmoid outputs) for the first 50 test rows.
model.predict(X_test[:50])

array([[2.6795268e-04],
       [3.5977364e-04],
       [3.4987926e-04],
       [2.6577711e-04],
       [3.9935112e-04],
       [9.9998796e-01],
       [4.3132901e-04],
       [9.9999189e-01],
       [9.9999064e-01],
       [3.2997131e-04],
       [3.0061603e-04],
       [9.9999213e-01],
       [9.9999058e-01],
       [9.9998915e-01],
       [3.1512976e-04],
       [3.0490756e-04],
       [9.9999094e-01],
       [2.4056435e-04],
       [6.5639615e-04],
       [9.9998844e-01],
       [9.9999213e-01],
       [9.9999297e-01],
       [3.1220913e-04],
       [9.9998820e-01],
       [9.9999201e-01],
       [2.5877357e-04],
       [9.9998987e-01],
       [9.9998248e-01],
       [3.0389428e-04],
       [2.5567412e-04],
       [4.0256977e-04],
       [3.1021237e-04],
       [9.9998844e-01],
       [2.9331446e-04],
       [4.0051341e-04],
       [2.6825070e-04],
       [9.9999255e-01],
       [9.9998850e-01],
       [2.7626753e-04],
       [3.1101704e-04],
       [2.7167797e-04],
       [9.999900

In [129]:
# Evaluate on the held-out split. The call must run here: with it commented out,
# score_kmer only existed as leftover kernel state and the print raised NameError
# on a fresh kernel.
score_kmer = model.evaluate(X_test, y_test)
print('mlp k-mer test loss, test acc:', score_kmer)

mlp k-mer test loss, test acc: [0.00017948496301853933, 1.0]


In [126]:
# Reset metrics before saving so that loaded model has same state,
# since metric states are not preserved by Model.save_weights
model.reset_metrics()
# Make sure the target directory exists so saving works on a fresh checkout.
os.makedirs('saved_models', exist_ok=True)
model.save('saved_models/kmer_mlp.h5')

# ASCII ENCODING WITH CNN

In [89]:
def encode_with_ascii(sequences):
    """Encode strings as a zero-padded matrix of character codes.

    Parameters
    ----------
    sequences : sequence of str
        Strings to encode; they may have different lengths.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(sequences), longest_length)``. Entry
        ``[i, j]`` is ``ord(sequences[i][j])``; positions past the end of a
        sequence are 0. An empty input gives an array of shape ``(0, 0)``.
    """
    # default=0 keeps an empty input from raising inside max().
    length = max((len(s) for s in sequences), default=0)
    result = np.zeros((len(sequences), length))
    for row, seq in enumerate(sequences):
        if seq:
            # Fill the whole row prefix in one assignment; the padding zeros
            # are already in place from np.zeros.
            result[row, :len(seq)] = np.fromiter(
                map(ord, seq), dtype=result.dtype, count=len(seq)
            )
    return result

In [56]:
# Total number of sequences that will be encoded.
len(all_sequences)

3905

In [85]:
# Zero-padded character-code matrix: one row per sequence, one column per position.
ascii_en = encode_with_ascii(all_sequences)

(3905, 11195)
hello


In [91]:
# Check that there is one label per encoded sequence.
labels.shape

(3905,)

In [94]:
# Same 70/30 split and seed as before, now on the ASCII-encoded sequences.
X_train, X_test, y_train, y_test = train_test_split(
    ascii_en,
    labels,
    test_size=0.3,
    random_state=46,
    shuffle=True,
)
# Append a single-channel axis: (samples, steps) -> (samples, steps, 1).
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]
print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep='\n')

(2733, 11195, 1)
(2733,)
(1172, 11195, 1)
(1172,)


In [98]:
# Build, compile and train a fresh CNN on the ASCII-encoded sequences.
metrics = ['acc']
model = deep_cnn()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.2,
    shuffle=True,
)

Train on 2186 samples, validate on 547 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1d0d2725c08>

In [99]:
# Predicted probabilities (sigmoid outputs) for the first 50 test rows.
model.predict(X_test[:50])

array([[0.00156677],
       [0.00156543],
       [0.00127131],
       [0.0016906 ],
       [0.00125459],
       [0.8539931 ],
       [0.00131249],
       [0.8539931 ],
       [0.8539931 ],
       [0.00177988],
       [0.00177145],
       [0.8539931 ],
       [0.8539931 ],
       [0.8539931 ],
       [0.00148046],
       [0.00154701],
       [0.8539931 ],
       [0.0015904 ],
       [0.8539931 ],
       [0.8539931 ],
       [0.8539931 ],
       [0.8539931 ],
       [0.00224245],
       [0.8539931 ],
       [0.8539931 ],
       [0.00146285],
       [0.8539931 ],
       [0.8539931 ],
       [0.00170007],
       [0.001634  ],
       [0.00120255],
       [0.00151673],
       [0.8539931 ],
       [0.00149265],
       [0.00120792],
       [0.00161272],
       [0.8539931 ],
       [0.8539931 ],
       [0.00141895],
       [0.0015308 ],
       [0.00158936],
       [0.8539931 ],
       [0.00167733],
       [0.00176188],
       [0.8539931 ],
       [0.00159222],
       [0.00156546],
       [0.001

In [100]:
# True labels of the first 50 test rows, for comparison with the predictions.
y_test[:50]

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0])

In [128]:
# Evaluate on the held-out split. The call must run here: with it commented out,
# score_ascii only existed as leftover kernel state and the print raised NameError
# on a fresh kernel.
score_ascii = model.evaluate(X_test, y_test)
print('test loss, test acc:', score_ascii)

test loss, test acc: [0.22303773050829007, 0.9215017]


In [104]:
# Reset metrics before saving so that loaded model has same state,
# since metric states are not preserved by Model.save_weights
# (the model itself is written to disk in the following cell).
model.reset_metrics()

In [106]:
# Make sure the target directory exists so saving works on a fresh checkout.
os.makedirs('saved_models', exist_ok=True)
model.save('saved_models/ascii_cnn.h5')

In [111]:
# put ascii code in between -1 and 1
# Codes 65..90 are mapped linearly onto [-1, 1]; zero entries (padding) stay 0.
# One masked array operation replaces the per-element Python loop.
# NOTE: ascii_en is rescaled in place, so running this cell twice scales the data twice.
# NOTE(review): codes outside 65..90 (e.g. lowercase letters) land outside [-1, 1] —
# confirm the sequences are uppercase.
nonzero_mask = ascii_en != 0
ascii_en[nonzero_mask] = (ascii_en[nonzero_mask]-(65+25/2))/(25/2)

In [None]:
# Repeat the split and CNN training on the rescaled ASCII encoding.
X_train, X_test, y_train, y_test = train_test_split(
    ascii_en,
    labels,
    test_size=0.3,
    random_state=46,
    shuffle=True,
)
# Append a single-channel axis: (samples, steps) -> (samples, steps, 1).
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]
print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep='\n')

metrics = ['acc']
model = deep_cnn()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.2,
    shuffle=True,
)