In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

def parse_fasta(file):
    """Parse a FASTA file into a dict mapping header lines to sequences.

    Args:
        file: Path of the FASTA file to read.

    Returns:
        dict: Keys are the full header lines (including the leading ">");
        values are that record's sequence lines concatenated into a single
        string. If the same header occurs more than once, the last record
        replaces the earlier one.

    Raises:
        ValueError: If non-blank sequence data appears before the first
            header line.
    """
    # header -> list of sequence lines; joined once at the end instead of
    # re-building an ever-growing string for every line of a long genome.
    chunks = {}
    curr = None
    with open(file, "r") as f:
        for line in f:
            line = line.rstrip('\r\n')
            if line.startswith(">"):
                curr = line
                chunks[curr] = []
                continue
            if curr is None:
                # Blank lines before the first record are harmless, but real
                # sequence data without a header must not be silently lost.
                if line.strip():
                    raise ValueError(
                        "sequence data found before the first FASTA header in %r" % (file,))
                continue
            chunks[curr].append(line)
    return {header: ''.join(parts) for header, parts in chunks.items()}

# Input FASTA files, one per HIV subtype.
B_fa = 'hiv_subtypeB.fasta'
C_fa = 'hiv_subtypeC.fasta'

# Load each file into a {header: sequence} dict, then report how many
# records each subtype contains (one count per line).
typeB = parse_fasta(B_fa)
typeC = parse_fasta(C_fa)
print(len(typeB), len(typeC), sep='\n')

5460
1641


In [8]:
import random

# Fixed seed so the same subtype-B subsample is drawn on every run; an
# unseeded shuffle made every downstream result irreproducible.
SUBSAMPLE_SEED = 46

# Keep only the sequences (headers are dropped). The isinstance guard keeps
# this cell re-runnable: after the first run typeB is already a list, and
# indexing it by header again would raise TypeError.
typeB = list(typeB.values()) if isinstance(typeB, dict) else list(typeB)
random.Random(SUBSAMPLE_SEED).shuffle(typeB)
print(len(typeB))
# Retain the first 2000 shuffled sequences.
typeB = typeB[:2000]
print(len(typeB))

5460
2000


In [9]:
# Subtype-B sequences first, followed by every subtype-C sequence
# (the FASTA headers of typeC are dropped here).
all_sequences = typeB + list(typeC.values())
print(len(all_sequences))

3641


In [10]:
# Class labels in the same order as all_sequences: 0 for each subtype-B
# sequence, then 1 for each subtype-C sequence.
labels = np.array([0] * len(typeB) + [1] * len(typeC))
len(labels)

3641

In [13]:
# Length of the longest sequence in the combined data set.
max(map(len, all_sequences))

14825

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Single-character counts, used to inspect which symbols occur in the sequences.
count_vect = CountVectorizer(analyzer='char')
X = count_vect.fit_transform(all_sequences)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old method on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    chars = list(count_vect.get_feature_names_out())
else:
    chars = count_vect.get_feature_names()

In [15]:
chars

['a', 'b', 'c', 'd', 'g', 'h', 'k', 'm', 'n', 'r', 's', 't', 'v', 'w', 'y']

In [16]:
# 5- and 6-gram (character k-mer) TF-IDF feature generation
from sklearn.feature_extraction.text import TfidfTransformer
count_vect = CountVectorizer(analyzer='char',ngram_range=(5,6))
X = count_vect.fit_transform(all_sequences)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old method on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    chars = list(count_vect.get_feature_names_out())
else:
    chars = count_vect.get_feature_names()
# Keep the k-mer counts sparse: TfidfTransformer works on sparse input, so
# densifying the count matrix first only cost memory and time.
# Call five_gram.toarray() if dense counts are ever needed.
five_gram = X
tf_transformer = TfidfTransformer(use_idf=True).fit(five_gram) #Enable inverse-document-frequency reweighting
# Densify once, at the very end, because the downstream cells slice and feed
# this matrix as a plain ndarray.
five_gram_tf = tf_transformer.transform(five_gram).toarray()

In [21]:
# Sanity check: (number of sequences, number of k-mer features).
print(five_gram_tf.shape)

(3641, 25371)


In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Shuffled 70/30 train/test split of the TF-IDF k-mer features; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    five_gram_tf, labels, test_size=0.3, shuffle=True, random_state=46)
# The features stay 2-D here: no trailing channel axis is added for this split.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(2548, 25371)
(2548,)
(1093, 25371)
(1093,)


In [24]:
# Peek at the training labels (0/1 class ids) after the shuffled split.
y_train

array([0, 1, 0, ..., 0, 1, 1])

In [25]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPooling2D,MaxPooling1D

# K-MER (3-5) WITH CNN

In [35]:
def deep_cnn():
    """Build the (uncompiled) 1-D convolutional binary classifier.

    Five Conv1D -> BatchNormalization -> ReLU -> MaxPooling1D stages with
    8, 16, 32, 64 and 128 filters (kernel size 5, 'same' padding, pool size 2)
    feed a flattened dense head of 256, 128 and 64 ReLU units, each followed
    by Dropout(0.4) and BatchNormalization, and a single sigmoid output unit.

    Returns:
        A ``Sequential`` model. It is not compiled and no input shape is
        fixed here.
    """
    net = Sequential()

    # Convolutional feature extractor: filter count doubles at every stage.
    for n_filters in (8, 16, 32, 64, 128):
        net.add(Conv1D(filters=n_filters, kernel_size=5, padding='same', activation=None))
        net.add(BatchNormalization())
        net.add(Activation('relu'))
        net.add(MaxPooling1D(pool_size=2))

    net.add(Flatten())

    # Fully connected head: width halves at every stage.
    for n_units in (256, 128, 64):
        net.add(Dense(n_units, activation='relu'))
        net.add(Dropout(0.4))
        net.add(BatchNormalization())

    # Single logit followed by a sigmoid for the binary decision.
    net.add(Dense(1, activation=None))
    net.add(Activation('sigmoid'))
    return net

In [50]:
# Build the CNN and compile it with the Adam optimizer, binary cross-entropy
# loss and accuracy as the reported metric.
model = deep_cnn()
metrics = ['acc']
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=metrics)

In [17]:
#model.fit(X_train,y_train,batch_size=64,epochs=20,validation_split=0.2,shuffle=True)

In [18]:
#model.summary()

In [19]:
#model.predict(X_test[:100])

# K-MER (5-6) WITH SIMPLE DENSE LAYERS

In [26]:
def simple_MLP():
    """Build the (uncompiled) fully connected binary classifier.

    Six Dense(ReLU) -> Dropout(0.4) -> BatchNormalization stages of width
    2048, 1024, 512, 256, 128 and 64 are followed by a single sigmoid
    output unit.

    Returns:
        A ``Sequential`` model. It is not compiled and no input shape is
        fixed here.
    """
    net = Sequential()

    # Hidden stack: width halves at every stage.
    for n_units in (2048, 1024, 512, 256, 128, 64):
        net.add(Dense(n_units, activation='relu'))
        net.add(Dropout(0.4))
        net.add(BatchNormalization())

    # Single logit followed by a sigmoid for the binary decision.
    net.add(Dense(1, activation=None))
    net.add(Activation('sigmoid'))
    return net

In [28]:
# Build the MLP and compile it with the Adam optimizer, binary cross-entropy
# loss and accuracy as the reported metric. This rebinds the global `model`.
model = simple_MLP()
metrics = ['acc']
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=metrics)

In [29]:
# Train the MLP for 20 epochs in batches of 64, holding out 20% of the
# training split for validation.
model.fit(X_train,y_train,batch_size=64,epochs=20,validation_split=0.2,shuffle=True)

Train on 2038 samples, validate on 510 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1f87f6a7d48>

In [30]:
# Sigmoid outputs of the MLP for the first 50 test samples.
model.predict(X_test[:50])

array([[9.9975812e-01],
       [9.9980348e-01],
       [2.0113587e-04],
       [9.9988329e-01],
       [9.9971592e-01],
       [1.9609928e-04],
       [1.9085407e-04],
       [9.9979329e-01],
       [1.8319488e-04],
       [1.7747283e-04],
       [1.7100573e-04],
       [9.9984157e-01],
       [2.2399426e-04],
       [1.5261769e-04],
       [9.9984813e-01],
       [9.9942172e-01],
       [1.8125772e-04],
       [9.9978197e-01],
       [9.9974126e-01],
       [9.9978924e-01],
       [9.9971879e-01],
       [9.9978888e-01],
       [9.9981296e-01],
       [9.9984956e-01],
       [9.9981511e-01],
       [1.6427040e-04],
       [2.0429492e-04],
       [9.9980628e-01],
       [9.9964178e-01],
       [2.1326542e-04],
       [9.9967927e-01],
       [9.9979997e-01],
       [9.9983025e-01],
       [9.9979722e-01],
       [2.0909309e-04],
       [9.9976450e-01],
       [9.9970889e-01],
       [1.8906593e-04],
       [1.7315149e-04],
       [1.9955635e-04],
       [1.8531084e-04],
       [1.628696

In [33]:
# Evaluate the MLP on the held-out test split. This call must stay enabled:
# while it was commented out, the print below only worked off a stale kernel
# variable and raised NameError on a fresh kernel.
score_kmer = model.evaluate(X_test, y_test)
print('mlp k-mer test loss, test acc:', score_kmer)

mlp k-mer test loss, test acc: [0.00021441213504936588, 1.0]


In [34]:
import os

# Reset metrics before saving so that loaded model has same state,
# since metric states are not preserved by Model.save_weights
model.reset_metrics()
# Make sure the output directory exists; saving to an HDF5 path inside a
# missing directory can fail instead of creating it.
os.makedirs('saved_models', exist_ok=True)
model.save('saved_models/kmer_mlp_hiv.h5')

# ASCII ENCODING WITH CNN

In [36]:
def encode_with_ascii(sequences):
    """Encode strings as a zero-padded matrix of character code points.

    Args:
        sequences: Sized, indexable collection of strings (e.g. a list).

    Returns:
        numpy.ndarray: Float array of shape
        ``(len(sequences), longest sequence length)``. Entry ``[i, j]`` is
        ``ord(sequences[i][j])``; positions past the end of a shorter
        sequence are 0. An empty collection yields an array of shape (0, 0).
    """
    # default=0 keeps an empty collection from raising inside max().
    length = max((len(s) for s in sequences), default=0)
    result = np.zeros((len(sequences), length))
    print(result.shape)
    for i, seq in enumerate(sequences):
        if seq:
            # Fill the row prefix in one assignment; the zero-initialised
            # tail already provides the padding.
            result[i, :len(seq)] = [ord(ch) for ch in seq]
    return result

In [37]:
# Encode every sequence as a zero-padded row of character codes.
ascii_en = encode_with_ascii(all_sequences)

(3641, 14825)
hello


In [38]:
# Shuffled 70/30 train/test split of the ASCII-encoded sequences. This
# rebinds X_train/X_test/y_train/y_test, replacing the k-mer split.
X_train, X_test, y_train, y_test = train_test_split(
    ascii_en, labels, test_size=0.3, shuffle=True, random_state=46)
# Append a trailing axis of size 1 so each sample is (sequence_length, 1).
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(2548, 14825, 1)
(2548,)
(1093, 14825, 1)
(1093,)


In [39]:
# Build a fresh CNN and train it on the ASCII-encoded tensors (20 epochs,
# batches of 64, 20% of the training split held out for validation).
# NOTE: the recorded run was stopped by a KeyboardInterrupt during epoch 17,
# so the results recorded after this cell come from a partially trained model.
model = deep_cnn()
metrics = ['acc']
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=metrics)
model.fit(X_train,y_train,batch_size=64,epochs=20,validation_split=0.2,shuffle=True)

Train on 2038 samples, validate on 510 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
 128/2038 [>.............................] - ETA: 1:00 - loss: 0.0157 - acc: 1.0000

KeyboardInterrupt: 

In [40]:
# Sigmoid outputs of the CNN for the first 50 test samples.
model.predict(X_test[:50])

array([[9.94860053e-01],
       [9.67233658e-01],
       [1.63704157e-04],
       [9.97560978e-01],
       [8.85849297e-01],
       [1.13636255e-04],
       [2.88695097e-04],
       [9.94378328e-01],
       [9.20593739e-05],
       [1.32292509e-04],
       [2.64167786e-04],
       [9.98015344e-01],
       [1.60038471e-04],
       [1.02818012e-04],
       [9.87549424e-01],
       [9.93807197e-01],
       [9.16421413e-05],
       [9.48706508e-01],
       [9.53589916e-01],
       [9.77461934e-01],
       [9.90941405e-01],
       [9.97562289e-01],
       [9.99549210e-01],
       [9.99529362e-01],
       [9.91274953e-01],
       [1.28746033e-04],
       [3.69071960e-03],
       [9.97097790e-01],
       [9.77784276e-01],
       [1.71378255e-03],
       [9.91498351e-01],
       [9.94295478e-01],
       [9.98548269e-01],
       [9.69663560e-01],
       [1.30146742e-04],
       [8.61543894e-01],
       [9.21850145e-01],
       [9.33408737e-05],
       [1.37031078e-04],
       [1.48713589e-04],


In [41]:
# Ground-truth labels of the first 50 test samples, for comparison with the
# predicted values.
y_test[:50]

array([1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 1])

In [44]:
# Evaluate the CNN on the held-out test split. This call must stay enabled:
# while it was commented out, the print below only worked off a stale kernel
# variable and raised NameError on a fresh kernel.
score_ascii = model.evaluate(X_test, y_test)
print('test loss, test acc:', score_ascii)

test loss, test acc: [0.046434312636221865, 0.9853614]


In [45]:
# Reset the metric state before saving so that the loaded model starts from
# the same state, since metric states are not preserved by Model.save_weights.
model.reset_metrics()

In [46]:
import os

# Make sure the output directory exists; saving to an HDF5 path inside a
# missing directory can fail instead of creating it.
os.makedirs('saved_models', exist_ok=True)
model.save('saved_models/ascii_cnn.h5')

In [47]:
# # put ascii code in between -1 and 1
# for i in range(ascii_en.shape[0]):
#     for j in range(ascii_en.shape[1]):
#         if ascii_en[i,j] == 0:
#             continue
#         ascii_en[i, j] = (ascii_en[i, j]-(65+25/2))/(25/2)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(ascii_en, labels, shuffle=True, test_size=0.3, random_state=46)
# X_train = X_train[:,:,np.newaxis]
# X_test = X_test[:,:,np.newaxis]
# print(X_train.shape)
# print(y_train.shape)
# print(X_test.shape)
# print(y_test.shape)

# model = deep_cnn()
# metrics = ['acc']
# model.compile(optimizer='adam',loss='binary_crossentropy',metrics=metrics)
# model.fit(X_train,y_train,batch_size=64,epochs=20,validation_split=0.2,shuffle=True)