In [None]:
import os

# Export an empty CUDA_VISIBLE_DEVICES for this process.
# NOTE(review): presumably meant to hide GPUs so the run stays on CPU -- confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": ""})

In [None]:
import numpy as np
import pandas as pd
# Load the raw table from a CSV file; the path is relative to the working directory.
data = pd.read_csv('pdb_data_seq.csv')

# Preview the first rows as a sanity check of the loaded columns.
data.head()


In [None]:
# Keep only the rows without any missing value.
complete_rows = data.dropna()

# Report the remaining size and the per-column null counts.
print(complete_rows.shape)
print(complete_rows.isnull().sum())

# From here on `data` is a plain array that is indexed by column position.
# NOTE: this rebinding makes the cell non-idempotent -- re-running it without
# reloading the CSV fails because an ndarray has no .dropna().
data = complete_rows.values

In [None]:
# Pair up the values at positions 2 and 4 of every row.
# (This list is rebuilt from scratch by the following cell.)
new_data = [[row[2], row[4]] for row in data]
len(new_data)

In [None]:
# Build [sequence, type] pairs from the raw rows. Position 2 of a row is used as
# the sequence and position 4 as its type label. Labels other than the three
# listed below are collapsed into a single 'Hybrid' class; rows with an empty
# label are skipped.
primary_types = {'DNA', 'Protein', 'RNA'}

new_data = []
for row in data:
    seq, mol_type = row[2], row[4]
    if mol_type == '':
        continue
    new_data.append([seq, mol_type if mol_type in primary_types else 'Hybrid'])

# dtype=object stores references to the Python strings. Without it NumPy builds
# a fixed-width unicode array in which every cell is as wide as the longest
# sequence, which wastes a lot of memory on long inputs.
# reshape(-1, 2) keeps the array two-dimensional even when no row qualified,
# so a two-column DataFrame can still be built from it.
new_data = np.array(new_data, dtype=object).reshape(-1, 2)
new_data.shape


In [None]:
# Wrap the two-column array back into a DataFrame with named columns.
# Note: `data` is rebound here from the raw array to this reduced frame.
data = pd.DataFrame(new_data, columns=['sequence', 'type'])

In [None]:
# Preview the first rows of the reduced sequence/type frame.
data.head()

In [None]:
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Maximum number of characters kept per sequence; everything after this
# position is discarded.
max_length = 128
seqs = data.sequence.values

# Create and fit a character-level tokenizer on all sequences.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(seqs)

# Represent every sequence as a list of integer character ids.
X = tokenizer.texts_to_sequences(seqs)

# pad_sequences defaults to truncating='pre', which drops the *beginning* of
# sequences longer than maxlen. truncating='post' is passed explicitly so the
# first max_length characters are kept, as stated above. padding='pre' is the
# library default and is spelled out only for clarity.
X = sequence.pad_sequences(X, maxlen=max_length, padding='pre', truncating='post')


In [None]:
from sklearn.preprocessing import LabelBinarizer

# Encode the string labels as indicator vectors.
# NOTE(review): LabelBinarizer emits a single column when only two classes are
# present -- verify the label set has more than two classes.
lb = LabelBinarizer()
lb.fit(data['type'])
Y = lb.transform(data['type'])


In [None]:
# Width of one encoded label row; the model's output layer must match it.
len(Y[0])

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, BatchNormalization
from keras.layers import LSTM, Dropout
from keras.layers.embeddings import Embedding

embedding_dim = 50

# Size the output layer from the encoded labels rather than hard-coding it,
# so it always matches the width of Y.
num_classes = Y.shape[1]

# Character embedding -> three Conv1D/MaxPooling1D stages -> two dense layers
# with dropout and batch normalization -> softmax over the classes.
model = Sequential([
    # +1 on the vocabulary size because index 0 is used for padding.
    Embedding(len(tokenizer.word_index) + 1, embedding_dim, input_length=max_length),
    Conv1D(filters=64, kernel_size=6, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    BatchNormalization(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    BatchNormalization(),
    Dense(num_classes, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()



In [None]:
# Hold out 10% of the samples. A fixed random_state makes the split (and so the
# reported test metrics) repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.1, random_state=42)

# NOTE(review): the held-out split doubles as the validation set during
# training, so the "test" scores reported later are not from unseen data.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=128)

In [None]:
%matplotlib inline
import matplotlib as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import itertools

train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
print("train-acc = " + str(accuracy_score(np.argmax(y_train, axis=1), np.argmax(train_pred, axis=1))*100))
print("test-acc = " + str(accuracy_score(np.argmax(y_test, axis=1), np.argmax(test_pred, axis=1))*100))



In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Class ids for the held-out samples: ground truth and model prediction.
y_true = np.argmax(y_test, axis=1)
y_pred = np.argmax(test_pred, axis=1)

# Pass the full label set explicitly so the matrix and the report always have
# one row per entry of lb.classes_, even if a class is missing from this split.
label_ids = np.arange(len(lb.classes_))

# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=label_ids)

# Normalize each row by the number of true samples of that class. Rows without
# any true sample stay at zero instead of becoming NaN through division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)
np.set_printoptions(precision=2)

# Plot normalized confusion matrix
plt.figure(figsize=(10,10))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
tick_marks = np.arange(len(lb.classes_))
plt.xticks(tick_marks, lb.classes_, rotation=0)
plt.yticks(tick_marks, lb.classes_)
# Optional per-cell value annotations (left disabled, as in the original):
#for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
#    plt.text(j, i, format(cm[i, j], '.2f'), horizontalalignment="center", color="white" if cm[i, j] > cm.max() / 2. else "black")
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

print(classification_report(y_true, y_pred, labels=label_ids, target_names=lb.classes_))

In [None]:
# List the metric names recorded during training; they are the keys available
# for plotting the learning curves.
history.history.keys()

In [None]:
# Per-epoch metrics recorded by model.fit().
# (The former `history_lstm` lookup was removed: that name is not defined in
# this notebook and its result was never used, so it only raised a NameError.)
history_dict = history.history

# Older Keras releases record accuracy under 'acc'/'val_acc', newer ones under
# 'accuracy'/'val_accuracy'; accept whichever is present.
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'
acc = history_dict[acc_key]
val_acc = history_dict['val_' + acc_key]
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

# Loss curves: red = training, blue = validation.
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

plt.clf()   # clear figure

# Accuracy curves, same colour convention.
plt.plot(epochs, acc, 'r', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()