In [1]:
import pandas as pd
import random
import numpy as np

from numpy import argmax
from tensorflow import keras
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Conv2D, Dense, MaxPooling2D, Dropout, Flatten, BatchNormalization

In [2]:
def GenSeqs():
    alphabet = 'GATC'
    bindingSeq = 'GGAATTCCTTAAGGAATTCCTTAAGGAATTCCTTAAGG'
    bindingLen = len(bindingSeq)
    hasBinding = 1
    withBinding = 1000-len(bindingSeq)
    
    totalSeqs = pd.DataFrame(columns = ['Names', 'Sequences', 'Labels'])
    
    for x in range(2500):
        crm = []
        seq = ''
        for i in range(withBinding+1):
            if(i == withBinding/2):
                seq = seq + bindingSeq
            else:
                seq = seq + alphabet[random.randint(0, 3)]
        crm.append('CRM seq ' + str(x+1))
        crm.append(seq)
        crm.append(1)
        totalSeqs.loc[len(totalSeqs)] = crm
    
    for x in range(2500):
        nonCrm = []
        seq = ''
        for i in range(1000):
            seq = seq + alphabet[random.randint(0, 3)]
        nonCrm.append('Non-CRM seq ' + str(x+1))
        nonCrm.append(seq)
        nonCrm.append(0)
        totalSeqs.loc[len(totalSeqs)] = nonCrm
    
    print(totalSeqs)
    
    return totalSeqs

In [3]:
def OneHot(data):
    num_classes = 4
    new_data = []

    for x in data:
        class_vector = np.array(x)
        categorical = np.zeros(class_vector.shape+(num_classes,))
        for c in range(1,5,1):
            categorical[np.where(class_vector == c)]=np.array([1 if i == c else 0.0 for i in range(1,5,1)])
        new_data.append(categorical)
        
    return new_data

In [4]:
# Returns Pandas dataframe of names, sequences, and labels
totalSeqs = GenSeqs()

# Splitting data and labels in to train/validation sets
x_tr, x_val, y_tr, y_val = train_test_split(totalSeqs.Sequences.tolist(), totalSeqs.Labels.tolist(), test_size = 0.1)
x_tr, x_val, y_tr, y_val = np.array(x_tr), np.array(x_val), np.array(y_tr), np.array(y_val)

                 Names                                          Sequences  \
0            CRM seq 1  CGTCATCTTCCCAAACCATACATTCTCTGAAGTCCTGGTCCAGTAG...   
1            CRM seq 2  ATGGCAATTGATAAGCACCGTCCGTCGGGTGTTTCAACCCTACACT...   
2            CRM seq 3  TTCTCAAGATCCCCGGCCGTCTATTGACCGCCAACGACTGCACGAG...   
3            CRM seq 4  TCTTTGGTACAATTACATAAGCATCCAAGGGGGGGACCCTTGAAAA...   
4            CRM seq 5  TGCTGTCCTGAATACGAAGTAATGTCGACGCCCAACATGCTATGCC...   
...                ...                                                ...   
4995  Non-CRM seq 2496  TAGGATGATTCCAGGGATCGAATGGAGAGCGATAAGGCACTTTCGC...   
4996  Non-CRM seq 2497  GATCCTGGCAGCGTATCCCTCAAACTTGGCCCGATGTCCCTAGCCC...   
4997  Non-CRM seq 2498  CAAATGGACCGCCAGAAAAAGTGTACCAACGTAAACGCGACCTACT...   
4998  Non-CRM seq 2499  CTAATTAATGTCGTACAGGGGAGTGACTAACTGGCGGCCCTTTTCA...   
4999  Non-CRM seq 2500  TTGGATCTGACGACTCTTAAATTACAGGGCTCTGCCAATACAGAGT...   

     Labels  
0         1  
1         1  
2         1  
3         1  
4    

In [5]:
#Tokenizing sequences
tk = Tokenizer(num_words=None, char_level=True)
tk.fit_on_texts(x_tr)
tokenTrain = tk.texts_to_sequences(x_tr)

In [6]:
print(tk.word_counts)
print(tk.word_index)
print(tk.word_docs)
print(tokenTrain[0])

OrderedDict([('c', 1117375), ('t', 1130842), ('a', 1129907), ('g', 1121876)])
{'t': 1, 'a': 2, 'g': 3, 'c': 4}
defaultdict(<class 'int'>, {'c': 4500, 'g': 4500, 'a': 4500, 't': 4500})
[4, 1, 2, 2, 4, 4, 2, 4, 2, 2, 4, 1, 4, 3, 3, 1, 1, 1, 3, 4, 2, 4, 2, 4, 1, 4, 2, 4, 4, 1, 3, 1, 2, 4, 3, 4, 4, 3, 1, 4, 4, 3, 2, 2, 2, 2, 2, 1, 3, 2, 4, 3, 3, 3, 2, 2, 3, 1, 4, 3, 1, 2, 1, 3, 3, 4, 4, 4, 4, 4, 3, 3, 4, 4, 3, 4, 4, 2, 3, 1, 4, 1, 2, 4, 1, 3, 4, 4, 4, 3, 1, 4, 4, 1, 3, 3, 3, 1, 3, 3, 4, 4, 1, 4, 2, 1, 3, 2, 1, 1, 3, 2, 2, 4, 2, 2, 1, 2, 4, 4, 2, 4, 4, 3, 1, 1, 2, 4, 3, 1, 2, 4, 4, 1, 2, 2, 1, 2, 3, 2, 1, 4, 3, 4, 2, 3, 1, 1, 3, 1, 3, 1, 2, 3, 2, 3, 3, 4, 1, 3, 2, 4, 4, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 3, 2, 3, 2, 4, 2, 4, 2, 1, 4, 2, 2, 1, 3, 1, 2, 2, 3, 1, 2, 2, 4, 2, 2, 4, 2, 3, 3, 1, 3, 4, 3, 4, 3, 1, 2, 4, 3, 3, 3, 4, 2, 4, 2, 3, 1, 4, 2, 1, 1, 4, 3, 1, 2, 1, 1, 4, 3, 1, 1, 3, 1, 1, 1, 2, 1, 3, 2, 3, 3, 1, 3, 1, 3, 1, 1, 3, 3, 3, 1, 4, 4, 4, 1, 2, 1, 2, 3, 4, 1, 2, 2, 1, 1, 4, 4, 4, 2, 2,

In [7]:
# Onehot encoding tokenized sequences
oneHotTrain = OneHot(tokenTrain)

In [13]:
# Resizing to fit Conv2D and making sure there aren't any array/list conflicts
oneHotTrain = np.array(oneHotTrain).reshape(4500, 1000, 4, 1).astype('float32')
#for x in oneHotTrain:
    #x = np.array(x)
    #for i in x:
        #i = np.array(i)
        #for j in i:
            #j = np.array(j)
print(oneHotTrain.shape)

trainLabels = np.array(y_tr).reshape(4500, 1)
#for x in trainLabels:
    #x = np.array(x)
    #for i in x:
        #i = np.array(i)
        #for j in i:
            #j = np.array(j)
print(trainLabels.shape)

print(oneHotTrain)
print(trainLabels)

(4500, 1000, 4, 1)
(4500, 1)
[[[[0.]
   [0.]
   [0.]
   [1.]]

  [[1.]
   [0.]
   [0.]
   [0.]]

  [[0.]
   [1.]
   [0.]
   [0.]]

  ...

  [[1.]
   [0.]
   [0.]
   [0.]]

  [[0.]
   [1.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   [1.]]]


 [[[1.]
   [0.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   [1.]]

  [[1.]
   [0.]
   [0.]
   [0.]]

  ...

  [[0.]
   [0.]
   [0.]
   [1.]]

  [[1.]
   [0.]
   [0.]
   [0.]]

  [[1.]
   [0.]
   [0.]
   [0.]]]


 [[[0.]
   [1.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   [1.]]

  [[0.]
   [0.]
   [1.]
   [0.]]

  ...

  [[0.]
   [0.]
   [1.]
   [0.]]

  [[1.]
   [0.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   [1.]]]


 ...


 [[[0.]
   [1.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [1.]
   [0.]]

  [[0.]
   [1.]
   [0.]
   [0.]]

  ...

  [[1.]
   [0.]
   [0.]
   [0.]]

  [[1.]
   [0.]
   [0.]
   [0.]]

  [[1.]
   [0.]
   [0.]
   [0.]]]


 [[[0.]
   [0.]
   [0.]
   [1.]]

  [[0.]
   [0.]
   [0.]
   [1.]]

  [[0.]
   [1.]
   [0.]
   [0.]]

  ...

  [

In [14]:
model = Sequential()
model.add(Conv2D(32, (4,1), activation = 'relu', input_shape = (1000, 4, 1)))
model.add(MaxPooling2D((1,1)))
model.add(Conv2D(64, 4, activation = 'relu'))
model.add(MaxPooling2D(1))
model.add(Flatten())
model.add(Dense(64, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))

#optimizer = keras.optimizers.Adam(lr = 0.000001)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()
          
final = model.fit(oneHotTrain, trainLabels, batch_size = 100, epochs = 3, verbose = 1)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 997, 4, 32)        160       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 997, 4, 32)        0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 994, 1, 64)        32832     
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 994, 1, 64)        0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 63616)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4071488   
_________________________________________________________________
dense_5 (Dense)              (None, 1)                