In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

In [None]:
# Lookup from nucleotide letter to the digit it stands for when a sequence is
# written as a base-4 number: A -> 0, T -> 1, C -> 2, G -> 3.
base_to_number = {letter: digit for digit, letter in enumerate('ATCG')}

# Inverse lookup: digit -> nucleotide letter.
number_to_base = {digit: letter for letter, digit in base_to_number.items()}

In [None]:
def sequence_decode(num, mapping, base=4, length=None) :
    """
    Converts the integer "num" into a sequence of symbols: "num" is written in
    the given "base", most significant digit first, and every digit is
    translated through "mapping" (digit -> symbol).

    If "length" is given, the result is padded on the left with mapping[0] up
    to that many characters; longer results are never truncated. Without
    "length", a "num" of 0 decodes to the empty string.
    """
    remaining = num
    reversed_seq = ''
    # peel off the digits one at a time, least significant first
    while remaining > 0 :
        remaining, digit = divmod(remaining, base)
        reversed_seq += mapping[digit]
    if length is not None :
        # the string is still in reversed order here, so padding appended at
        # the end shows up in front once the string is flipped
        reversed_seq += mapping[0] * (length - len(reversed_seq))
    return reversed_seq[::-1]

In [None]:
def sequence_encoder(sequence, mapping, base=4) :
    """
    Encodes "sequence" as an integer by reading its symbols as the digits of a
    number written in the given "base", most significant digit first.
    "mapping" translates each symbol into its digit (symbol -> digit).

    "base" defaults to 4, so existing two-argument calls behave as before; it
    plays the same role as the "base" argument of sequence_decode. An empty
    sequence encodes to the integer 0.
    """
    l = len(sequence)
    if l == 0 :
        # np.sum of an empty list is the float 0.0; keep the code integral
        return 0
    # the symbol at position i carries the weight base**(l-i-1), i.e. the
    # first symbol is the most significant digit
    return np.sum([base**(l-i-1)*mapping[symbol] for i, symbol in enumerate(sequence)])

# 2019-02-08 Training the network

I extracted random sequences from the Drosophila, Mouse, and Human genomes. I want to see whether an AI is capable of distinguishing between these genomes by reading "sentences" of "words" of nucleotides.

First, let's load the data set into a structure that can be used for training the network.

In [None]:
def shuffle_data(data, targets) :
    """
    Reorders the rows of "data" (nsentences x nwords) and the entries of
    "targets" (nsentences long) with one and the same random permutation, so
    that row k of the returned data still belongs to entry k of the returned
    targets. The permutation is drawn from NumPy's global random state.
    """
    order = np.random.permutation(data.shape[0])
    return data[order], targets[order]
    
def prepare_data(genome_datasets, labels, nwords, train_n, valid_n, test_n) :
    """
    Builds shuffled training, validation, and test sets out of several
    "genome sentence" dataset files.

    genome_datasets : the dataset files, each readable by np.loadtxt as a
                      table of integers with one sentence per row
    labels          : the target value assigned to the sentences of each
                      dataset, in the same order as "genome_datasets"
    nwords          : number of words (columns) in every sentence
    train_n, valid_n, test_n : number of sentences taken from EACH dataset
                      for the training, validation, and test set

    From every dataset the first train_n rows go to the training set, the
    next valid_n rows to the validation set, and the test_n rows after those
    to the test set, so the three sets never share a row of a file.

    Returns train_data, train_targets, valid_data, valid_targets, test_data,
    test_targets; each data/targets pair is shuffled with shuffle_data.

    Raises ValueError if the number of datasets differs from the number of
    labels, or if a dataset has too few rows or the wrong number of columns.
    """
    # a mismatch would otherwise be truncated silently by zip, leaving
    # all-zero sentences with target 0 in the returned sets
    genome_datasets = list(genome_datasets)
    if len(genome_datasets) != len(labels) :
        raise ValueError("got %d genome datasets but %d labels"
                         % (len(genome_datasets), len(labels)))

    # init: every dataset contributes the same number of rows to each set
    N = len(labels)
    rows_needed = train_n + valid_n + test_n
    train = np.zeros((train_n*N, nwords), dtype=np.int64)
    valid = np.zeros((valid_n*N, nwords), dtype=np.int64)
    test = np.zeros((test_n*N, nwords), dtype=np.int64)
    train_targets = np.zeros(train_n*N, dtype=np.int64)
    valid_targets = np.zeros(valid_n*N, dtype=np.int64)
    test_targets = np.zeros(test_n*N, dtype=np.int64)

    # read the datasets; dataset number i fills the i-th slab of every array
    for i, (genome_dataset, label) in enumerate(zip(genome_datasets, labels)) :

        # load the "genome sentences" from the dataset supplied
        # (ndmin=2 keeps a one-row file two-dimensional)
        sentences = np.loadtxt(genome_dataset, dtype=np.int64, ndmin=2)
        if sentences.shape[0] < rows_needed or sentences.shape[1] != nwords :
            raise ValueError(
                "dataset %r has shape %r, but at least %d rows of exactly %d words are required"
                % (genome_dataset, sentences.shape, rows_needed, nwords))

        # prepare the train, valid, and test data from consecutive,
        # non-overlapping row ranges of the file
        train[i*train_n:(i+1)*train_n, :] = sentences[:train_n, :]
        valid[i*valid_n:(i+1)*valid_n, :] = sentences[train_n:train_n+valid_n, :]
        test[i*test_n:(i+1)*test_n, :] = sentences[train_n+valid_n:rows_needed, :]

        # prepare the targets from the labels
        train_targets[i*train_n:(i+1)*train_n] = label
        valid_targets[i*valid_n:(i+1)*valid_n] = label
        test_targets[i*test_n:(i+1)*test_n] = label

    # perform a random shuffling of the sequences and return it
    train_data, train_targets = shuffle_data(train, train_targets)
    valid_data, valid_targets = shuffle_data(valid, valid_targets)
    test_data, test_targets = shuffle_data(test, test_targets)

    return train_data, train_targets, valid_data, valid_targets, test_data, test_targets

In [None]:
# Seed NumPy's global random state so that the shuffling done while preparing
# the data is repeatable.
np.random.seed(988754)

l = 7    # nucleotides per word
N = 80   # words per sentence; also passed on as nwords below
human_dataset_file = '../data/human-l-%d-N-%d.dataset' % (l, N)
droso_dataset_file = '../data/droso-l-%d-N-%d.dataset' % (l, N)

# label 0 <-> human, label 1 <-> Drosophila (same order as the file list)
labels = [0, 1]

# 80000 training, 10000 validation and 10000 test sentences per genome
(train_data, train_targets,
 valid_data, valid_targets,
 test_data, test_targets) = prepare_data(
    [human_dataset_file, droso_dataset_file],
    labels, N, 80000, 10000, 10000)

## Drosophila versus Human
The first case I want to study is whether the Drosophila and Human genomes can be distinguished or not. This case is probably easier than when including the Mouse genome.

In [None]:
# define the model: each of the 4**l possible word codes is embedded as a
# 128-dimensional vector, a 128-unit LSTM reads the sentence, and a single
# sigmoid unit produces the output for the two-class problem
dvh_model = Sequential([
    Embedding(4**l, 128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Two classes with 0/1 targets: train with binary cross-entropy and report
# the accuracy alongside the loss.
dvh_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Save the model to disk only when an epoch improves on the best value of the
# monitored quantity seen so far (none is named here, so the callback's
# default applies); verbose=1 reports what the callback does.
checkpointer = ModelCheckpoint(
    filepath='weights.best.dvh.hdf5',
    save_best_only=True,
    verbose=1,
)

# Train for 2 epochs, checking against the validation set after each one.
dvh_model.fit(
    x=train_data,
    y=train_targets,
    validation_data=(valid_data, valid_targets),
    epochs=2,
    batch_size=32,
    callbacks=[checkpointer],
)

In [None]:
# Final check on the held-out test set: the loss comes back as `score` and
# the accuracy metric as `acc`.
score, acc = dvh_model.evaluate(
    x=test_data,
    y=test_targets,
    batch_size=32,
    verbose=2,
)
print('Test score:', score)
print('Test accuracy:', acc)

This is quite incredible! We achieved 98% accuracy on the test set! With this method we really are able to distinguish between Drosophila and Human.

Which makes me wonder... what makes Drosophila and Human genomes so different?

In [None]:
# Separate the shuffled training sentences by species again: label 0 was
# given to the human dataset and label 1 to the Drosophila dataset.
human_train = train_data[train_targets == 0, :]
droso_train = train_data[train_targets == 1, :]

In [None]:
# Count how often every word code occurs in each species, pooling all the
# sentences. The bin edges 0, 1, ..., 4**l give one unit-wide bin per code.
human_hist = np.histogram(human_train.ravel(), bins=np.arange(4**l + 1))[0]
droso_hist = np.histogram(droso_train.ravel(), bins=np.arange(4**l + 1))[0]

In [None]:
# Word counts per word code on a logarithmic axis: human in the upper panel,
# Drosophila in the lower one.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 5))

axes[0].semilogy(human_hist)
axes[0].set_ylabel("Human")

axes[1].semilogy(droso_hist)
axes[1].set_ylabel("Droso")

plt.show()

Looking at this graph, we can see that the $4^7 = 16384$ different 7-nucleotide "words" that make up the "dictionary" we are currently using are used very differently in the two species. This has to do with DNA methylation. In Drosophila, there is no DNA methylation, whereas in humans almost all the "CG" dinucleotides are methylated, and over evolutionary timescales these methylated CG dinucleotides convert to TG dinucleotide steps.

In [None]:
# Codes of the rarest and of the most common word in the human training
# sentences (on ties, the lowest code is reported).
least_frequent_sequence_human = np.argmin(human_hist)
most_frequent_sequence_human = np.argmax(human_hist)

# Print both words as nucleotide strings, padded to the word length l.
print(sequence_decode(least_frequent_sequence_human, number_to_base, base=4, length=l))
print(sequence_decode(most_frequent_sequence_human, number_to_base, base=4, length=l))

## Drosophila vs Human vs Mouse
Now another question we might ask is whether we can distinguish between Drosophila, Human, and Mouse genomes with the same method.

In [None]:
# Seed NumPy's global random state so that the shuffling done while preparing
# the data is repeatable.
np.random.seed(3549887)

l = 7    # nucleotides per word
N = 80   # words per sentence; also passed on as nwords below
human_dataset_file = '../data/human-l-%d-N-%d.dataset' % (l, N)
droso_dataset_file = '../data/droso-l-%d-N-%d.dataset' % (l, N)
mouse_dataset_file = '../data/mouse-l-%d-N-%d.dataset' % (l, N)

# label 0 <-> human, 1 <-> Drosophila, 2 <-> mouse (same order as the files)
labels = [0, 1, 2]

# 80000 training, 10000 validation and 10000 test sentences per genome
(train_data, train_targets,
 valid_data, valid_targets,
 test_data, test_targets) = prepare_data(
    [human_dataset_file, droso_dataset_file, mouse_dataset_file],
    labels, N, 80000, 10000, 10000)

Since we're now dealing with more than two categories, we have to encode our targets in one-hot format and use the "categorical_crossentropy" loss function.

In [None]:
# One-hot encode the integer targets of all three sets, with one column per
# entry of `labels`.
train_targets_onehot, valid_targets_onehot, test_targets_onehot = (
    to_categorical(targets, num_classes=len(labels))
    for targets in (train_targets, valid_targets, test_targets)
)

In [None]:
# define the model: same embedding + LSTM stack as for the two-genome case,
# but with one output unit per label
dmh_model = Sequential()
dmh_model.add(Embedding(4**l, 128))
dmh_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# softmax rather than sigmoid: every sentence belongs to exactly one genome,
# and the categorical cross-entropy loss expects the outputs to form a single
# probability distribution over the classes
dmh_model.add(Dense(len(labels), activation='softmax'))

In [None]:
# More than two classes with one-hot targets: train with categorical
# cross-entropy and report the accuracy alongside the loss.
dmh_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Save the model to disk only when an epoch improves on the best value of the
# monitored quantity seen so far (none is named here, so the callback's
# default applies); verbose=1 reports what the callback does.
checkpointer = ModelCheckpoint(
    filepath='weights.best.dmh.hdf5',
    save_best_only=True,
    verbose=1,
)

# Train for 10 epochs, checking against the validation set after each one.
dmh_model.fit(
    x=train_data,
    y=train_targets_onehot,
    validation_data=(valid_data, valid_targets_onehot),
    epochs=10,
    batch_size=32,
    callbacks=[checkpointer],
)

In [None]:
# Final check on the held-out test set: the loss comes back as `score` and
# the accuracy metric as `acc`.
score, acc = dmh_model.evaluate(
    x=test_data,
    y=test_targets_onehot,
    batch_size=32,
    verbose=2,
)
print('Test score:', score)
print('Test accuracy:', acc)