In [None]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint

# 2019-02-11 Ready for AI
Once the database of RNA-seq values is ready, we can transform it into a format that an AI can work with and recognize.

The roadmap is as follows:

1. **Data preprocessing**: Convert the data to a format that an AI can work with
2. **Building the AI**: Find an appropriate network topology, train it, and test it

The first step is partially done, but we still need to do some work.

## Data preprocessing

We want to transform our data set into a matrix of dimensions `nsamples` × `ngenes`, where `nsamples` is the number of samples that we will work with, and `ngenes` is the number of genes we want to take into consideration.

Let's start by building this array.

The code that follows will then be migrated to a separate script, which will create a data structure that we will load later on.

In [None]:
# Locations of the input data. Every path is derived from the project root,
# which is given relative to the current working directory.
tissue_ai_root = ".."
experiments_dir = "/".join((tissue_ai_root, "data", "experiments"))
metadata = "/".join((tissue_ai_root, "data", "metadata.txt"))

In [None]:
# Load the tab-separated metadata table, then give every distinct tissue name
# an integer id (0, 1, 2, ...) following the order returned by `unique()`.
md = pd.read_csv(metadata, sep='\t')
tissues = md['Biosample term name'].unique()
tissues_mapping = {tissue_name: tissue_idx
                   for tissue_idx, tissue_name in enumerate(tissues)}

In [None]:
# One entry per experiment accession found on disk.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the notebook reproducible from run to run. Non-directory entries
# (e.g. stray hidden files) are skipped because they cannot hold replicates.
experiment_names = sorted(
    entry for entry in os.listdir(experiments_dir)
    if os.path.isdir("%s/%s"%(experiments_dir, entry))
)

In [None]:
# Assemble the expression table: one row per replicate, one column per gene
# position, plus a final 'labels' column holding the integer tissue id.

# Cap on the number of experiments read while prototyping in the notebook;
# set to None to process every experiment.
max_experiments = 4

quants = []
labels = []
for experiment_name in experiment_names[:max_experiments]:

    # all the replicates of one experiment accession ID must share one tissue
    md_experiment = md[md['Experiment accession'] == experiment_name]
    tissue_id = md_experiment['Biosample term name'].unique()
    if tissue_id.size == 0:
        # no metadata row at all: report that, not a tissue mismatch
        raise ValueError("Experiment %s is not listed in the metadata"%(experiment_name))
    if tissue_id.size > 1:
        raise ValueError("Experiment %s has replicates from different tissues"%(experiment_name))
    experiment_dir = "%s/%s"%(experiments_dir, experiment_name)

    # replicates are stored as replicate-1-quant, replicate-2-quant, ...;
    # walk them in order and stop at the first directory that does not exist
    replicate_n = 1
    replicate_dir = "%s/replicate-%d-quant"%(experiment_dir, replicate_n)
    while os.path.exists(replicate_dir):
        quant_fname = "%s/quant-by-gene.tsv"%(replicate_dir)
        print(quant_fname)

        # read the per-gene quantification and keep its TPM column
        # NOTE(review): rows are later aligned by position, which assumes
        # every quant file lists the genes in the same order -- confirm.
        quant = pd.read_csv(quant_fname, sep='\t')
        quants.append(quant['GeneTPM'])
        labels.append(tissues_mapping[tissue_id[0]])

        # move on to the next replicate
        replicate_n += 1
        replicate_dir = "%s/replicate-%d-quant"%(experiment_dir, replicate_n)

# each collected Series becomes one row; rows are numbered 0..len(quants)-1
df = pd.DataFrame(data = quants, index = pd.RangeIndex(start=0, stop=len(quants)))
df['labels'] = labels

## Building the AI

Let's load the data.

In [None]:
# Read the pre-built data set into a numeric array, skipping the first line
# of the file.
dataset_fname = "/".join((tissue_ai_root, "data", "dataset.tsv"))
dataset = np.loadtxt(fname=dataset_fname, skiprows=1)

In [None]:
# The labels live in the last column of the loaded array.
label_col = dataset.shape[1] - 1
labels = dataset[:, label_col]

# Keep only the columns strictly between the first one and the label column.
# NOTE: this rebinds `dataset`, so running the cell a second time would strip
# two more columns -- reload the file before re-running.
dataset = dataset[:, 1:label_col]

In [None]:
# Dimensions of the problem: the number of classes comes from the tissue
# mapping, the number of input features from the columns of the data matrix.
ntissues = len(tissues_mapping)
ngenes = np.size(dataset, axis=1)

We now extract the relevant rows from the data set and prepare the "training", "validation", and "testing" subsets.

In [None]:
# Cut the samples into three contiguous subsets, in file order:
#   training   -> rows 0..999
#   validation -> rows 1000..1499
#   testing    -> rows 1500 and onwards
# NOTE(review): the rows are not shuffled before splitting, so the class
# balance of each subset depends on how the data set file is ordered.
split_points = [1000, 1500]
train_data, valid_data, test_data = np.split(dataset, split_points)
train_labels, valid_labels, test_labels = np.split(labels, split_points)

Before proceeding, since we are dealing with a multi-category labelling problem, we will convert our "labels" into "one-hot" format, which can be interpreted by the AI.

In [None]:
# One-hot encode the labels of every subset with keras' "to_categorical",
# always using ntissues classes so the three encodings have the same width.
train_labels_onehot, valid_labels_onehot, test_labels_onehot = (
    keras.utils.to_categorical(subset_labels, num_classes=ntissues)
    for subset_labels in (train_labels, valid_labels, test_labels)
)

We are now ready to build the neural network. As a first attempt, we will build a simple multi-layer perceptron (MLP) and see whether it gives interesting results.

In [None]:
# Multi-layer perceptron: two hidden ReLU layers of 512 units, each followed
# by 20% dropout, and a softmax output with one unit per tissue.
model = Sequential([
    Dense(512, activation='relu', input_shape=(ngenes,)),
    Dropout(0.2),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(ntissues, activation='softmax'),
])

Let's have a look at our model with our parameters.

In [None]:
# Print an overview of the network's layers and parameter counts.
model.summary()

We currently have almost 20 million trainable parameters!

We can now compile our model and provide it with a loss function and an optimizer.

In [None]:
# Training configuration: an RMSprop optimizer with its default settings
# minimises the categorical cross-entropy; accuracy is tracked as a metric.
optimizer = RMSprop()
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Now we are ready to train the model. We will use a checkpointer to save the best parameters found during training.

In [None]:
# Checkpoint file, derived from the project root like every other path in
# this notebook.
weights_fname = '%s/data/weights.best.MLP.hdf5'%(tissue_ai_root)

# Save the model only when the monitored quantity improves (ModelCheckpoint
# monitors the validation loss by default). The callback is taken from
# tensorflow.keras, through the `keras` name imported from tensorflow, so that
# it comes from the same Keras implementation as the model; mixing it with a
# callback from the standalone keras package is not reliable.
checkpointer = keras.callbacks.ModelCheckpoint(filepath=weights_fname,
                                               verbose=1, save_best_only=True)

# Train for 10 epochs in mini-batches of 32 samples, scoring the validation
# subset after every epoch; `history` records the per-epoch metrics.
history = model.fit(train_data, train_labels_onehot,
                    batch_size=32,
                    epochs=10,
                    validation_data=(valid_data, valid_labels_onehot),
                    callbacks=[checkpointer])

Let's evaluate the model's performance.

In [None]:
# Restore the best parameters saved by the checkpointer during training;
# without this the model would be scored with its last-epoch weights instead.
# The path must match the checkpoint file written during training.
model.load_weights('%s/data/weights.best.MLP.hdf5'%(tissue_ai_root))

# Score the held-out test subset: score[0] is the loss, score[1] the accuracy.
score = model.evaluate(test_data, test_labels_onehot, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

We have an accuracy of 0.4%, which is close to random (1/223 ≈ 0.0045, i.e. about 0.45%). The most likely explanation for this is that we don't have enough data: we have only ~1700 samples, which means that on average we have fewer than 10 data points per category. It is unlikely that an AI can pick up patterns from such a small data set.

One thing that we can try is to reduce drastically the number of categories. If we have something like 10 categories in total, we might get some better results.