# Generator Quality
 * We previously generated two sets of random sequences from (human) intergenomic regions: `"random_seqs.csv"` and `"random_seqs2.csv"`.
 * We trained a CNN model (on `"random_seqs.csv"`) to distinguish a real genomic sequence from an unreal (permuted) sequence.
 * We also have an LSTM generator (trained on `"random_seqs2.csv"`) that gave us 1000 pseudo-DNA sequences.
 * Now it is time to evaluate the quality of the generator trained in the previous step.


In [0]:
%tensorflow_version 1.x
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop

# CSV holding the LSTM-generated sequences (read in Step 1).
SEQ_FILE = "generated_seqs.csv"
# Saved weights of the CNN classifier (loaded in Step 2).
MODEL_FILE = "dna_classifier_rsat.loss0.53.h5"

# Seed Python's RNG so that the permuted negative controls built in Step 3
# are identical on every Restart & Run All.
SEED = 42
random.seed(SEED)


## Step 1) Read DNA sequences

These sequences were generated in a previous step by an LSTM network.

In [3]:
# Load the generated sequences and report the total number of characters.
df = pd.read_csv(SEQ_FILE)
print('corpus length:', df['generated_seqs'].str.len().sum())
df.head()

corpus length: 200000


Unnamed: 0,generated_seqs
0,TTGTATCATATATATATTTTTTTAAATTTTTTATATACTATTTATA...
1,CAAACTAGAAGTAAAGAAATATAATGCTTAATTTTTTGTTTTAATA...
2,GCACACACACTCACACATATCTGCATTTGTGTGGGCTGAAAGATGT...
3,TACATTGGCACATGCTCCACTACAGGAAGCTGAACTCCCTTTGAGA...
4,TGTGCAGCAGGAATGATTGTGACAATGAGATTGATTTATTTCTTTT...


## Step 2) Read Model
Load the CNN classifier. To load the weights, the model architecture must be defined first, with the `input_shape` of the first layer given.

In [6]:
# Number of one-hot channels per sequence position; also reused below as the
# number of filters in each Conv1D layer.
num_chars = 4

# The architecture is declared explicitly so that saved weights can be loaded
# into it afterwards.
# NOTE(review): the final Dense layer has no activation although the loss is
# binary_crossentropy -- confirm this matches how the saved weights were trained.
model = Sequential([
    layers.Conv1D(num_chars, 8, activation='relu', input_shape=(200, 4)),
    layers.MaxPooling1D(5),
    layers.Conv1D(num_chars, 8, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(1),
])
model.build()

model.compile(
    optimizer=RMSprop(lr=2e-3),
    loss='binary_crossentropy',
    metrics=['acc'],
)

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_5 (Conv1D)            (None, 193, 4)            132       
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 38, 4)             0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 31, 4)             132       
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 4)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 5         
Total params: 269
Trainable params: 269
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Load the saved classifier weights from MODEL_FILE into the model.
model.load_weights(MODEL_FILE)

In [8]:
# (number of rows, number of columns) of the loaded sequence table.
df.shape

(1000, 1)

## Step 3) Permutation

For each sequence, create permuted versions (shuffling single nucleotides, dimers, and tetramers).

In [9]:
# Length of the first generated sequence.
len(df['generated_seqs'][0])

200

In [0]:
def random_str_shuffle(s):
  """Return a copy of string `s` with its characters in random order."""
  shuffled_chars = random.sample(s, len(s))
  return ''.join(shuffled_chars)

In [0]:
# Negative control 1: each generated sequence with its single characters shuffled.
df['seq_permuted'] = df.generated_seqs.apply(random_str_shuffle)

In [0]:
def random_str_shuffle_dimers(s):
  """Cut `s` into consecutive 2-character chunks and return them joined in random order.

  Chunks start at even offsets; if `s` has odd length the last chunk is a
  single character.
  """
  dimers = []
  for start in range(0, len(s), 2):
    dimers.append(s[start:start + 2])
  shuffled = random.sample(dimers, len(dimers))
  return ''.join(shuffled)

In [0]:
# Negative control 2: each generated sequence with its 2-character chunks shuffled.
df['seq_permuted2'] = df.generated_seqs.apply(random_str_shuffle_dimers)

In [0]:
def random_str_shuffle_tetramers(s):
  """Cut `s` into consecutive 4-character chunks and return them joined in random order.

  Chunks start at offsets 0, 4, 8, ...; the last chunk is shorter when
  len(s) is not a multiple of 4.
  """
  tetramers = []
  for start in range(0, len(s), 4):
    tetramers.append(s[start:start + 4])
  shuffled = random.sample(tetramers, len(tetramers))
  return ''.join(shuffled)

In [0]:
# Negative control 3: each generated sequence with its 4-character chunks shuffled.
df['seq_permuted4'] = df.generated_seqs.apply(random_str_shuffle_tetramers)

## Step 4) Vectorization

In [16]:
# dictionaries to convert characters to numbers and vice-versa
# Lookup tables between nucleotide characters and their one-hot channel index.
chars = ['A', 'C', 'T', 'G']
char_to_indices = {c: i for i, c in enumerate(chars)}
indices_to_char = dict(enumerate(chars))

# Sequence length is taken from the first sequence; n_seq is the number of rows.
n_seq = len(df)
seq_length = len(df['generated_seqs'][0])
seq_length, n_seq

(200, 1000)

In [17]:
# One-hot encode all sequences into a single array made of four consecutive
# blocks of n_seq rows each: generated, permuted, dimer-permuted and
# tetramer-permuted sequences.
# `bool` replaces the deprecated `np.bool` alias (it is the same dtype).
X = np.zeros((4*n_seq, seq_length, num_chars), dtype=bool)
y = np.zeros((4*n_seq), dtype=bool)

# Only the generated sequences (first block) are labelled 1; the permuted
# negative controls keep the default label 0.
y[:n_seq] = 1

seq_columns = (df.generated_seqs, df.seq_permuted, df.seq_permuted2, df.seq_permuted4)

for i in tqdm(range(n_seq)):
    for block, column in enumerate(seq_columns):
        seq = column[i]               # look the string up once, not once per character
        encoded = X[i + block*n_seq]  # view into X, so the writes below land in X
        for j in range(seq_length):
            encoded[j, char_to_indices[seq[j]]] = 1

100%|██████████| 1000/1000 [00:23<00:00, 42.22it/s]


## Step 5) Evaluation on generated sequences

In [18]:
# Loss and accuracy on the generated sequences (first block of X, label 1).
model.evaluate(X[:n_seq], y[:n_seq])




[0.17953428530693055, 0.7760000228881836]

In [19]:
# Fraction of generated sequences that the classifier assigns to class 1.
y_real_pred = model.predict_classes(X[:n_seq])
np.mean(y_real_pred[:, 0] == 1)

0.929

## Step 6) Evaluation on permuted sequences (negative controls)

In [20]:
# Loss and accuracy on the fully permuted sequences (second block of X, label 0).
model.evaluate(X[n_seq:2*n_seq], y[n_seq:2*n_seq])



[6.978405584335327, 0.16300000250339508]

In [21]:
# Fraction of fully permuted sequences that the classifier assigns to class 1.
y_unreal_pred = model.predict_classes(X[n_seq:2*n_seq])
np.mean(y_unreal_pred[:, 0] == 1)

0.837

## Step 7) Evaluation on dimer-permuted sequences (negative controls)

In [22]:
# Loss and accuracy on the dimer-permuted sequences (third block of X, label 0).
model.evaluate(X[2*n_seq:3*n_seq], y[2*n_seq:3*n_seq])



[7.260770774841308, 0.13899999856948853]

In [23]:
# Fraction of dimer-permuted sequences that the classifier assigns to class 1.
y_unreal_pred2 = model.predict_classes(X[2*n_seq:3*n_seq])
np.mean(y_unreal_pred2[:, 0] == 1)

0.861

## Step 8) Evaluation on tetramer-permuted sequences (negative controls)

In [24]:
# Loss and accuracy on the tetramer-permuted sequences (fourth block of X, label 0).
model.evaluate(X[3*n_seq:4*n_seq], y[3*n_seq:4*n_seq])



[7.401709312438965, 0.13899999856948853]

In [25]:
# Fraction of tetramer-permuted sequences that the classifier assigns to class 1.
y_unreal_pred4 = model.predict_classes(X[3*n_seq:4*n_seq])
np.mean(y_unreal_pred4[:, 0] == 1)

0.861