# 2019-02-07 Getting started
The purpose is to build an Artificial Intelligence that is capable of discovering which genome a particular sequence belongs to.

I will start from a test case: Drosophila versus human. I have both the Drosophila and the human genomes downloaded on my computer.

In [None]:
import numpy as np
from tqdm import tqdm

In [None]:
# will use Biopython to deal with biological sequences
from Bio import SeqIO, Seq
from Bio.Alphabet import IUPAC

In [None]:
# Human genome: index the FASTA file so records can be looked up by key
# NOTE(review): the `alphabet` argument (and Bio.Alphabet itself) only exist
# in Biopython < 1.78 -- confirm the installed version before running.
hg38_genome_file = '/mnt/shared/seq/GRCh38/hg38.fasta'
h38 = SeqIO.index(
    hg38_genome_file,
    'fasta',
    alphabet=IUPAC.unambiguous_dna,
)
# keys of the index, used later to pick a record at random
h38_chromosomes = list(h38.keys())

In [None]:
# Drosophila genome: index the FASTA file so records can be looked up by key
# NOTE(review): the `alphabet` argument (and Bio.Alphabet itself) only exist
# in Biopython < 1.78 -- confirm the installed version before running.
dm4R6_genome_file = '/mnt/shared/seq/dm4R6/dmel-all-chromosome-r6.04.fasta'
dm4R6 = SeqIO.index(
    dm4R6_genome_file,
    'fasta',
    alphabet=IUPAC.unambiguous_dna,
)
# keys of the index, used later to pick a record at random
dm4R6_chromosomes = list(dm4R6.keys())

We need a function that extracts a random sequence of nucleotides.

In [None]:
def get_random_sequence(genome, chromosomes, l, N) :
    """
    Get a random sequence of nucleotides from the genome. It will
    be a sequence of N words of l letters.

    Parameters
    ----------
    genome : mapping
        Maps a chromosome key to a record exposing a sliceable `.seq`.
    chromosomes : sequence
        Keys of `genome` to choose from (one is drawn uniformly).
    l : int
        Number of letters per word.
    N : int
        Number of words.

    Returns
    -------
    list of str
        N upper-case words of l letters each, taken from one contiguous
        window of the chosen chromosome that contains no 'N' (in any case).

    Raises
    ------
    ValueError
        If the chosen chromosome is shorter than l*N letters.

    Notes
    -----
    The window is re-drawn until it is free of 'N'; a chromosome with no
    such window of length l*N makes this loop forever.
    """
    span = l * N

    # first, pick a random chromosome
    sequence = genome[np.random.choice(chromosomes)].seq

    # only start positions that leave room for a full-length window are
    # valid; fail loudly instead of spinning forever when there are none
    max_start = len(sequence) - span
    if max_start < 0 :
        raise ValueError(
            'chromosome of length %d is shorter than the requested '
            '%d letters' % (len(sequence), span))

    # then pick a random spot in the chromosome
    while True :
        start = np.random.randint(max_start + 1)
        # upper-case BEFORE testing, so that soft-masked 'n' is rejected too
        window = str(sequence[start:start + span]).upper()
        if 'N' not in window :
            break
    return [window[i:i + l] for i in range(0, span, l)]

We then need a function that takes a sequence and encodes it as a number.

In [None]:
def sequence_encoder(sequence, mapping) :
    """
    Encode a sequence of letters as a base-4 integer.

    The first letter is the most significant digit, so the result equals
    sum(4**(len(sequence)-i-1) * mapping[sequence[i]]).

    Parameters
    ----------
    sequence : str
        Letters to encode; each one must be a key of `mapping`.
    mapping : dict
        Maps each letter to its digit value.

    Returns
    -------
    int
        The encoded value; 0 for an empty sequence.

    Raises
    ------
    KeyError
        If a letter of `sequence` is not in `mapping`.
    """
    # Horner's rule: shift the running value one base-4 digit to the left
    # and append the next digit
    code = 0
    for letter in sequence :
        code = 4 * code + mapping[letter]
    return code

In [None]:
# quick try-out: draw N=10 words of l=4 letters from the Drosophila genome
l, N = 4, 10
s = get_random_sequence(dm4R6, dm4R6_chromosomes, l=l, N=N)

In [None]:
# digit assigned to each nucleotide letter: A=0, T=1, C=2, G=3
mapping = {letter: digit for digit, letter in enumerate('ATCG')}

In [None]:
# sanity check: 'A' maps to 0, so an all-'A' sequence should encode to 0
sequence_encoder('AAAAAA', mapping)

## Generate data
We now want to generate the data that we will use for training the network.

In [None]:
ndata = 100000  # number of sequences drawn from each genome
l = 7           # letters per word
N = 80          # words per sequence
human_fname = '../data/human.dataset'
droso_fname = '../data/droso.dataset'

# `with` closes both files even if sampling or encoding raises part-way
# through the loop
with open(human_fname, 'w') as fh, open(droso_fname, 'w') as fd :
    for n in tqdm(range(ndata)) :
        human_seq = get_random_sequence(h38, h38_chromosomes, l, N)
        droso_seq = get_random_sequence(dm4R6, dm4R6_chromosomes, l, N)
        # one line per sequence; every word code is followed by a tab
        # (including the last one), which keeps the existing file format
        fh.write(''.join('%d\t' % sequence_encoder(word, mapping)
                         for word in human_seq) + '\n')
        fd.write(''.join('%d\t' % sequence_encoder(word, mapping)
                         for word in droso_seq) + '\n')

In [None]:
# sanity check: C=2, G=3, A=0, G=3  ->  2*4**3 + 3*4**2 + 0*4 + 3 = 179
sequence_encoder('CGAG', mapping)