### Step 1: Set Parameters

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Simulation parameters. Their values are also embedded in the output file names.
N_GENOMES = 5        # number of random genomes to generate
GENOME_LEN = 100000  # bases per genome

N_GENES = 2000       # number of genes cut out of the genomes
GENE_LEN = 100       # bases per gene
P_MUTATION = 0.2     # success probability of the binomial draw of mutations per gene

# Seed NumPy's global generator so that Restart & Run All reproduces the same
# genomes and genes; every random draw in this notebook goes through np.random.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

### Step 2: Simulate Genomes

In [3]:
# Hold the genomes as a 2-D array with one base per cell rather than as strings;
# NumPy slicing then makes cutting genes out of them straightforward.
genomes = np.random.choice(list("ACGT"), size=(N_GENOMES, GENOME_LEN))

### Step 3: Simulate Genes
For each gene, save the text of the mutated gene, the index of its source genome, the start index within that genome, and the number of mutations applied.

In [4]:
# Pick a random source genome and a random start position for every gene.
# Valid starts run from 0 through GENOME_LEN - GENE_LEN inclusive (the gene must
# fit inside the genome), hence the "+ 1": np.random.choice(n, ...) draws from
# 0..n-1.
source_g = np.random.choice(N_GENOMES, N_GENES)
start_i = np.random.choice(GENOME_LEN - GENE_LEN + 1, N_GENES)

genes = pd.DataFrame({"source_g": source_g, "start_i": start_i})
print(genes.head())

   source_g  start_i
0         0    91701
1         2    13828
2         2    26455
3         3    39632
4         0    42832


In [5]:
# For each base, the three other bases a point mutation may replace it with.
CHOICES = {
    'A': ['C', 'G', 'T'],
    'C': ['A', 'G', 'T'],
    'G': ['A', 'C', 'T'],
    'T': ['A', 'C', 'G'],
}

def mutate(source_g, start_i, genomes, gene_len=None, p_mutation=None):
    """Cut a gene out of a genome and apply random point mutations to it.

    Parameters
    ----------
    source_g : int
        Row of `genomes` to copy the gene from.
    start_i : int
        Column of `genomes` at which the gene starts.
    genomes : 2-D array of single-character strings
        Indexed as ``genomes[source_g, start_i:start_i + gene_len]``.
    gene_len : int, optional
        Number of bases in the gene; defaults to the notebook-level GENE_LEN.
    p_mutation : float, optional
        Success probability of the binomial draw that decides how many
        positions are mutated; defaults to the notebook-level P_MUTATION.

    Returns
    -------
    (str, int)
        The mutated gene joined into one string, and the number of positions
        at which it differs from the source slice.
    """
    if gene_len is None:
        gene_len = GENE_LEN
    if p_mutation is None:
        p_mutation = P_MUTATION

    original = genomes[source_g, start_i:start_i + gene_len]
    # Work on a copy so the genome itself is never modified.
    gene = original.copy()

    n_mutations = np.random.binomial(gene_len, p_mutation)
    # Sample positions WITHOUT replacement: with replacement the same position
    # could be hit twice (and even mutate back to the original base), so fewer
    # than n_mutations bases would actually end up changed.
    mutation_indices = np.random.choice(gene_len, n_mutations, replace=False)
    for i in mutation_indices:
        gene[i] = np.random.choice(CHOICES[gene[i]])

    # Count the positions that really differ from the source.
    mutations = gene_len - (original == gene).sum()

    return "".join(gene), mutations

In [6]:
# Mutate every gene in row order. The columns are read by label: positional
# lookups such as row[0] on a string-labelled Series are deprecated in pandas.
mutation_results = [
    mutate(genome_index, start, genomes)
    for genome_index, start in zip(genes["source_g"], genes["start_i"])
]
genes["full_text"] = [text for text, _ in mutation_results]
genes["n_mutations"] = [count for _, count in mutation_results]

# Now that we're done simulating genes, join each genome's bases into one string.
# NOTE: this rebinds `genomes` from a 2-D character array to a 1-D array of
# strings, so this cell cannot be re-run without regenerating `genomes` first.
genomes = np.array(["".join(row) for row in genomes])

In [7]:
# Write each genome to its own text file and remember which file holds which
# genome index.
genome_files = {}
for i, genome in enumerate(genomes):
    fname = "../data/genome_{}_{}_{}_{}_{}_{}.txt".format(N_GENOMES, GENOME_LEN, N_GENES, GENE_LEN, P_MUTATION, i)
    with open(fname, "w") as f:
        f.write(genome)
    genome_files[i] = fname

# Swap the integer genome indices for the file names in a single assignment.
# An in-place replace on `genes.source_g` is a chained update that does not
# reach `genes` when pandas Copy-on-Write is active.
genes["source_g"] = genes["source_g"].replace(genome_files)

In [8]:
# Order the genes by source genome file, then save them as CSV with neither
# the index nor a header row.
genes = genes.sort_values("source_g")
genes_csv = "../data/genes_{}_{}_{}_{}_{}.csv".format(N_GENOMES, GENOME_LEN, N_GENES, GENE_LEN, P_MUTATION)
genes.to_csv(genes_csv, index=False, header=False)

In [9]:
# Peek at the first rows of the sorted gene table.
genes.head(n=5)

Unnamed: 0,source_g,start_i,full_text,n_mutations
0,../data/genome_5_100000_2000_100_0.2_0.txt,91701,AGCCTTAGTCGAGACCGCATCGGAAGCAGTTGGTATCGTCCTAGGC...,16
523,../data/genome_5_100000_2000_100_0.2_0.txt,85187,CACAGGTCACTCCTAAATGACATACGTCTTTCCGGATGGCATTACT...,16
526,../data/genome_5_100000_2000_100_0.2_0.txt,19825,AATTCGAGATTAAACAGGGGTGTCTGCTGTACGGGAGGCTGGACTT...,17
528,../data/genome_5_100000_2000_100_0.2_0.txt,85885,CGGCGACACGTGCGATACCCGGTCCCTTTCCACCACCCGTTTATAA...,23
1526,../data/genome_5_100000_2000_100_0.2_0.txt,82297,AGCCGGATTGTGCTACGTCAAATGCAAAGCGAGAAAAACCGAGGCC...,13


In [10]:
# Number of genes drawn from each genome file.
genes["source_g"].value_counts()

../data/genome_5_100000_2000_100_0.2_0.txt    429
../data/genome_5_100000_2000_100_0.2_4.txt    414
../data/genome_5_100000_2000_100_0.2_1.txt    406
../data/genome_5_100000_2000_100_0.2_3.txt    379
../data/genome_5_100000_2000_100_0.2_2.txt    372
Name: source_g, dtype: int64

### Step 4: Evaluate
How often does the longest common substring align with the gene's location?

In [11]:
from difflib import SequenceMatcher

In [14]:
# return predicted start index of gene in genome
matcher = SequenceMatcher(autojunk=False)


def predict_location(gene, source_g):
    matcher.set_seqs(gene, genomes[int(source_g[-5:-4])])
    match = matcher.find_longest_match(0, GENE_LEN, 0, GENOME_LEN)
    # print(f"Length: {match.size}\tgenome_i: {match.b}\tgene_i: {match.a}")
    return match.b - match.a

# Sanity check on the gene labelled 0.
predict_location(genes.loc[0, "full_text"], genes.loc[0, "source_g"])

91701

In [15]:
# Predict the start index for the first ten genes only.
predicted_location = genes.head(10).apply(
    lambda gene_row: predict_location(gene_row.full_text, gene_row.source_g),
    axis="columns",
)

In [16]:
# True where the predicted start equals the simulated start.
predicted_location == genes["start_i"].head(10)

0        True
523      True
526     False
528      True
1526     True
531      True
1524     True
536      True
538      True
540      True
dtype: bool

In [17]:
# The same ten genes, for comparison with the match results.
genes.head(n=10)

Unnamed: 0,source_g,start_i,full_text,n_mutations
0,../data/genome_5_100000_2000_100_0.2_0.txt,91701,AGCCTTAGTCGAGACCGCATCGGAAGCAGTTGGTATCGTCCTAGGC...,16
523,../data/genome_5_100000_2000_100_0.2_0.txt,85187,CACAGGTCACTCCTAAATGACATACGTCTTTCCGGATGGCATTACT...,16
526,../data/genome_5_100000_2000_100_0.2_0.txt,19825,AATTCGAGATTAAACAGGGGTGTCTGCTGTACGGGAGGCTGGACTT...,17
528,../data/genome_5_100000_2000_100_0.2_0.txt,85885,CGGCGACACGTGCGATACCCGGTCCCTTTCCACCACCCGTTTATAA...,23
1526,../data/genome_5_100000_2000_100_0.2_0.txt,82297,AGCCGGATTGTGCTACGTCAAATGCAAAGCGAGAAAAACCGAGGCC...,13
531,../data/genome_5_100000_2000_100_0.2_0.txt,27024,TATTCTTTCACCTGTAATATCGCAGGAGAGGACGTATGCATCCACG...,15
1524,../data/genome_5_100000_2000_100_0.2_0.txt,39795,ATACTCTTTGAGTAGATCCGTAACTGCTTTTCGTGCAATGGCACGG...,17
536,../data/genome_5_100000_2000_100_0.2_0.txt,82929,TTCACCAGCGCCCCACTCAGACAACGGATGGCGGTCGGCCCCCCAG...,19
538,../data/genome_5_100000_2000_100_0.2_0.txt,1703,CGCAGGATTGAAATGCTACCCGATCATACCTGTCCATGGATCTCCA...,15
540,../data/genome_5_100000_2000_100_0.2_0.txt,9761,ACCTGGTTTTAAACACTGTTGACCTTCGGTACACGTATGGACTAGT...,16
