### Step 1: Set Parameters

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Number of genomes to simulate and the number of bases in each one.
N_GENOMES = 5
GENOME_LEN = 10000  # 100000 was the alternative (previously commented-out) setting

# Number of genes to sample, bases per gene, and the per-base mutation
# probability used when a gene is copied out of its genome.
N_GENES = 100
GENE_LEN = 500
P_MUTATION = 0.1

# Seed NumPy's global RNG so that re-running the notebook from a fresh kernel
# regenerates the same genomes, genes and mutations.
SEED = 42
np.random.seed(SEED)

### Step 2: Simulate Genomes

In [3]:
# Hold every genome as one row of single-character array elements for now
# (they are joined into strings later). Keeping the bases as separate array
# elements makes slicing genes out with NumPy indexing straightforward.
genomes = np.random.choice(
    ['A', 'C', 'G', 'T'],
    size=(N_GENOMES, GENOME_LEN),
)

### Step 3: Simulate Genes
For each gene, save the text of the mutated gene, the index of its source genome, its start index within that genome, and the number of mutations applied.

In [4]:
# Pick, for every gene, the genome it is copied from.
source_g = np.random.choice(N_GENOMES, N_GENES)

# Pick where each gene starts. A gene of GENE_LEN bases fits at any start from
# 0 through GENOME_LEN - GENE_LEN inclusive; np.random.choice(n, ...) draws
# from 0..n-1, hence the +1 so the last valid start can also be sampled.
start_i = np.random.choice(GENOME_LEN - GENE_LEN + 1, N_GENES)

genes = pd.DataFrame({"source_g": source_g, "start_i": start_i})
print(genes.head())

   source_g  start_i
0         3     8143
1         0     8408
2         4     6007
3         1     4748
4         1     6544


In [5]:
# For each base, the three other bases it may be replaced with (never itself).
CHOICES = {
    'A': ['C', 'G', 'T'],
    'C': ['A', 'G', 'T'],
    'G': ['A', 'C', 'T'],
    'T': ['A', 'C', 'G'],
}


def mutate(source_g, start_i, genomes, gene_len=None, p_mutation=None):
    """Copy a gene out of a genome and apply random point mutations.

    The number of positions to mutate is drawn from
    Binomial(gene_len, p_mutation). That many *distinct* positions are then
    chosen and each is replaced by a different base, so no position is hit
    twice (which could otherwise revert it to the original base).

    Parameters
    ----------
    source_g : int
        Row index of the source genome in ``genomes``.
    start_i : int
        Index of the gene's first base within the source genome.
    genomes : np.ndarray
        2-D array of single-character bases, one genome per row. It is not
        modified; the gene is mutated on a copy.
    gene_len : int, optional
        Number of bases to extract. Defaults to the notebook-level ``GENE_LEN``.
    p_mutation : float, optional
        Per-base mutation probability. Defaults to the notebook-level
        ``P_MUTATION``.

    Returns
    -------
    tuple
        ``(text, mutations)`` where ``text`` is the mutated gene joined into a
        string and ``mutations`` is the number of positions at which it
        differs from the source slice.
    """
    if gene_len is None:
        gene_len = GENE_LEN
    if p_mutation is None:
        p_mutation = P_MUTATION

    original = genomes[source_g, start_i:start_i + gene_len]
    gene = original.copy()

    n_mutations = np.random.binomial(gene_len, p_mutation)
    # replace=False: sample positions without replacement so each one is
    # mutated exactly once.
    mutation_indices = np.random.choice(gene_len, n_mutations, replace=False)
    for i in mutation_indices:
        gene[i] = np.random.choice(CHOICES[gene[i]])

    # Count the differences directly instead of trusting n_mutations.
    mutations = gene_len - (original == gene).sum()

    return "".join(gene), mutations

In [6]:
# Mutate every gene in row order; each mutate() call returns
# (mutated_text, n_mutations). The columns are read by name: positional access
# such as row[0] on a label-indexed row is deprecated in recent pandas.
mutation_results = [
    mutate(source, start, genomes)
    for source, start in zip(genes["source_g"], genes["start_i"])
]
genes["full_text"] = [text for text, _ in mutation_results]
genes["n_mutations"] = [count for _, count in mutation_results]

# Now that we're done simulating genes, join each genome's bases into one string
genomes = np.array(["".join(bases) for bases in genomes])

In [33]:
# Write each genome to its own text file and remember which file holds which
# genome index.
genome_files = {}
for i, genome in enumerate(genomes):
    fname = "../data/genome_{}_{}_{}_{}_{}_{}.txt".format(N_GENOMES, GENOME_LEN, N_GENES, GENE_LEN, P_MUTATION, i)
    with open(fname, "w") as f:
        f.write(genome)
    genome_files[i] = fname

# Swap each integer genome index in source_g for that genome's file path.
# The result is assigned back to the column: an in-place replace() through the
# attribute (genes.source_g.replace(..., inplace=True)) acts on an intermediate
# Series and does not update `genes` when pandas Copy-on-Write is active.
# NOTE: from here on source_g holds file paths, not integer indices, so code
# that indexes `genomes` by source_g (e.g. predict_location) needs the
# integer values.
genes["source_g"] = genes["source_g"].replace(genome_files)

In [34]:
# Group the rows by source genome, then export them without the index column
# or a header row. The file name records the simulation parameters.
genes.sort_values(by="source_g", inplace=True)
run_params = (N_GENOMES, GENOME_LEN, N_GENES, GENE_LEN, P_MUTATION)
genes_csv_path = "../data/genes_{}_{}_{}_{}_{}.csv".format(*run_params)
genes.to_csv(genes_csv_path, index=False, header=False)

In [35]:
# Preview the first five rows of the exported gene table.
genes.head(n=5)

Unnamed: 0,source_g,start_i,full_text,n_mutations
35,../data/genome_5_10000_100_500_0.1_0.txt,1228,AACGGCAAAGACAGTCCGTAATTAGGAATCTCAGTGGAAATCACGC...,50
48,../data/genome_5_10000_100_500_0.1_0.txt,9058,CACCCCCCATGGCCTGACTAAGCGCTTGGAGTGTGCAGACGGGATG...,48
56,../data/genome_5_10000_100_500_0.1_0.txt,9003,TGAATTGTGCTTCTTTACGGATTACGATTAGCATTCACGAACGATG...,45
52,../data/genome_5_10000_100_500_0.1_0.txt,4630,AATTCCTTGTACATAAGGGTTATCTCCTCCACCTACACTGGTGGAT...,45
20,../data/genome_5_10000_100_500_0.1_0.txt,3364,GCACCTTATAGCCAACACTCCGTCATGGATTTAGACTTACGGACTC...,54


In [10]:
# Number of genes drawn from each source genome.
genes["source_g"].value_counts()

0    25
2    23
3    19
1    19
4    14
Name: source_g, dtype: int64

### Step 4: Evaluate
How often does the longest common substring between a mutated gene and its source genome align with the gene's true location?

In [11]:
from difflib import SequenceMatcher

In [12]:
# One matcher is reused for every prediction. autojunk=False turns off
# difflib's automatic junk heuristic.
matcher = SequenceMatcher(autojunk=False)


def predict_location(gene, source_g):
    """Predict the start index of ``gene`` within genome ``source_g``.

    Finds the longest contiguous block shared by the gene and the genome and
    projects it back to the gene's first base: if the block starts at offset
    ``a`` in the gene and offset ``b`` in the genome, the predicted start of
    the gene is ``b - a``.

    Parameters
    ----------
    gene : str
        Text of the (possibly mutated) gene.
    source_g : int
        Index into the notebook-level ``genomes`` of the genome to search.

    Returns
    -------
    int
        Predicted index in the genome at which the gene begins.
    """
    genome = genomes[source_g]
    matcher.set_seqs(gene, genome)
    # Search the whole of both sequences, using their actual lengths rather
    # than the notebook-level constants, so the bounds are always in range.
    match = matcher.find_longest_match(0, len(gene), 0, len(genome))
    return match.b - match.a

# Sanity check on the gene whose index label is 0.
predict_location(genes.loc[0, "full_text"], genes.loc[0, "source_g"])

8143

In [13]:
def _predict_row(row):
    """Predict the start index for one row of the gene table."""
    return predict_location(row["full_text"], row["source_g"])


# Predict start positions for the first ten genes only.
predicted_location = genes.head(10).apply(_predict_row, axis=1)

In [14]:
# True where the predicted start equals the recorded start of the gene.
predicted_location == genes["start_i"].head(10)

35    True
21    True
22    True
32    True
72    True
43    True
48    True
79    True
20    True
52    True
dtype: bool

In [15]:
# Show the ten genes that were evaluated above.
genes.head(n=10)

Unnamed: 0,source_g,start_i,full_text,n_mutations
35,0,1228,AACGGCAAAGACAGTCCGTAATTAGGAATCTCAGTGGAAATCACGC...,50
21,0,5896,CCACGAAATATTGACCCTGCAAGTCAATTTCTGATACGTCGCCGTA...,42
22,0,8262,TTAGTGCATGTTTGGAACGCGAGTGGAGATCCTTTTGGTCCTCGAC...,41
32,0,4378,ATCCAGAGGCGTGTTCGTTGAGTACGTCTTTGGGTCGCGCCAGTGG...,48
72,0,6391,CATGTGTCTAAGTTGTACTGCGGCAAACACGGCCGTCGAGGCCATG...,43
43,0,3287,CCCTCTTACAAGGCCCATAACCGAGGGTGCAACATCTGCCTTGAGA...,38
48,0,9058,CACCCCCCATGGCCTGACTAAGCGCTTGGAGTGTGCAGACGGGATG...,48
79,0,8932,GAAACGTCGACAACGGGCAACCTAAGATCTGACCGATTTTTAAGGC...,42
20,0,3364,GCACCTTATAGCCAACACTCCGTCATGGATTTAGACTTACGGACTC...,54
52,0,4630,AATTCCTTGTACATAAGGGTTATCTCCTCCACCTACACTGGTGGAT...,45
