In [232]:
class Profile:
    """Per-position nucleotide count matrix for a collection of k-mers.

    Attributes:
        matrix: list of ``k`` dicts; ``matrix[i][letter]`` is the number of
            counted motifs that have ``letter`` at position ``i``.
        k: motif length covered by the profile.
        size: number of motifs counted so far.
    """

    # Letters tracked in every column; also the number of pseudocounts that
    # `score` adds per column (one per letter).
    ALPHABET = 'ACGT'

    def __init__(self, k, motifs=()):
        """Create a profile of width ``k`` and count every motif in ``motifs``.

        Args:
            k: number of positions (columns) in the profile.
            motifs: iterable of strings of length at least ``k`` over
                ``ALPHABET``. Defaults to an empty, immutable tuple so no
                mutable default object is shared between calls.

        Raises:
            KeyError: if a motif contains a letter outside ``ALPHABET``.
            IndexError: if a motif is shorter than ``k``.
        """
        self.matrix = [dict.fromkeys(self.ALPHABET, 0) for _ in range(k)]
        self.k = k
        self.size = 0
        for motif in motifs:
            self.add(motif)

    def add(self, motif):
        """Count one more motif: bump ``size`` and each column's letter count."""
        self.size += 1
        for i in range(self.k):
            self.matrix[i][motif[i]] += 1

    def score(self, motif):
        """Return the probability of ``motif`` under the profile.

        Uses Laplace pseudocounts: every letter count is incremented by one,
        so each column total grows by ``len(ALPHABET)``. Dividing by that
        total keeps the four probabilities of a column summing to 1.

        Args:
            motif: string of length at least ``k`` over ``ALPHABET``.

        Returns:
            float: product over the ``k`` positions of the smoothed
            per-position letter probabilities.
        """
        column_total = self.size + len(self.ALPHABET)
        value = 1.0
        for i in range(self.k):
            value *= (self.matrix[i][motif[i]] + 1) / column_total

        return value


In [233]:
def most_probable(profile, string):
    """Return the k-mer of ``string`` with the highest ``profile.score``.

    Args:
        profile: object exposing an integer ``k`` and a ``score(kmer)`` method
            returning a number.
        string: text whose length-``k`` windows are scanned left to right.

    Returns:
        The best-scoring window; on ties the leftmost one wins. If no window
        scores above 0.0 (including when ``string`` is shorter than ``k``),
        ``string[0:k]`` is returned.
    """
    k = profile.k
    best_score = 0.0
    mp_kmer = string[0:k]

    for i in range(len(string) - k + 1):
        kmer = string[i:i + k]
        # Score each window exactly once; the value is reused for both the
        # comparison and the update.
        kmer_score = profile.score(kmer)

        if kmer_score > best_score:
            best_score = kmer_score
            mp_kmer = kmer

    return mp_kmer

In [234]:
def score(motifs):
    """Return the total number of mismatches between ``motifs`` and their consensus.

    The width is taken from the first motif. For every column, each motif
    whose letter differs from a most frequent letter of that column counts
    as one mismatch; the per-column counts are summed.

    Args:
        motifs: non-empty sequence of strings accepted by ``Profile``.

    Returns:
        int: the mismatch total (lower is better).
    """
    width = len(motifs[0])
    columns = Profile(width, motifs).matrix
    n_motifs = len(motifs)

    # Whichever letter is picked among those tied for the maximum, exactly
    # `n_motifs - max count` motifs disagree with it in that column.
    return sum(n_motifs - max(counts.values()) for counts in columns)

In [235]:
def make_motifs(profile, dna):
    """Pick, for each string in ``dna``, its most probable k-mer under ``profile``.

    Args:
        profile: profile forwarded unchanged to ``most_probable``.
        dna: iterable of strings.

    Returns:
        list: one k-mer per input string, in input order.
    """
    return [most_probable(profile, sequence) for sequence in dna]

In [236]:
from random import randint

def random_motif(string, k):
    """Return a length-``k`` substring of ``string`` at a random start position.

    The start index is drawn with ``randint`` from 0 through
    ``len(string) - k`` inclusive.
    """
    last_start = len(string) - k
    start = randint(0, last_start)
    stop = start + k

    return string[start:stop]

In [237]:
def RandomizedMotifSearch(dna, k, t):
    """Run one randomized motif search from a random starting set of k-mers.

    Starting from one random k-mer per string, repeatedly build a profile
    from the current best motifs and re-select the most probable k-mer of
    every string under it. Stops as soon as a round fails to strictly lower
    the score.

    Args:
        dna: sequence of strings to search.
        k: motif length.
        t: not used by the implementation (one motif is produced per string
            in ``dna``); kept so existing callers keep working.

    Returns:
        list: the best-scoring motifs found, one per string in ``dna``.
    """
    bestMotifs = [random_motif(string, k) for string in dna]
    # Keep the best score alongside the best motifs so it is not recomputed
    # on every iteration.
    best_score = score(bestMotifs)

    while True:
        # The current motifs always equal the best ones at this point, since
        # the loop only continues after an improvement.
        Motifs = make_motifs(Profile(k, bestMotifs), dna)
        motifs_score = score(Motifs)

        if motifs_score < best_score:
            bestMotifs, best_score = Motifs, motifs_score
        else:
            return bestMotifs

In [275]:
# Path of the input file opened by the loading cell; it is relative, so it is
# resolved against the kernel's current working directory.
filename = 'rosalind_ba2f.txt'

In [276]:
# Input layout: the first line holds the integers k and t; every following
# line holds one DNA string.
with open(filename) as file:
    k, t = [int(x) for x in file.readline().split()]
    # Skip blank lines (e.g. a trailing newline at the end of the file): an
    # empty entry in `dna` would make random k-mer sampling fail later.
    dna = [line.strip() for line in file if line.strip()]

In [277]:
from tqdm import tqdm

In [278]:
# Independent random restarts: one initial run plus 999 more (1000 in total),
# keeping the lowest-scoring motif set seen. Results vary between executions
# because no random seed is fixed.
bestMotifs = RandomizedMotifSearch(dna, k, t)
best_score = score(bestMotifs)  # cached so it is not recomputed per restart

for i in tqdm(range(999)):
    motifs = RandomizedMotifSearch(dna, k, t)
    motifs_score = score(motifs)
    if motifs_score < best_score:
        bestMotifs, best_score = motifs, motifs_score

100%|██████████| 999/999 [00:46<00:00, 21.36it/s]


In [279]:
# Print the selected motifs, one per line, in the order held in bestMotifs.
for motif in bestMotifs:
    print(motif)

CCTGCCTTAACGACC
TCGGAGTTAAAGTCC
TCGTTCTTAAGTACC
TCGTTTCGAAAGTCC
TCGTTCCCTAAGTCC
TCGTTCTTAAAGAAT
TCGTAAGTAAAGTCC
CAGTTCTTAAAGTCG
TCGGGTTTAAAGTCC
TCGTTCTTATCATCC
TCGTTCACCAAGTCC
TCGTTACAAAAGTCC
TCGTTCTCTGAGTCC
TCGTCAATAAAGTCC
TTTGTCTTAAAGTCC
TCGTTCTTAAAACGC
TCTAACTTAAAGTCC
GATTTCTTAAAGTCC
TCGTTCTTGTCGTCC
ACGTTCTTAAAGTTA
