In [1]:
class Profile:
    """Per-position nucleotide count matrix built from k-mers.

    Attributes:
        matrix: list of ``k`` dicts; ``matrix[i][letter]`` is the number of
            added motifs that have ``letter`` at position ``i``.
        k: number of positions (motif length) covered by the profile.
        size: number of motifs added so far.
    """

    # Letters tracked in every column of the count matrix.
    ALPHABET = 'ACGT'

    def __init__(self, k):
        """Create an empty profile with ``k`` all-zero columns."""
        # dict.fromkeys builds a fresh dict per column, so columns never alias.
        self.matrix = [dict.fromkeys(self.ALPHABET, 0) for _ in range(k)]
        self.k = k
        self.size = 0

    def add(self, motif):
        """Count the first ``k`` letters of ``motif`` into the profile.

        Raises:
            IndexError: if ``motif`` is shorter than ``k``.
            KeyError: if a letter is not one of ``ALPHABET``.
        """
        self.size += 1
        for i in range(self.k):
            self.matrix[i][motif[i]] += 1

    def score(self, motif):
        """Return the product of raw column frequencies of ``motif``'s letters.

        Raises:
            ZeroDivisionError: if no motif has been added yet (``size == 0``).
        """
        value = 1.0
        for i in range(self.k):
            value *= self.matrix[i][motif[i]] / self.size

        return value

    def score_with_pseudocounts(self, motif):
        """Return the probability of ``motif`` with a pseudocount of 1 per letter.

        Every letter of the alphabet gets one extra count, so each column's
        total is ``size + len(ALPHABET)``; dividing by that keeps every column
        a proper distribution (its four values sum to 1).
        """
        # One pseudocount per alphabet letter is added to the column total.
        column_total = self.size + len(self.ALPHABET)
        value = 1.0
        for i in range(self.k):
            value *= (self.matrix[i][motif[i]] + 1) / column_total

        return value
        

In [2]:
def most_probable(profile, k, string):
    """Return the k-mer of ``string`` with the highest pseudocount score.

    Args:
        profile: object exposing ``score_with_pseudocounts(kmer)``.
        k: window length.
        string: text to scan.

    Returns:
        The first window whose score is strictly greater than every earlier
        one. If no window scores above 0.0 (or ``string`` is shorter than
        ``k``), ``string[0:k]`` is returned.
    """
    best_score = 0.0
    mp_kmer = string[0:k]

    for start in range(len(string) - k + 1):
        kmer = string[start:start + k]

        # Score each window exactly once; the value is reused for both the
        # comparison and the running best.
        kmer_score = profile.score_with_pseudocounts(kmer)
        if kmer_score > best_score:
            best_score = kmer_score
            mp_kmer = kmer

    return mp_kmer

In [3]:
def score(motifs):
    """Return the total number of letters that differ from the column consensus.

    A ``Profile`` is built over the first ``len(motifs[0])`` positions of the
    given motifs. For every column, the motifs whose letter is not the most
    frequent one are counted, and the counts are summed over all columns.
    """
    width = len(motifs[0])
    profile = Profile(width)
    for row in motifs:
        profile.add(row)

    # Each motif contributes exactly one counted letter per column, so the
    # rows disagreeing with a most-frequent letter number (rows - top count),
    # whichever of several tied letters is taken as the consensus.
    rows = len(motifs)
    return sum(rows - max(column.values()) for column in profile.matrix)

In [4]:
def GreedyMotifSearch(dna, k, t):
    """Greedily pick one k-mer from each of the first ``t`` strings of ``dna``.

    For every k-mer of ``dna[0]`` (windows are taken over ``len(dna[0])``), a
    profile is grown one motif at a time and the most probable k-mer of each
    following string is appended. The collection with the lowest ``score`` is
    kept; on ties the earlier collection wins.

    Args:
        dna: sequence of DNA strings.
        k: motif length.
        t: number of strings to use; expected to satisfy
            ``1 <= t <= len(dna)``.

    Returns:
        A list of ``t`` k-mers, one per string.
    """
    # Baseline uses the same t strings as every candidate, so scores compare
    # collections of equal size and the result always has t motifs.
    bestMotifs = [string[0:k] for string in dna[:t]]
    # Cache the best score instead of rescoring bestMotifs on each iteration.
    best_score = score(bestMotifs)
    str_len = len(dna[0])

    for i in range(str_len - k + 1):
        Motifs = [dna[0][i:i + k]]
        p = Profile(k)

        for j in range(1, t):
            # Fold the latest motif into the profile before choosing the next.
            p.add(Motifs[-1])
            Motifs.append(most_probable(p, k, dna[j]))

        current_score = score(Motifs)
        if current_score < best_score:
            bestMotifs = Motifs
            best_score = current_score

    return bestMotifs

In [13]:
# Path of the input file that is opened and parsed further down.
filename = 'rosalind_ba2e.txt'

In [14]:
with open(filename) as file:
    # The first line carries two whitespace-separated integers: k and t.
    k, t = [int(x) for x in file.readline().split()]
    # Every remaining non-blank line is one DNA string. Blank lines (e.g. a
    # trailing empty line) are skipped so no empty string ends up in dna.
    dna = []
    for line in file:
        line = line.rstrip()
        if line:
            dna.append(line)

In [15]:
# Run the greedy search on the loaded strings with the k and t read from the file.
BestMotifs = GreedyMotifSearch(dna, k, t)

In [16]:
# Print the selected motifs, one per line.
for motif in BestMotifs:
    print(motif)

GCACGTAGCACC
CCAGGTGGCCCC
GCACGTCGCTCC
GCAGGTGGCCCC
ACAAGTGGCGCC
CCAAGTGGCGCC
ACATGTAGCGCC
GCACGTGGCCCC
ACAAGTCGCACC
GCAAGTTGCGCC
GCAGGTAGCACC
GCACGTGGCTCC
TCAGGTCGCCCC
TCAGGTGGCACC
GCAAGTGGCCCC
GCATGTTGCTCC
TCAGGTAGCACC
ACATGTCGCTCC
TCATGTGGCCCC
CCAAGTAGCCCC
GCAGGTGGCCCC
GCAAGTTGCTCC
TCAAGTAGCCCC
GCAAGTAGCTCC
ACAGGTAGCCCC
