# Implement RandomizedMotifSearch

[ba2f](https://rosalind.info/problems/ba2f/)

    RANDOMIZEDMOTIFSEARCH(Dna, k, t)
        randomly select k-mers Motifs = (Motif_1, …, Motif_t) in each string
            from Dna
        BestMotifs ← Motifs
        while forever
            Profile ← Profile(Motifs)
            Motifs ← Motifs(Profile, Dna)
            if Score(Motifs) < Score(BestMotifs)
                BestMotifs ← Motifs
            else
                return BestMotifs

_Given_: Positive integers k and t, followed by a collection of strings Dna.

_Return_: A collection BestMotifs resulting from running RandomizedMotifSearch(Dna, k, t) 1000 times. Remember to use pseudocounts!

In [214]:
from random import randint
import numpy as np

In [215]:
def iterate_randomized_motif_search(dna, k, t, n_iter=1000):
    """Run randomized_motif_search repeatedly and keep the best result.

    Args:
        dna: collection of DNA strings to search.
        k: motif length.
        t: number of strings; forwarded unchanged to randomized_motif_search.
        n_iter: number of independent restarts (defaults to the 1000 runs
            requested by the problem statement).

    Returns:
        The list of k-mers (one per string) with the lowest get_score seen
        over all restarts. With n_iter <= 0 this is just a random selection.
    """
    # Seed the incumbent from the function's own argument rather than a
    # module-level variable, so the function also works outside the notebook.
    best_motifs = get_random_motif(dna, k)
    best_score = get_score(best_motifs)
    for _ in range(n_iter):
        motifs = randomized_motif_search(dna, k, t)
        score = get_score(motifs)
        # Strict comparison: on equal scores the earlier result is kept.
        if score < best_score:
            best_motifs, best_score = motifs, score
    return best_motifs

In [216]:
def randomized_motif_search(dnas, k, t):
    """Perform one run of the randomized motif search.

    Starts from a random k-mer in every string of dnas, then alternates
    between building a profile from the current motifs and re-selecting the
    most probable k-mer of every string under that profile. The loop ends at
    the first round that fails to lower the score.

    Args:
        dnas: collection of DNA strings.
        k: motif length.
        t: number of strings (accepted for parity with the pseudocode; the
            body does not read it).

    Returns:
        The best-scoring list of motifs reached during this run.
    """
    best_motifs = get_random_motif(dnas, k)
    candidate = best_motifs
    while True:
        candidate = get_motifs(get_profile(candidate), dnas, k)
        # No strict improvement: the previous best is the local optimum.
        if get_score(candidate) >= get_score(best_motifs):
            return best_motifs
        best_motifs = candidate

In [217]:
# checked
def get_random_motif(dnas, k):
    """Return one randomly positioned k-mer from each string in dnas.

    The start offset for every string is drawn with randint from the
    inclusive range [0, len(string) - k].
    """
    def pick(strand):
        start = randint(0, len(strand) - k)
        return strand[start:start + k]

    return [pick(strand) for strand in dnas]

In [218]:
def hamming_distance(s1, s2):
    """Count the positions at which s1 and s2 hold different symbols.

    The two sequences are walked in lockstep with zip, so any surplus tail
    of the longer one is not compared.
    """
    mismatches = 0
    for left, right in zip(s1, s2):
        if left != right:
            mismatches += 1
    return mismatches

In [219]:
# checked
def get_profile(motifs):
    """Build the pseudocount profile matrix of a set of equal-length motifs.

    Args:
        motifs: list of strings over the alphabet 'ACGT'.

    Returns:
        A numpy array with one row per motif position and four columns in
        the order A, C, G, T. Each entry is (count + 1) / (t + 4), where t is
        the number of motifs, so every row sums to 1.
    """
    columns = [''.join(seq) for seq in zip(*motifs)]
    # Laplace pseudocounts: one extra observation per nucleotide adds 4 to
    # the column total, hence the denominator len(col) + 4.
    m = np.array([
        [(col.count(nucl) + 1) / (len(col) + 4) for nucl in 'ACGT']
        for col in columns
    ])
    return m


In [220]:
# checked
def get_motifs(profile, dnas, k):
    """Select the profile-most-probable k-mer from every string in dnas.

    Args:
        profile: 2-D numpy array indexed as profile[position][nucleotide],
            with nucleotides in the order A, C, G, T (the layout produced by
            get_profile).
        dnas: collection of DNA strings.
        k: motif length.

    Returns:
        A list with one k-mer per input string. When several k-mers share the
        highest probability, the leftmost one is chosen.

    Raises:
        ValueError: if a string is shorter than k and so has no k-mer.
    """
    # Per-nucleotide view: probs_for['A'][pos] is P(A at position pos).
    probs_for = {nucl: profile.T[row] for row, nucl in enumerate('ACGT')}

    def kmer_probability(kmer):
        return np.prod([probs_for[nucl][pos] for pos, nucl in enumerate(kmer)])

    motifs = []
    for dna in dnas:
        kmers = [dna[i:i + k] for i in range(len(dna) - k + 1)]
        if not kmers:
            raise ValueError(
                f"DNA string of length {len(dna)} has no k-mer of length {k}"
            )
        # max() returns the first maximal element, so a k-mer is always
        # selected even when every probability is 0 (e.g. after underflow).
        motifs.append(max(kmers, key=kmer_probability))
    return motifs

In [221]:
# checked
def get_score(motifs):
    """Score a motif set as its total Hamming distance to the consensus.

    The consensus takes, at each position, the nucleotide with the largest
    entry in the row returned by get_profile; on ties the earliest letter in
    'ACGT' wins. Lower scores mean more similar motifs.
    """
    alphabet = 'ACGT'
    consensus = []
    for row in get_profile(motifs):
        top = 0
        for idx in range(1, len(row)):
            # Strict comparison keeps the first maximum on ties.
            if row[idx] > row[top]:
                top = idx
        consensus.append(alphabet[top])

    total = 0
    for motif in motifs:
        total += hamming_distance(motif, consensus)
    return total

In [229]:
# Driver cell: load the Rosalind dataset, echo the parsed input, then print
# the best motifs found, one per line.
file = "RandomizedMotifSearch/inputs/rosalind_ba2f.txt"
with open(file, 'r') as f:
    lines = f.readlines()

# The first line carries the two integers k and t.
k, t = (int(token) for token in lines[0].split())
print(k, t)

# Every remaining line is treated as one DNA string. For inputs that put all
# strings on a single whitespace-separated line, use lines[1].split() instead.
dnas = []
for line in lines[1:]:
    dnas.append(line.strip())
print(dnas)

print("\n".join(iterate_randomized_motif_search(dnas, k, t)))

# TCTCGGGG CCAAGGTG TACAGGCG TTCAGGTG TCCACGTG
# CGATAA GGTTAA GGTATA GGTTAA GGTTAC GGTTAA GGCCAA GGTTAA
# TTAACC ATAACT TTAACC TGAAGT TTAACC TTAAGC TTAACC TGAACA
# CATGGGGAAAACTGA CCTCTCGATCACCGA CCTATAGATCACCGA CCGATTGATCACCGA CCTTGTGCAGACCGA CCTTGCCTTCACCGA CCTTGTTGCCACCGA ACTTGTGATCACCTT CCTTGTGATCAATTA CCTTGTGATCTGTGA CCTTGTGATCACTCC AACTGTGATCACCGA CCTTAGTATCACCGA CCTTGTGAAATCCGA CCTTGTCGCCACCGA TGTTGTGATCACCGC CACCGTGATCACCGA CCTTGGTTTCACCGA CCTTTGCATCACCGA CCTTGTGATTTACGA
# TCTCGGGG CCAAGGTG TACAGGCG TTCAGGTG TCCACGTG

15 20
['ACGCACTAGGAAGCTACTGCTAACATGTATAACCTTAACCTGGAGTTGTATTGGGGACTAGACGGGGCAATTACTAGTTATGTGACTAAGGGACGTTGGAGCCGTCAATTACGATTGCAAGGCGGCCGCGTCTGACTGGCTTTGTGTGTTCAGCGTGGCCTCCTTGTTATAAGCATCTACCCTAGTTTATGTAGACGCACTAGGAAGCT', 'ACTGCTAACGAAACTTTGATCTTCATGTATAACCTTAACCTGGAGTTGTATTGGGGACTAGACGGGGCAATTACTAGTTATGTGACTAAGGGACGTTGGAGCCGTCAATTACGATTGCAAGGCGGCCGCGTCTGACTGGCTTTGTGTGTTCAGCGTGGCCTCCTTGTTATAAGCATCTACCCTAGTTTATGTAGACGCACTAGGAAGCT', 'ATGACTCGAAGAGATTGAACCCACAGCATGATAGTTTCCCTGCCTCGTCGGGGAACCCTGCAAATTTATTCCAGCGACACCTTGATCTTGTACGTAACTCACAGTTTATCTAGTCAGAAGATCCCTTGTGCATGTAGCGGTCTACTACAGGCTCGCTGCGGGCCTTGCCAGTTAAATCTTATCCTCCACAAAACTGCCGGAAGGGCCGC', 'GGGCGGGTAGTGTAGAGATTAAGCAGTACCGCTAACCTGTGCGAGCCGTTCGCGAAACTCCTTAGCGTCCTACCTATAGCAAAGAATTGGTGGGAAATTGCCCAGATACCAACGCTGGGCCATGCGACCATTTGACTCTGCATTAACTCTCTCGTAGGCTTATAGGAAGTGACTCTTCCCCCCTCTTATCTGATTCCCCCACCATATTA', 'AAATTCTCCGAAAGTTTGATCTTACATTCTAAAGCTAGCTAAACGAGGTACACGTCGCACTGTGACGGTAATACGCCAATATCCCCTGCCTACACCAAGTGGACTCCGAGGCCGAGTCTACCTAAATACAGACCTTGGGA