In [5]:
import sys
import random
import math

In [6]:
def randomized_motif_search(dna: list[str], k: int, t: int, restarts: int = 1000) -> list[str]:
    """Run randomized motif search with several random restarts.

    Each restart picks one random k-mer per sequence, then repeatedly builds
    a profile from the current motifs and re-selects the most probable k-mer
    of every sequence under that profile. A restart ends as soon as the new
    motif set does not strictly lower the score returned by
    ``calculate_set_score`` (lower is better). The lowest-scoring motif set
    seen across all restarts is returned.

    Args:
        dna: Sequences to search.
        k: Motif length.
        t: Forwarded unchanged to ``create_profile``.
        restarts: Number of independent random restarts (default 1000, the
            value that used to be hard-coded).

    Returns:
        One k-mer per sequence in ``dna`` for the best restart, or an empty
        list when ``restarts`` is 0.
    """
    # Bound up front so the return is valid even if the loop never runs.
    finalMotif: list[str] = []
    finalScore = float("inf")

    for _ in range(restarts):
        # Initialization: one random k-mer from every sequence.
        bestMotifs = [random_motif(seq, k) for seq in dna]
        bestsScore = calculate_set_score(bestMotifs)

        # Iterate until the score stops improving.
        while True:
            profile = create_profile(bestMotifs, k, t)

            # Most probable k-mer of each sequence under the current profile.
            motifs = find_motifs(profile, dna, k)
            motifsScore = calculate_set_score(motifs)

            # Lower is better; stop at the first non-improving step.
            if motifsScore >= bestsScore:
                break
            bestMotifs, bestsScore = motifs, motifsScore

        # Keep this restart's result only if it beats every earlier restart.
        if bestsScore < finalScore:
            finalMotif, finalScore = bestMotifs, bestsScore

    return finalMotif

def random_motif(seq, k):
    """Return a k-long slice of ``seq`` starting at a uniformly random offset.

    The offset is drawn from 0 through ``len(seq) - k`` inclusive, so the
    slice never runs past the end of ``seq``.
    """
    last_valid_start = len(seq) - k
    offset = random.randint(0, last_valid_start)
    return seq[offset:offset + k]

def create_profile(motifs, k, t):
    """Build a per-position nucleotide frequency profile from ``motifs``.

    Every nucleotide count starts at 1 (a pseudocount), so no frequency in
    the result is ever zero. ``t`` is accepted but not used by this function.

    Returns:
        A list of ``k`` dicts, one per motif position, each mapping
        "A", "T", "C" and "G" to its relative frequency at that position.
    """
    columns = []
    for position in range(k):
        # Pseudocount of 1 for each nucleotide.
        tally = dict.fromkeys("ATCG", 1)
        for motif in motifs:
            tally[motif[position]] += 1

        # Normalise the counts so the column sums to 1.
        column_total = sum(tally.values())
        columns.append({nuc: count / column_total for nuc, count in tally.items()})
    return columns


def find_motifs(profile, dna, k):
    """Pick the profile-most-probable k-mer from every sequence in ``dna``.

    Each length-``k`` window of a sequence is scored with
    ``calculate_score``; the first window with the highest score is kept.

    Args:
        profile: Per-position nucleotide probabilities, as produced by
            ``create_profile``.
        dna: Sequences to scan.
        k: Window (motif) length.

    Returns:
        One k-mer per sequence, in the same order as ``dna``. A sequence
        shorter than ``k`` has no windows and contributes an empty string.
    """
    motifs = []
    for seq in dna:
        bestKmer = ""
        # Start below any attainable probability so the first window is always
        # accepted. Starting at 0.0 left bestKmer empty whenever every window
        # scored exactly 0.0 (zero entries in the profile, or float underflow).
        bestScore = -1.0
        for i in range(len(seq) - k + 1):
            kmer = seq[i:i + k]
            score = calculate_score(kmer, profile)
            # Strict comparison keeps the earliest window on ties.
            if score > bestScore:
                bestScore = score
                bestKmer = kmer
        motifs.append(bestKmer)
    return motifs

def calculate_score(motif, profile):
    """Return the probability of ``motif`` under ``profile``.

    The result is the product, over every position of ``motif``, of the
    profile entry for the nucleotide found at that position. An empty motif
    yields 1.0.
    """
    return math.prod(
        (profile[position][nucleotide] for position, nucleotide in enumerate(motif)),
        start=1.0,
    )

def calculate_set_score(motifs):
    """Score a motif set by its total distance from the column-wise consensus.

    For each position, the nucleotides of all motifs are counted and the
    number of motifs that do not carry the most common nucleotide is added
    to the score. Lower is better; 0.0 means all motifs are identical.

    Returns:
        The summed mismatch count as a float.
    """
    motif_length = len(motifs[0])
    mismatches = 0.0
    for column in range(motif_length):
        tally = {"A": 0, "T": 0, "C": 0, "G": 0}
        for motif in motifs:
            tally[motif[column]] += 1
        # Everything that is not the most frequent nucleotide is a mismatch.
        mismatches += sum(tally.values()) - max(tally.values())
    return mismatches


In [7]:
# Example run: search five DNA strings for 8-mers and compare the result with
# a reference motif set.
k = 8  # motif length
t = 5  # passed through as `t`; equals the number of strings in `dna` below
dna = ["CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA", "GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG", "TAGTACCGAGACCGAAAGAAGTATACAGGCGT", "TAGATCAAGTTTCAGGTGCACGTCGGTGAACCAA", "TCCACCAGCTCCACGTGCAATGTTGGCCTA"]
# NOTE(review): no random seed is set in the visible cells, so the motifs
# returned here can differ from run to run.
myMotifs = randomized_motif_search(dna, k, t)

# Reference answer to compare against (presumably the expected output for
# this dataset -- confirm its origin).
correctMotifs = ["TCTCGGGG", "CCAAGGTG", "TACAGGCG", "TTCAGGTG", "TCCACGTG"]
print("Correct Motifs: ", correctMotifs)
print("My Motifs: ", myMotifs)

# Compare by score rather than by identity: different motif sets can reach
# the same score, so equal scores do not imply equal motifs.
correctScore = calculate_set_score(correctMotifs)
myScore = calculate_set_score(myMotifs)
print("Correct Score: ", correctScore)
print("My Score: ", myScore)

Correct Motifs:  ['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG']
My Motifs:  ['AACGGCCA', 'AAGTGCCA', 'TAGTACCG', 'AAGTTTCA', 'ACGTGCAA']
Correct Score:  9.0
My Score:  9.0
