In [65]:
import numpy as np
from helpers.dna_message_finding_helper import *
import random
from copy import deepcopy

In [183]:
def _randomize_search_iteration(dna_list, k, t):
    k_mers = get_random_k_mers(dna_list, k)
    profile = get_profile_of_pattern(k_mers)
    best_score = score_motifs(k_mers)
    best_motifes = k_mers
    while True:
        profile = get_profile_of_pattern(k_mers)
        for i in range(0, len(k_mers)):
            k_mers[i] = find_most_probable_k_mer(dna_list[i], profile, k)
        new_score = score_motifs(k_mers)
        #print(f"old score: {best_score}, new score: {new_score}")
        if new_score < best_score:
            best_motifes = k_mers
            best_score = new_score
        else:
            return (best_motifes, best_score)

In [184]:
def randomize_motif_search(dna_list, k, t, restarts=400):
    """Repeat the randomized motif search from random starts and keep the best result.

    Args:
        dna_list: sequence of strings to search for motifs.
        k: motif length.
        t: forwarded unchanged to ``_randomize_search_iteration``.
        restarts: number of independent random restarts (default 400, the
            value that used to be hard-coded).

    Returns:
        The motif list with the lowest score over all restarts, or ``None``
        when ``restarts`` is 0.
    """
    best_motifes = None
    # Infinity instead of a fixed large number: a finite sentinel would make the
    # function return None whenever every score happens to exceed it.
    best_score = float("inf")
    for _ in range(restarts):
        motifes, score = _randomize_search_iteration(dna_list, k, t)
        if score < best_score:
            best_motifes, best_score = motifes, score
    return best_motifes

In [101]:
def random_k_mer_based_on_prob(k_mers_list, prob_list):
    """Draw one k-mer at random, weighted by the matching entry of ``prob_list``.

    ``prob_list[i]`` is the relative weight of ``k_mers_list[i]``; the weights
    do not have to sum to 1 because ``random.choices`` normalises them.

    Args:
        k_mers_list: candidate k-mers.
        prob_list: non-negative weights, one per candidate.

    Returns:
        One element of ``k_mers_list``.

    Raises:
        ValueError: raised by ``random.choices`` when the two sequences differ
            in length (and, on Python 3.9+, when all weights are zero).
        IndexError: raised by ``random.choices`` when the inputs are empty.
    """
    # The earlier implementation drew a *probability value* uniformly and then
    # scanned the list, which ignored the weights and could fall off the end of
    # the loop returning None. A weighted draw is what the name promises.
    return random.choices(k_mers_list, weights=prob_list)[0]

In [174]:
def _gibbs_search_iteration(dna_list, k, t, N):
    k_mers = get_random_k_mers(dna_list, k)
    profile = get_profile_of_pattern(k_mers)
    best_score = score_motifs(k_mers)
    best_motifes = deepcopy(k_mers)
    #print("Initial k_mers:" + str(k_mers))
    for i in range(0, N):
        j = random.randint(0, t - 1)
        #print("initial:" + str(k_mers))
        deleted_k_mer = k_mers.pop(j)
        deleted_string = dna_list.pop(j)
        #print("Deleted string:" + deleted_k_mer)
        profile = get_profile_of_pattern(k_mers)
        #print("Profile:" + str(profile))
        deleted_str_patterns = list(find_patterns([deleted_string], k))
        #print("Deleted str patterns:" + str(deleted_str_patterns))
        deleted_str_patterns_probs = [find_pattern_probability(profile, pattern) for pattern in deleted_str_patterns]
        normalized_probs = deleted_str_patterns_probs/sum(deleted_str_patterns_probs)
        #print("Normalized prob:" +  str(normalized_probs))
        random_pattern_from_deleted_string = random.choices(deleted_str_patterns, weights = normalized_probs)[0]
        #print("Random pattern:" +  str(random_pattern_from_deleted_string))
        k_mers.insert(j, random_pattern_from_deleted_string)
        dna_list.insert(j, deleted_string)
        #print("final:" + str(k_mers))
        new_score = score_motifs(k_mers)
        if new_score < best_score:
            best_motifes = deepcopy(k_mers)
            best_score = new_score
            #print("Best motifes:" + str(best_motifes) + " score:" + str(best_score))
    return (best_motifes, best_score)

In [178]:
def gibbs_search(dna_list, k, t, N, restarts=20):
    """Run the Gibbs sampler from several random starts and keep the best motifs.

    Args:
        dna_list: sequence of strings to search for motifs.
        k: motif length.
        t: forwarded unchanged to ``_gibbs_search_iteration``.
        N: number of sampling steps per restart.
        restarts: number of independent random restarts (default 20, the value
            that used to be hard-coded).

    Returns:
        The motif list with the lowest score over all restarts, or ``None``
        when ``restarts`` is 0.
    """
    best_motifes = None
    # Infinity instead of a fixed large number: a finite sentinel would make the
    # function return None whenever every score happens to exceed it.
    best_score = float("inf")
    for _ in range(restarts):
        motifes, score = _gibbs_search_iteration(dna_list, k, t, N)
        if score < best_score:
            best_motifes, best_score = motifes, score
    return best_motifes

In [180]:
string = "AATGCGTCCTAGGCTCTCTTATACTTTTCTGTAAAGCCTAGTCAGGCCGCTTATGTTTCTGTAACTTGCAGTAGCAATTCCAAAAATTTGAACAGCCTGCAGTCCACGTGGTCTGGCATCTGGACCGGCAGGAATATATCGCACTCGTGGTTCCGGCTTCGACGCGCGGCGTTTCCTATATGCCTTACAACGTATCTCTACAAGAAGCCCACCTTGTCGGATCGGGGGGGTTCGCAAGTTGGTTCCAGAGGTGTAATCCTACACGGATCGATACTCGGCCACCGATCATGGTCCCATTTCGCAAAAATGCGTCCTAGGCT CTCTTATACTTTTCTGTAAAGCCTAGTCAGGCCGCTTATGTTTCTGTAACTTGCAGTAGCAATTCCAAAAATTTGAACAGCCTGCAGTCCACGTGGTCTGGCATCTGGACCGGCAGGAATATATCCGCGGGCGCAGATGCGCACTCGTGGTTCCGGCTTCGACGCGCGGCGTTTCCTATATGCCTTACAACGTATCTCTACAAGAAGCCCACCTTGTCGGATCGGGGGGGTTCGCAAGTTGGTTCCAGAGGTGTAATCCTACACGGATCGATACTCGGCCACCGATCATGGTCCCATTTCGCAAAAATGCGTCCTAGGCT CAATAATTGACCGCAACTATTTAGCAGACGTGAATCGCACAACAGCTACCGCATGATTCCGGTTACGCAGATGTCTAACTGCTACTGATTATGGTTGTCACACGTACAGCACGGAAGTCTGATCTCTATTTCGGTGATAATCTTACTTACGTGGGACTCTATTATATGTCCACGGGTAATATACTCAACGCAGACGGGCAGAGGCGCCTGCACAAACGGCTCGTCTTGGTGAAGCTGCGTCCCAAGTCCGAATAAATCCTATTCCGCATCAATCGAACATCTGTCGCTCGAAAGAGCTGATGCGACCGCGGTGACGGGCG TAAGGGCCCATGAGAGTCGACCCGACGTGTAACCCAGAGGTACGCCCGCTGGTCTTCCCTTGCCCGCTTGAGTACCAGTATATTGCGCGGCCTGGGGTAAAGGAATCGTCCGCTGACAACAGTGTGGTATGCGCATATTACTGCCAAACAATTGAACGTGATCCTCAGGAATACAACACTAAGATCCATATGATGTATATCCAATTCATCTGAAGGCTGTGCCAATCTTAGCGTCTATATCTACGGTTAGAGAGCCAGAGGGTAGAACATAAGGGACTGTCACGACGCCGCCGGAATGACTCGGACGCGCAGAACGGTGA GACGGTATATGGTTTTTAGCTTTTGGCGTCAAACGTAAAGCTTAGGTGATTTCAAAACGTTTTACGAATTACAGATTCTGGTGGGACACGGTGATAAAAGGTAAAGCTGAAGTTTTACGTCTTGGCGCAGGGTGTGCACATCTTCCTGGTTCCGACAGCCATAGAGTCTATATCGCTTAATTCACCAGCGAGTGTCGATCTCCGTGGATCCTATAGGAGCCAACATAGCTGTTCTTGGTATCAGGCCCAGGTTAAAGCAGAAAGAATATAGCGACGTTAGAACCAAGATATGGGACTTACCGGACAGACAGATGAGGCGT CATCCAAAGACTCCATCTTTCCATGCCAAATTTTCCGCCGGCGGATGTAGTTCGCGCGGCTTGTGATAAGATCTTGCTTGACCGATTCTCCGAGATGAGGCAGGTTATGCTGGCTCAAAGGTCGCTGCTTTATGGTTGAGAGTCCTCTACCTATCGATGCCATTGATAACGCCTTATCGTATTTGACACAGGCATCCGACTCCGCGAGATCAAGCTAGCACGCTTGCGAGTGACAGGTTTGGAGATCTGGTACCGGACGCGCAGTCCAATTATGTGTTACGGTCCCGGCAAACCATCTCGACACGCAATTGTGACAAGAG 
CGGCGACAAGATGCACATGCTGTCTTTGGCCCAGTCCTGATTTTGACATGTCTTGTAAACATCCTGTTCGAGGGTGACATCACATTCAACCGTTTTACGGTGCCCCTTTCCACTATCGAAGCCGGACGCTGCGATGTTCGTCGATAGGGGACGGAGTAGACGCCTTGACAAAGGGATAACTATTCGGTAGCGAAAGGCTAACGAGCGTGCATTGTTTTAGTCTCCAAGAAAGACCATACCCCCGGCCACATGAATTGTGCTTAACAATTGCAGCTATGTGAGTGGCATAATTTCCGCAACCGTCGACTCCGAAAACTATT TTTCTCACCGATCTTGAGCTAAATTCTTATAGTCCTCTTGGCGGGGTGCCGGTACTCTCGATTAGCAGTGAAACAGGGCGGGGATGATCGTGGGTAAATGTAGACTGCTGTCTTCATACTGTACCTTCACCAACTTTCAACCGAGATGTTTCCCCTCAGCGCCATAGTTCACTGCTTGCTCGAAAGCGGCAGCTGCCCCGGGCTCACCTAACGTTCAGCTAGATGCACGATCACTGCTTTATAAAACCTGTCCTCGACGTCATGCTGCGAGCCCGACTGCGCAGATGAGCCCCTGAATCTTCATGCATGGGAACCATATC TTCAGCGGTGCACAATTCGGCCGAAGAGCATCATACGAGCAATAGGTGCGGAACGATCCGGTTCTATTTAGGTAAGATATATTATTGGCTGAAGCCCCCCTGGGTAAGCGTGCTTGACTGCACTCTACGCGCAGATGGCCTAAGGCGAATCGATTTAAGAAGCATCTCGGGCGGATGTGGTTATAACTAATCTGTCCCTCCTATCATGCCCGAACTGCCCCCACTAATCCTGTCAGGCCGCAATGGTGGTACAACGTGCGTCACTAGATGATAACCCCGTCGAGAGTCGATCCCCAGAAAGTTAAACGGCACAGTCCCTC TTAACGATCAATTTTACAAGTGTAATGGTTATTAGACGTCCTTCTCATAGGCCAATATAGTAGACCAACTTCTGTACGACACCTTCACGGGGGGGTACGGAAGGAACCGGCTGAAGCTCATAGCAATCTTGTGTACGGAGAACGTGCTCTGCAGGGAGTCATGAGATATATAGCTAAAATAGTGGCAGCTGTCAAAAAATAGACAGTCCGCTCGGGTAACAAGGTGATGCGTACATGGTAGCAAGACATCTGCTGCACAGAACGACGCGCAGATGCCGTTAGAGTGTCGTGCTCGGCACAGTCGCGTACACACTTGAAAA CTGATAGCTGTTTTATGCGGGCCCGGCCCGAGGGGTCGGAATGCATGCAAGTTCGCCACGACCGGGGCCGCAGATGTACATGGGCGCGATTAGGCACCAGATCCGCTTTGACACGCTCCTGAGACACGACGCTGTCCAAAGTGTCCTGTGAGCTAAGCGCCGATCCTGCTGGCTACATTCAACGAAGCCTCTACGTACGTATTAGTCGGACAATGACGCCGGCGCCTATACAACACAGGTGCGGATGCGTCGTCGCAGTAGACTCTTTCTTTGCGCGCAATTGTATTGTGGCGTGCTCTGGACCCATATATTGGGTGCAT TGCGTTAATACGGTCTTGGATTCACCGTTCGCTTTAACTACCAAAGACTGATGTGAGGTGCCTCGGAGAACGAGTCAGGCGACCTACTATTGGCGCGCTCCCCTCCTAGAGGCAACTCATCCTCTTTGATTGCGGCATAGGAATTCGAATATCTGCGCGCTCTATGCGCGGTAAATGAATGTCCGAATCCGGGTTATGTCTTCCTTGAGTCTAGGGGCACTTAATCGGAGAATGCCCACTAGCCCCGGACGGCAAGATGGGTCTTGCACACGATTGAGATCAGTGTAGATGTTCGCGGTGTCACCCCCTTTATTGCAGTA 
GCACTATGTTACCAGGAGGTCTGGACGCGCTCACTCCCCTAATTGCGCTCTTGGCCTCGTCATCTCGCACCCGCACATCTCAACAGCTACAAATTAGTGTCATGCCCGACGCTAAGATCCCGTTCCTAACTCTCAGTAGAAGATCGAGAGTTCATCTGGGTATTCTTTCTTGTAAACGCTGGCGGGTATCGGCTACTCTGGGGTTACGCTCGCCCTCAGGGCTATTAATAACCTGGTTGTCGAGGATCGTTGAAGATGCCGCACTAATCGCTAAGCATAGGCCCGGAATAGCAGATGCGGGACTCCATAATGGCGGACAG TATGAGCGTTGGTACCAGCTCTTTTACGTTCAATATGTTATTCCTGCTCTGAGCGATTTCTGCCTAGTAGTGGCAAATGAAAGGCAATAAGGAAAAATAGGAACCGGTTAACCAGAACTCCGTTAGCTTGTCGAGACGCGCAGAGCTTCCAGACATGTCTCTCCGCCGCGGCAGAACCGCCGGACTTCCAGATGCGTGGGAGCCCAATTCAAAGTAGTACATACCGCACTCCGGGGAGGACTGGGACTTTGTCCACCGATCATATAAGTCTCTAGCGCGATGCCGTATCTTCTGCCTAGAATGCCAATGTTTACCTCCTT AGCGCATTGAACACTGGATCTTCTCAATTATGAGTTTGTATTAAGTTTATTGAGGAAAGACCAGTCAGAAACATAAGTCCTACCGGGTGTTCCGCTCCAGGCCCCACTGCCTTACTCAGCCTTGCATCAGGCGACTCAACGCTGATACATCACCGTCGTCCTGCCCCCTCGCGCAGATGGTTCCTGTCCGGAATAACACAATCCGACCTTGAGTACTCTTTGGGCGAGGTGGCTAGGGCATGTGTGCGAAGCGGACTGTTATCGGTCTTTCATTGGGAGGGTGAGTCACCCGGTCATTGCTGGGTAAAACTGACAGAACC TACACAGGAAACTATAGCTACGTCAGGCCCGGTCACCGGAATTCTAAGGGGGGCAGTTCGTGATAGGGATCTAATTCATTCAAGTAGCAGCTACACTTTCTAGCCGGACGCGGCTATGAACCCCTGAGAACCCGCGATAGGCCAGGCAACCTTGAGTGGATAAAACTATGCGCACCGTTAATTTCTCTGTGTCACAGACCCGAAAAGCAAGGATTCGCGTCCTTATTGACTGCGATGACCGGTTCTTGATGTATTCTATCGGGTAAGGGAGGCTATTCCGAAACGATCTACTTTATCTATCTCCGAAGTCTACAATCAGG GCACTACGAGATGAGGGTCGCCGTACGGAGTCTTCCGTTCCAATCCTTGGTCCATTGACTAGAGCAGCTATGGGTGGCGGACGGGAGTACGCGCATGGCTTAGTACGATTAAGCTGCCAAGCTCGATCGCTCACTCGGTTATACATAGCTCCCACCCGTTCGTCAGCGTAAGAGCTTATAACATGTGGATCTTCAGTACCCAAGGACGCGCAGATCTCTGGCAATCAGCCGAGGCAGCTCCGCCTTGCTAAGACCTCCTTCCATTACGGTGACATTCCTTGGATTACCAATCAGAACCAGTGTGGACGTGGATCTAATAG ACGATGTCCACCATTAATAAGGTCGCCCATCGCCAAGGCACGACTCACATTAAACGCATTTTCGCGCATCGCCGAGCACGTTAAAGCCGCGTTTGCACAGTGCGATAATCGTATGAAATATGCGTTGATGATTGCCAGTGGTGTAACAATGTCCTGCAGCGAGACATCACCACCTAGAGTAAAAGGTTGTCAGCCTTTTGGGGAGCCCGGACGCGCACGAGGCTCCGCTGGCCACTGTCCTCTACATTTTACTCTCCTATTGAAACGAAGTGCATTTCGGTTTCTTGTAAGAGTGCTAGTGAGCCATGCCTTAGGAAGCA 
CCCTTGAGAACGCTACCGTCGCTGTAACGCAATAATGAGGTACCGGGTCTTAGGTCTGGGTATGCAAACATGAAGTAAGAGGCAATAGGGCGCAGCAGGATGCAGGCGGGCTTTCCATCGCTGAGCATTTCTTACACAGTCCCCTTCTGTACCAAACCCCGTCACATTTTCTGCCGGGCTGGTTGCAAAGTATAAAGCGAGTTGAAAAAATATAACCTTCGCCCTTGGTCTTGGCGGACCGGACGCGCGCTTGTAACGGCTCAGTTTGGAAGAACTCGATATTGGACGCATAAACGGAGAGGTCGAACGCATCCTTCCTG AGATTACCCGTCTATGGTGCTTAGGCGCCCTCTATAGAGCTGACTTACCAAAGACTAACTAGGTTGATTAGAAATGCCCGGCGTTCACCGATTTTGCCCCCGAAATCTAACTTTATCCGGCCGTATAAGCCGTGTACATGTTGGTAATGAAAGCAGTGACGGCACTACAGGGGGCATAATCGAGGGTACGGGGAACCCTGAAGTGCTGTACGTAGAGCTGATACCTCTGGTCATCGGTTCGCTACGACATCCCGGAGAAGCAGATGTAGCGTTTCTGCGTGTCTTCAAGAAGCCGTCTTTGTGAAATCACGGGATTGGGG"
# Split the space-separated text in `string` into individual DNA strings and run the
# motif search on them.
# NOTE(review): `motif_search_per_iteration` is not defined in the cells shown here. Its
# arguments presumably mirror gibbs_search(dna_list, k, t, N), i.e. k=15, t=20, N=2000 —
# confirm it is still provided by the helper module; otherwise this cell fails on a fresh kernel.
# NOTE(review): the trailing comment lists five 8-mers, which does not match k=15 / t=20 —
# presumably a leftover from an earlier sample dataset; verify before relying on it.
motifes = motif_search_per_iteration(string.split(" "), 15, 20, 2000) #TCTCGGGG CCAAGGTG TACAGGCG TTCAGGTG TCCACGTG
# Last expression of the cell: the found motifs joined with single spaces for display.
" ".join(motifes)

'CTGGACCGGCAGGAA CCGCGGGCGCAGATG CCGGTTACGCAGATG TCGGACGCGCAGAAC CCGGACAGACAGATG CCGGACGCGCAGTCC CCGGACGCTGCGATG CCGACTGCGCAGATG CTCTACGCGCAGATG AACGACGCGCAGATG CCGGGGCCGCAGATG CCGGACGGCAAGATG CCGGAATAGCAGATG CCGGACTTCCAGATG CCCCTCGCGCAGATG CCGGACGCGGCTATG AAGGACGCGCAGATC CCGGACGCGCACGAG CCGGACGCGCGCTTG CCGGAGAAGCAGATG'