In [None]:
#https://github.com/atorang/replication-origin/blob/master/bioinformatics1.py

In [None]:
#count frequency of a pattern in DNA string
def pattern(text, pattern):
    """Count the positions in ``text`` at which ``pattern`` starts.

    Every index of ``text`` is tried as a start position, so overlapping
    occurrences are each counted.
    """
    width = len(pattern)
    return sum(
        1 for start in range(len(text)) if text[start:start + width] == pattern
    )

In [None]:
#most frequent k-mers in DNA string
def frequency(text, k):
    """Return the most frequent k-mers of ``text``.

    Each length-``k`` window of ``text`` is tallied (overlapping windows
    count separately) and every k-mer that reaches the highest tally is
    returned.

    Args:
        text: The string to scan.
        k: Window length; expected to be a positive integer.

    Returns:
        A list of the k-mers with the maximal count, or ``[]`` when ``text``
        has no window of length ``k``.
    """
    # Imported locally so this cell works on its own; the previous version
    # relied on ``itemgetter``, which was never imported.
    from collections import Counter

    counts = Counter(text[i:i + k] for i in range(len(text) - k + 1))
    if not counts:
        return []
    max_count = max(counts.values())
    return [kmer for kmer, count in counts.items() if count == max_count]

In [None]:
# Find reverse complement of DNA string
def reverse(seq):
    """Return ``seq`` with its elements in reverse order."""
    backwards = slice(None, None, -1)
    return seq[backwards]


def complement(seq):
    """Return the base-by-base complement of a DNA sequence.

    Only the upper-case symbols A, C, G and T are mapped; any other symbol
    raises ``KeyError``.
    """
    pairing = {'A': 'T', 'C': 'G', 'T': 'A', 'G': 'C'}
    return ''.join(pairing[base] for base in seq)


def reverse_complement(seq):
    """Return the reverse complement of a DNA sequence.

    The sequence is reversed first and the reversed sequence is then
    complemented.
    """
    return complement(reverse(seq))

In [None]:
# find start positions of pattern in a string
def pattern_start(text, pattern):
    """Return every index of ``text`` at which ``pattern`` starts.

    Indices are listed in ascending order and overlapping occurrences are
    all reported.
    """
    width = len(pattern)
    return [
        start
        for start in range(len(text))
        if text[start:start + width] == pattern
    ]

In [None]:
# find most frequent patterns within a clump in DNA string with frequency above certain threshold
def frequency(text, k):
    """Return the most frequent k-mers of ``text`` together with their counts.

    Each length-``k`` window of ``text`` is tallied (overlapping windows
    count separately).

    Args:
        text: The string to scan.
        k: Window length; expected to be a positive integer.

    Returns:
        A list of ``(kmer, count)`` tuples for every k-mer that reaches the
        highest count, or ``[]`` when ``text`` has no window of length ``k``.
    """
    # Imported locally so this cell works on its own; the previous version
    # relied on ``itemgetter``, which was never imported.
    from collections import Counter

    counts = Counter(text[i:i + k] for i in range(len(text) - k + 1))
    if not counts:
        return []
    max_count = max(counts.values())
    return [(kmer, count) for kmer, count in counts.items() if count == max_count]

In [None]:
def ClumpFinding(text, k, t, L):
    """Find k-mers that occur at least ``t`` times inside some length-``L`` window.

    Every window ``text[i:i + L]`` is scanned and each k-mer whose count
    inside that window is at least ``t`` is recorded together with that
    count.  The same k-mer can therefore appear more than once in the
    result, paired with different per-window counts.

    Args:
        text: The string to scan.
        k: k-mer length; expected to be a positive integer.
        t: Minimum number of occurrences inside one window.
        L: Window length.

    Returns:
        A list of distinct ``(kmer, count)`` tuples in order of first
        discovery; ``[]`` when ``text`` is shorter than ``L``.
    """
    # Local import keeps this cell self-contained.
    from collections import Counter

    # A dict is used as an insertion-ordered set so the result is
    # deterministic while still holding each (kmer, count) pair once.
    clumps = {}
    for start in range(len(text) - L + 1):
        window = text[start:start + L]
        # One pass per window instead of rescanning the window per k-mer.
        counts = Counter(
            window[j:j + k] for j in range(len(window) - k + 1)
        )
        for kmer, count in counts.items():
            if count >= t:
                clumps[(kmer, count)] = None
    return list(clumps)

In [None]:
# Return the skew after each prefix of the DNA string, for prefix lengths 0 to len(genome)
def gcdiff(text):
    """Return the running G-minus-C skew of ``text``.

    The result starts with 0 and has one further entry per symbol of
    ``text``: the skew goes up by one on 'G', down by one on 'C' and is
    unchanged on anything else.
    """
    running = 0
    skews = [running]
    for base in text:
        # Boolean arithmetic: +1 for 'G', -1 for 'C', 0 otherwise.
        running += (base == 'G') - (base == 'C')
        skews.append(running)
    return skews

In [None]:
# Return index of DNA string with min skew
def skew_min(text):
    """Find the prefix lengths at which the G-minus-C skew of ``text`` is lowest.

    The skew starts at 0 before any symbol is read, rises by one on 'G' and
    falls by one on 'C'.  Index ``i`` in the result refers to the skew after
    the first ``i`` symbols.

    Args:
        text: The DNA string to scan.

    Returns:
        The list of indices (ascending) at which the skew equals its
        minimum.  The list is also printed, as before; earlier versions
        printed it without returning it.
    """
    skew = 0
    skew_list = [0]
    for base in text:
        # Boolean arithmetic: +1 for 'G', -1 for 'C', 0 otherwise.
        skew += (base == 'G') - (base == 'C')
        skew_list.append(skew)

    min_skew = min(skew_list)
    index_list = [i for i, value in enumerate(skew_list) if value == min_skew]
    print(index_list)
    return index_list

In [None]:
#get hamming distance between 2 DNA strings of equal length
def hammingd(text1, text2):
    """Count the positions at which ``text1`` and ``text2`` differ.

    Positions are taken from ``text1``; ``text2`` is indexed at each of
    them, so it must be at least as long as ``text1``.
    """
    return sum(1 for pos in range(len(text1)) if text1[pos] != text2[pos])

In [None]:
#find pattern that matches DNA string with mismatch
def mismatch_pattern(text, pattern, mismatch):
    """Return the start positions where ``pattern`` approximately matches ``text``.

    A window of ``text`` matches when its Hamming distance to ``pattern`` is
    at most ``mismatch``.  The positions are returned as a single
    space-separated string.
    """
    width = len(pattern)
    hits = [
        str(start)
        for start in range(len(text) - width + 1)
        if hammingd(text[start:start + width], pattern) <= mismatch
    ]
    return ' '.join(hits)

# returns index of matched pattern with a mismatch in a DNA string
def mismatch_pattern_return(text, pattern, mismatch):
    """Return approximate-match start positions as a space-separated string.

    A position is reported when the window of ``text`` starting there is
    within ``mismatch`` Hamming distance of ``pattern``.
    """
    width = len(pattern)
    positions = []
    for start in range(len(text) - width + 1):
        window = text[start:start + width]
        if hammingd(window, pattern) > mismatch:
            continue
        positions.append(start)
    return ' '.join(map(str, positions))

In [None]:
#count number of times pattern occurs in text with slight mismatch
def count_hamd(text, pattern, mismatch):
    """Count the windows of ``text`` within ``mismatch`` mismatches of ``pattern``.

    Each full-length window of ``text`` is compared with ``pattern`` using
    the Hamming distance.
    """
    width = len(pattern)
    return sum(
        1
        for start in range(len(text) - width + 1)
        if hammingd(text[start:start + width], pattern) <= mismatch
    )

#count highest frequency pattern that occurs in text with slight mismatch
def frequency_hamd(text, k, d):
    """Return the k-mers of ``text`` with the most approximate occurrences.

    Only k-mers that literally occur in ``text`` are considered.  Each is
    scored with ``count_hamd``, i.e. the number of windows of ``text``
    within ``d`` mismatches of it.

    Args:
        text: The string to scan.
        k: k-mer length.
        d: Maximum number of mismatches allowed per occurrence.

    Returns:
        A list of ``(kmer, count)`` tuples for every k-mer reaching the
        highest count, or ``[]`` when ``text`` has no window of length ``k``.
    """
    counts = {}
    for i in range(len(text) - k + 1):
        kmer = text[i:i + k]
        # Score each distinct k-mer once, however often it repeats.
        if kmer not in counts:
            counts[kmer] = count_hamd(text, kmer, d)
    if not counts:
        return []
    # Plain max() replaces ``itemgetter``, which was never imported.
    max_count = max(counts.values())
    return [(kmer, count) for kmer, count in counts.items() if count == max_count]
   

In [None]:
# get most frequent k-mers with up to d mismatches
from itertools import combinations, product

def generate(s, d):
    """Yield every string that differs from ``s`` in exactly ``d`` positions.

    Each choice of ``d`` positions is combined with each choice of ``d``
    replacement letters from 'ACGT'; combinations in which any replacement
    equals the symbol already at its position are skipped.  With ``d == 0``
    the only string yielded is ``s`` itself.
    """
    letters = 'ACGT'
    original = list(s)

    for positions in combinations(range(len(original)), d):
        for substitutes in product(letters, repeat=d):
            # A "replacement" by the same letter would not be a mismatch.
            if any(original[p] == sub for p, sub in zip(positions, substitutes)):
                continue
            mutated = list(original)
            for p, sub in zip(positions, substitutes):
                mutated[p] = sub
            yield ''.join(mutated)
            
def frequency_hamd(text, k, d):
    """Return the most frequent k-mers of ``text`` allowing up to ``d`` mismatches.

    Candidates are all strings within Hamming distance ``d`` of some k-mer
    of ``text`` (built with ``generate`` for every distance from 0 to ``d``).
    Each candidate is scored with ``count_hamd``.

    Args:
        text: The string to scan.
        k: k-mer length.
        d: Maximum number of mismatches.

    Returns:
        A sorted list of the candidates with the highest count, or ``[]``
        when there are no candidates.
    """
    kmers = {text[i:i + k] for i in range(len(text) - k + 1)}

    candidates = set()
    for kmer in kmers:
        # generate() yields strings at *exactly* the given distance, so all
        # distances 0..d are needed to cover "up to d" mismatches.  Using
        # only distance d (as before) misses closer neighbours.
        for distance in range(d + 1):
            candidates.update(generate(kmer, distance))
    if not candidates:
        return []

    counts = {candidate: count_hamd(text, candidate, d) for candidate in candidates}
    # Plain max() replaces ``itemgetter``, which was never imported.
    max_count = max(counts.values())
    return sorted(c for c, count in counts.items() if count == max_count)

In [None]:
#match k-mer patterns in DNA with up to d mismatches, also counting reverse complements

def count_hamd_reverse(text, pattern, mismatch):
    """Count approximate occurrences of ``pattern`` and of its reverse complement.

    Every full-length window of ``text`` is compared with both ``pattern``
    and ``reverse_complement(pattern)``; each comparison whose Hamming
    distance is at most ``mismatch`` adds one to the total, so a single
    window can contribute twice.

    Args:
        text: The string to scan.
        pattern: The k-mer to look for.
        mismatch: Maximum Hamming distance for a window to count.

    Returns:
        The combined number of matches.
    """
    sub_len = len(pattern)
    window_count = len(text) - sub_len + 1
    if window_count <= 0:
        # No windows: nothing to compare, and the reverse complement is not
        # computed (as in the loop-based original).
        return 0

    # Loop-invariant: compute the reverse complement once, not per window.
    pattern_rc = reverse_complement(pattern)

    results = 0
    for i in range(window_count):
        window = text[i:i + sub_len]
        hamd = hammingd(window, pattern)
        hamd_r = hammingd(window, pattern_rc)
        if hamd <= mismatch:
            results += 1
        if hamd_r <= mismatch:
            results += 1
    return results

def frequency_hamd_reverse(text, k, d):
    """Return the most frequent k-mers counting mismatches and reverse complements.

    Candidates are all strings within Hamming distance ``d`` of some k-mer
    of ``text`` (built with ``generate`` for every distance from 0 to ``d``)
    plus the reverse complement of each such string.  Each candidate is
    scored with ``count_hamd_reverse``.

    Args:
        text: The string to scan.
        k: k-mer length.
        d: Maximum number of mismatches.

    Returns:
        A sorted list of the candidates with the highest count, or ``[]``
        when there are no candidates.
    """
    kmers = {text[i:i + k] for i in range(len(text) - k + 1)}

    candidates = set()
    for kmer in kmers:
        # generate() yields strings at *exactly* the given distance, so all
        # distances 0..d are needed to cover "up to d" mismatches.
        for distance in range(d + 1):
            for neighbour in generate(kmer, distance):
                candidates.add(neighbour)
                candidates.add(reverse_complement(neighbour))
    if not candidates:
        return []

    counts = {
        candidate: count_hamd_reverse(text, candidate, d)
        for candidate in candidates
    }
    # Plain max() replaces ``itemgetter``, which was never imported.
    max_count = max(counts.values())
    return sorted(c for c, count in counts.items() if count == max_count)

In [None]:
# given collection of strings Dna and an integer d, a k-mer is a (k,d)-motif if it appears in every string from Dna with at most d mismatches

def motifenumeration(DNA_list, k, d):
    """Enumerate the (k, d)-motifs shared by every string in ``DNA_list``.

    Candidates are the strings within Hamming distance ``d`` of some k-mer
    of the first string (built with ``generate`` for distances 0 to ``d``).
    A candidate is kept when ``count_hamd`` finds at least one approximate
    occurrence of it in each of the remaining strings.

    Args:
        DNA_list: The DNA strings to search.
        k: Motif length.
        d: Maximum number of mismatches per occurrence.

    Returns:
        A sorted list of the distinct motifs; ``[]`` when ``DNA_list`` is
        empty.
    """
    if not DNA_list:
        return []
    first, others = DNA_list[0], DNA_list[1:]

    # Collect candidates in a set so each is checked once.  The earlier
    # version kept appending to one list and re-checked every previous
    # candidate on each window of the first string.
    candidates = set()
    for start in range(len(first) - k + 1):
        kmer = first[start:start + k]
        for distance in range(d + 1):
            candidates.update(generate(kmer, distance))

    return sorted(
        candidate
        for candidate in candidates
        if all(count_hamd(dna, candidate, d) > 0 for dna in others)
    )

In [None]:
#Median string finding function
import sys

# Maps each nucleotide symbol to a base-4 digit: A=0, C=1, G=2, T=3.
switcher = {symbol: digit for digit, symbol in enumerate("ACGT")}

def __number_to_symbol(val):
    """Return the symbol that ``switcher`` maps to ``val``, or None if there is none."""
    return next(
        (symbol for symbol, value in switcher.items() if value == val),
        None,
    )

def number_to_pattern(index, k):
    """Convert ``index`` to the k-mer it encodes in base 4.

    The last symbol comes from ``index % 4`` and the leading ``k - 1``
    symbols are obtained recursively from ``index // 4``.
    """
    if k != 1:
        quotient, remainder = divmod(index, 4)
        return number_to_pattern(quotient, k - 1) + __number_to_symbol(remainder)
    return __number_to_symbol(index)

def median_string(dna, k):
    """Find the k-mers that minimise the total distance to the strings in ``dna``.

    All ``4 ** k`` k-mers are enumerated with ``number_to_pattern`` and
    scored with ``__distance_between_pattern_and_strings``.

    Args:
        dna: The collection of DNA strings.
        k: Length of the candidate patterns.

    Returns:
        A list of ``(pattern, distance)`` tuples, one for every k-mer that
        attains the minimum distance, in enumeration order.
    """
    best_distance = sys.maxsize
    median = []
    # range(4 ** k) covers every k-mer; the earlier bound of 4 ** k - 1
    # never evaluated the last one (all 'T').
    for index in range(4 ** k):
        candidate = number_to_pattern(index, k)
        candidate_distance = __distance_between_pattern_and_strings(candidate, dna)
        if candidate_distance < best_distance:
            # Strictly better: discard the patterns collected so far, which
            # are no longer minimal.
            best_distance = candidate_distance
            median = [(candidate, candidate_distance)]
        elif candidate_distance == best_distance:
            median.append((candidate, candidate_distance))

    return median
        
def hammingd(text1, text2):
    """Count the positions at which ``text1`` and ``text2`` differ.

    ``text2`` is indexed at every position of ``text1``, so it must be at
    least as long as ``text1``.
    """
    mismatches = 0
    for pos, symbol in enumerate(text1):
        if symbol == text2[pos]:
            continue
        mismatches += 1
    return mismatches
    
def __distance_between_pattern_and_strings(pattern, dna):
    """Sum, over the strings in ``dna``, the smallest distance to ``pattern``.

    For each string the Hamming distance between ``pattern`` and every
    window of the same length is computed and the minimum is taken.  A
    string with no such window contributes ``sys.maxsize``.
    """
    width = len(pattern)
    total = 0
    for sequence in dna:
        total += min(
            (
                hammingd(pattern, sequence[start:start + width])
                for start in range(len(sequence) - width + 1)
            ),
            default=sys.maxsize,
        )
    return total

In [None]:
# Profile-most probable k-mer in Text
def most_probable_motif(text, k, profile_matrix):
    """Return the k-mer of ``text`` with the highest probability under a profile.

    ``profile_matrix[symbol][column]`` supplies the probability of
    ``symbol`` at ``column``.  The earliest window wins ties, and when no
    window has a probability above zero the first ``k`` symbols of ``text``
    are returned.
    """
    best_kmer = ""
    best_probability = 0
    for start in range(len(text) - k + 1):
        window = text[start:start + k]
        probability = 1
        for column, base in enumerate(window):
            probability *= profile_matrix[base][column]
        if probability > best_probability:
            best_probability, best_kmer = probability, window

    return best_kmer if best_kmer != "" else text[:k]

In [None]:
# greedy-search motif
from collections import Counter
def generate_profile_matrix(kmers, count):
    """Build a profile matrix (per-column symbol frequencies) from ``kmers``.

    No pseudocounts are applied, so symbols absent from a column get 0.

    Args:
        kmers: Non-empty list of equal-length strings over 'ACGT'.
        count: Divisor applied to the per-column counts; callers in this
            file pass ``len(kmers)``.

    Returns:
        A dict mapping each of 'A', 'C', 'G', 'T' to a list with one
        frequency per column, rounded to four decimals.

    Raises:
        AssertionError: If some column's frequencies do not sum to 1
            within 1e-3.
    """
    first = kmers[0]
    # Seed the counts from the first k-mer.  The previous expression,
    # ``(letter == c) if float(1) else float(0)``, had the condition and the
    # value swapped and stored booleans; this is the intended 1.0 / 0.0.
    matrix = {
        letter: [1.0 if letter == base else 0.0 for base in first]
        for letter in "ACGT"
    }

    for kmer in kmers[1:]:
        for position, base in enumerate(kmer):
            matrix[base][position] += 1

    # Turn the counts into frequencies.
    for column_values in matrix.values():
        for position, value in enumerate(column_values):
            column_values[position] = round(value / count, 4)

    # Sanity check: each column must be a probability distribution.
    for position in range(len(first)):
        column_total = sum(matrix[letter][position] for letter in matrix)
        assert abs(column_total - 1) < 1e-3

    return matrix

def __score_motifs(motifs, k):
    """Score ``motifs`` by counting symbols that differ from the column consensus.

    For each of the first ``k`` columns the most common symbol is found
    with ``Counter`` and every other symbol in that column adds one to the
    score.
    """
    total = 0
    for column_index in range(k):
        column = [motif[column_index] for motif in motifs]
        consensus = Counter(column).most_common(1)
        total += len([base for base in column if base != consensus[0][0]])
    return total

def greedy_motif_search(sequences, k):
    """Greedy motif search without pseudocounts.

    The best motifs start as the first k-mer of every sequence.  For each
    k-mer of the first sequence a motif set is grown one sequence at a
    time, choosing in each the k-mer that ``most_probable_motif`` picks
    under the profile of the motifs chosen so far.  The motif set with the
    lowest ``__score_motifs`` value is returned.
    """
    best_motifs = [sequence[:k] for sequence in sequences]

    for start in range(len(sequences[0]) - k + 1):
        candidate = [sequences[0][start:start + k]]
        profile = generate_profile_matrix(candidate, len(candidate))

        for sequence in sequences[1:]:
            candidate.append(most_probable_motif(sequence, k, profile))
            profile = generate_profile_matrix(candidate, len(candidate))

        if __score_motifs(candidate, k) < __score_motifs(best_motifs, k):
            best_motifs = candidate
    return best_motifs

In [None]:
def Profile_Pseudocounts(Motifs, pseudocount):
    """Return the profile of ``Motifs`` with ``pseudocount`` added to every count.

    The count matrix from ``Count`` is converted in place: each entry
    becomes ``(count + pseudocount) / (len(Motifs) + 4 * pseudocount)``.
    """
    profile = Count(Motifs)
    width = len(Motifs[0])
    denominator = len(Motifs) + pseudocount * 4
    for row in range(4):
        for column in range(width):
            profile[row][column] = (profile[row][column] + pseudocount) / denominator
    return profile

def Probability(Pattern, Profile):
    """Return the probability of ``Pattern`` under a row-indexed ``Profile``.

    ``Profile`` rows 0 to 3 are read for A, C, G and T respectively, and the
    result is the product of one entry per column of ``Pattern``.
    """
    row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    result = 1
    for column, base in enumerate(Pattern):
        result = result * Profile[row_of[base]][column]
    return result

def ProfileMostProbableKmer(Text, k, profile):
    """Return the first k-mer of ``Text`` with the highest ``Probability``.

    Every window of length ``k`` is scored with ``Probability`` and the
    earliest window whose score equals the maximum is returned.
    """
    windows = [Text[start:start + k] for start in range(len(Text) - k + 1)]
    scores = [Probability(window, profile) for window in windows]
    top_score = max(scores)
    return next(
        (window for window, score in zip(windows, scores) if score == top_score),
        [],
    )

def Count(Motifs):
    """Return the count matrix of ``Motifs``.

    The result has one row per symbol in the order A, C, G, T; entry
    ``[row][column]`` is the number of motifs showing that symbol at that
    column.  Columns are taken from the length of the first motif.
    """
    width = len(Motifs[0])
    per_column = [
        [sum(1 for motif in Motifs if motif[column] == symbol) for symbol in 'ACGT']
        for column in range(width)
    ]
    # Transpose from column-major tallies to one row per symbol.
    return [list(row) for row in zip(*per_column)]

def Score(Motifs):
    """Return the total Hamming distance between each motif and the consensus.

    The consensus comes from ``Consensus`` and each distance from
    ``hammingd(consensus, motif)``.
    """
    consensus = Consensus(Motifs)
    return sum(hammingd(consensus, motif) for motif in Motifs)

def Consensus(Motifs):
    """Return the consensus string of ``Motifs``.

    For every column the occurrences of A, C, G and T are counted and the
    most frequent symbol is taken; ties go to the earliest symbol in the
    order A, C, G, T.  Symbols other than these four are ignored.

    Motifs of length 1 are handled like any other length.  The earlier
    special case returned an empty consensus for them and printed a
    message about the number of strings.

    Args:
        Motifs: Non-empty list of equal-length strings.

    Returns:
        The consensus string, one symbol per column of the first motif.
    """
    symbols = 'ACGT'
    consensus = ''
    for column in range(len(Motifs[0])):
        letters = [motif[column] for motif in Motifs]
        tallies = [letters.count(symbol) for symbol in symbols]
        # list.index returns the first maximum, which gives the A, C, G, T
        # tie-break order.
        consensus += symbols[tallies.index(max(tallies))]
    return consensus

def GreedyMotifSearchWithPseudocounts(Dna, k, t, pseudocount):
    """Greedy motif search using a pseudocount profile.

    The best motifs start as the first k-mer of every string in ``Dna``.
    For each k-mer of ``Dna[0]`` a motif set is grown over strings 1 to
    ``t - 1``, picking in each the k-mer chosen by
    ``ProfileMostProbableKmer`` under the pseudocount profile of the motifs
    selected so far.  The set with the lowest ``Score`` is returned.
    """
    BestMotifs = [sequence[:k] for sequence in Dna]
    for start in range(len(Dna[0]) - k + 1):
        Motifs = [Dna[0][start:start + k]]
        for row in range(1, t):
            profile = Profile_Pseudocounts(Motifs, pseudocount)
            Motifs.append(ProfileMostProbableKmer(Dna[row], k, profile))
        if Score(Motifs) < Score(BestMotifs):
            BestMotifs = Motifs
    return BestMotifs

In [2]:
#Random Motif search 
import random

# Loop bound for the restart loop at the bottom of this cell, which runs
# range(N + 1) further randomized searches.
N = 1000
def randomized_motif_search(Dna, k, t):
    """Run one randomized motif search and return the best motifs found.

    Starting from ``random_motifs``, the motifs are repeatedly replaced by
    the profile-most-probable k-mers under their own pseudocount profile.
    The search stops, returning the best set so far, as soon as a new set
    does not score strictly lower (by ``_score``) than the best.
    """
    current = random_motifs(Dna, k, t)
    best = current
    while True:
        current = _motifs(profile_with_pseudocounts(current), Dna)
        if not _score(current) < _score(best):
            return best
        best = current

###############################################MAINCODE#########################################################################

###################################################SubCodes#####################################################################
def random_motifs(Dna, k, t):
    """Pick one random k-mer from each of the first ``t`` strings of ``Dna``.

    Args:
        Dna: Sequence of DNA strings; must hold at least ``t`` entries.
        k: Length of the k-mers to pick.
        t: Number of strings to sample from.

    Returns:
        A list of ``t`` k-mers, one per string, each starting at a position
        chosen uniformly among all valid start positions of that string.

    Raises:
        ValueError: From ``random.randint`` if a string is shorter than ``k``.
    """
    randMotifs = []

    for i in range(t):
        sequence = Dna[i]
        # Valid starts are 0 .. len(sequence) - k (randint is inclusive).
        # The previous bound of ``t`` was unrelated to the string length:
        # it could produce truncated k-mers and never sampled later starts.
        start = random.randint(0, len(sequence) - k)
        randMotifs.append(sequence[start:start + k])

    return randMotifs

def _motifs(profile, Dna):
    """Return the profile-most-probable k-mer of every string in ``Dna``.

    The k-mer length is the number of columns in ``profile['A']``.
    """
    width = len(profile['A'])
    return [
        profile_most_probable_kmer(sequence, width, profile) for sequence in Dna
    ]

def profile_most_probable_kmer(text, k, profile):
    """Return the k-mer of ``text`` that ``_pr`` scores highest under ``profile``.

    The earliest window wins ties; an empty string is returned when
    ``text`` has no window of length ``k``.
    """
    best_kmer, best_value = '', -1

    for start in range(len(text) - k + 1):
        candidate = text[start:start + k]
        value = _pr(candidate, profile)
        if value > best_value:
            best_kmer, best_value = candidate, value

    return best_kmer

def _pr(text, profile):
    """Return the product of ``profile[symbol][column]`` over the symbols of ``text``."""
    likelihood = 1
    for column, symbol in enumerate(text):
        likelihood = likelihood * profile[symbol][column]
    return likelihood

def profile_with_pseudocounts(motifs):
    """Return the pseudocount profile of ``motifs`` keyed by symbol.

    Each entry of ``count_with_pseudocounts(motifs)`` is divided by
    ``len(motifs) + 4``.  The result maps 'A', 'C', 'G' and 'T' to one
    list of values per column.
    """
    denominator = float(len(motifs) + 4)
    counts = count_with_pseudocounts(motifs)

    profile = {symbol: [] for symbol in "ACGT"}
    for symbol, column_counts in counts.items():
        profile[symbol].extend(value / denominator for value in column_counts)

    return profile

def count_with_pseudocounts(motifs):
    """Return per-column symbol counts of ``motifs``, each increased by one.

    The result maps 'G', 'A', 'C' and 'T' to a list with one entry per
    column of the first motif.
    """
    width = len(motifs[0])

    # Start every tally at 1: that is the pseudocount.
    tallies = {symbol: [1] * width for symbol in "GACT"}
    for motif in motifs:
        for column in range(width):
            tallies[motif[column]][column] += 1

    return tallies

def _score(motifs):
    """Count the symbols in ``motifs`` that differ from the ``_consensus`` string.

    Only the columns of the first motif's length are compared.
    """
    width = len(motifs[0])
    consensus = _consensus(motifs)
    return sum(
        1
        for motif in motifs
        for column in range(width)
        if motif[column] != consensus[column]
    )

def _consensus(motifs):
    """Return the consensus string of ``motifs``.

    For each column the symbol with the largest value in
    ``count_with_pseudocounts(motifs)`` is chosen; ties go to the earliest
    symbol in the order A, C, G, T.
    """
    width = len(motifs[0])
    tallies = count_with_pseudocounts(motifs)

    chosen = []
    for column in range(width):
        top_symbol, top_count = "", 0
        for symbol in "ACGT":
            candidate_count = tallies[symbol][column]
            if candidate_count > top_count:
                top_symbol, top_count = symbol, candidate_count
        chosen.append(top_symbol)

    return "".join(chosen)
###################################################SubCodes#####################################################################
# Run the randomized search once, then N + 1 more times, keeping the
# lowest-scoring motif set seen across all runs.
# NOTE(review): `dna` is not defined in this file; presumably it is set in
# an earlier notebook cell -- confirm before running.
M = randomized_motif_search(dna, 15, 20)
bMotifs = M

for i in range(N + 1):
    M = randomized_motif_search(dna, 15, 20)
    if _score(M) < _score(bMotifs):
        bMotifs = M

# Publish the overall best only after the loop.  Assigning it inside the
# loop's `else` branch left it stale whenever the final run improved the
# score, and undefined if every run did.
bestMotifs = bMotifs

print ('\n'.join(bestMotifs))

In [None]:
#Gibbs sampler motif search
from collections import Counter
def ProfileRandomlyGeneratedKmer(Text, profile, k):
    """Pick a random k-mer of ``Text``, weighted by its ``Probability`` under ``profile``.

    Each window of length ``k`` is scored with ``Probability``; the scores
    are normalised to sum to one and a window is drawn according to those
    weights using a single ``random.uniform(0, 1)`` draw.

    Args:
        Text: The string to sample a k-mer from.
        profile: Profile matrix accepted by ``Probability``.
        k: Length of the k-mer.

    Returns:
        The selected k-mer, or ``''`` when ``Text`` has no window of
        length ``k``.

    Raises:
        ZeroDivisionError: If every window has probability zero.
    """
    weights = [
        Probability(Text[start:start + k], profile)
        for start in range(len(Text) - k + 1)
    ]
    if not weights:
        return ''

    total = sum(weights)
    threshold = random.uniform(0, 1)
    cumulative = 0
    for start, weight in enumerate(weights):
        cumulative += weight / total
        # Strict comparison so that zero-weight windows are never selected.
        if threshold < cumulative:
            return Text[start:start + k]

    # Floating-point rounding can leave the cumulative sum marginally below
    # the draw.  Fall back to the last window rather than returning an empty
    # string, as the earlier version could.
    last = len(weights) - 1
    return Text[last:last + k]

def Profile_Pseudocounts(Motifs, pseudocount):
    """Convert the count matrix of ``Motifs`` into a pseudocount profile.

    Every entry of ``Count(Motifs)`` is replaced in place by
    ``(count + pseudocount) / (len(Motifs) + 4 * pseudocount)`` and the
    resulting four-row matrix is returned.
    """
    table = Count(Motifs)
    total = len(Motifs) + pseudocount * 4
    for column in range(len(Motifs[0])):
        for row in range(4):
            table[row][column] = (table[row][column] + pseudocount) / total
    return table

In [None]:
def GibbsSampler(Dna, k, t, N):
    """Run one Gibbs-sampling motif search and return the best motifs seen.

    One random k-mer is taken from each of the first ``t`` strings.  Then,
    ``N`` times, one randomly chosen motif is left out, a profile with
    pseudocount 1 is built from the others, and the left-out motif is
    resampled from its string with ``ProfileRandomlyGeneratedKmer``.  The
    motif set with the lowest ``Score`` is tracked throughout.

    Args:
        Dna: Sequence of DNA strings; must hold at least ``t`` entries.
        k: Motif length.
        t: Number of strings to use.
        N: Number of resampling iterations.

    Returns:
        The lowest-scoring list of ``t`` motifs encountered.
    """
    motifs = []
    for i in range(t):
        # Bound the start by this string's own length; the earlier version
        # used len(Dna[0]) for every string, which is only right when all
        # strings have the same length.
        start = random.randint(0, len(Dna[i]) - k)
        motifs.append(Dna[i][start:start + k])

    BestMotifs = motifs.copy()
    best_score = Score(BestMotifs)
    for _ in range(N):
        i = random.randint(0, t - 1)
        others = motifs[:i] + motifs[i + 1:]
        profile = Profile_Pseudocounts(others, 1)
        motifs[i] = ProfileRandomlyGeneratedKmer(Dna[i], profile, k)
        current_score = Score(motifs)
        if current_score < best_score:
            # Copy: `motifs` keeps being mutated in later iterations.
            BestMotifs = motifs.copy()
            best_score = current_score
    return BestMotifs
###################################################################
def GibbsSampler_nTime(Dna, k, t, N, repeat):
    """Run ``GibbsSampler`` ``repeat`` times and return a lowest-scoring result.

    Each run is scored with ``Score``.  When several runs share the lowest
    score, the motifs of the last such run are returned.
    """
    runs = []
    for _ in range(repeat):
        sampled = GibbsSampler(Dna, k, t, N)
        runs.append((Score(sampled), sampled))

    lowest = min(score for score, _ in runs)
    # Walk backwards so the last run with the lowest score is the one chosen.
    for score, sampled in reversed(runs):
        if score == lowest:
            return sampled
    return []

In [None]:
#GibbsSampler_nTime(dna_list, 15, 20, 2000, 50)