In [1]:
from pprint import pprint
import numpy as np
import random
import motif_ops
import seq_ops
import seq_tools
import seqlogo
from scipy.special import softmax


  from pkg_resources import DistributionNotFound, get_distribution


In [2]:
#The whole function

#read sequences from file
k = 8
ic_list=[]

seqs = seq_tools.read_file("./data/only_peaks.bed")


#randomly choose motif start sites
motif_starts = seq_tools.build_motif_starts(seqs, k)


#Intialize pfm, just the first time.

pfm = motif_ops.build_pfm(seq_tools.get_kmer_list(seqs, motif_starts, k), k)

# LOOP
i = 0 # count number of iterations
converged = False
max_iterations = 5000

while (converged != True) and (i <= max_iterations):
    
    #save a copy for next time to compare for convergence
    pfm_old = pfm.copy()
    
    # select kmer to score
    pick = random.randrange(len(motif_starts))
    
    
    # remove chosen kmer to score from pfm
    
    seq_tools.change_pfm(seqs[pick][motif_starts[pick]:motif_starts[pick]+k], k, "sub", pfm)
    
    
    #remove chosen kmer from list
    
    removed_seq = seqs.pop(pick)
    removed_index = motif_starts.pop(pick)
    
    
    #build pwm
    
    pwm = motif_ops.build_pwm(pfm)
    
    
    #get reverse complement
    rev_seq = seq_ops.reverse_complement(removed_seq)
    
    
    #score kmers
    
    kmer_scores = []
    #score forward seq
    for x in range(len(removed_seq) - k):
        score = motif_ops.score_kmer(removed_seq[x:x+k], pwm)
        kmer_scores.append(score)
    
    #score reverse seq
    for x in range(len(removed_seq) - k):
        score = motif_ops.score_kmer(rev_seq[x:x+k], pwm)
        kmer_scores.append(score)
    
    
    # choose "best" motif score using softmax calculation
    prob = softmax(kmer_scores)
    
    new_idx = np.random.choice(np.arange(len(kmer_scores)), p=prob)
    
    
    # use modulo to determine if chosen motif is on forward or reverse sequence
    quotient, remainder = divmod(new_idx, len(removed_seq) - k - 1)
    if quotient == 0: # forward sequence
        new_motif = removed_seq[remainder:remainder+k]
        seq_to_add_back = removed_seq
    else: # reverse sequence
        new_motif = rev_seq[remainder:remainder+k]
        seq_to_add_back = rev_seq
    
    
    # Insert the sequence and new index
    seqs.append(seq_to_add_back)
    motif_starts.append(remainder)
    
    # add new better motif to pfm
    
    seq_tools.change_pfm(new_motif, k, "add", pfm)
    
    # calculate information content and check for convergence
    threshold = 0.0000000005
    new_ic = motif_ops.pfm_ic(pfm)
    old_ic = motif_ops.pfm_ic(pfm_old)
    ic_diff = abs(new_ic-old_ic)
    ic_list.append(new_ic)
    
    if ic_diff < threshold:
        print(f"\rIteration {i}, we converged with IC of {ic_diff}", end="", flush=True)        
        #converged = True
    else: 
        print(f"\rIteration {i}, did not converge yet with IC of {ic_diff}", end="", flush=True)
    i += 1
    


Iteration 5000, did not converge yet with IC of 0.00304161492000343265

In [None]:
#view pfm
norm_pfm = pfm / pfm.sum(axis = 1, keepdims = True)
pprint(norm_pfm.T)

In [None]:
seqlogo.seqlogo(seqlogo.CompletePm(pfm = norm_pfm.T), ic_scale=False)

In [None]:
seqlogo.seqlogo(seqlogo.CompletePm(pfm = norm_pfm.T), ic_scale=True)

In [None]:
#Visualize IC content over iterations

import matplotlib.pyplot as plt

plt.plot(ic_list)
plt.xlabel('Iterations')
plt.ylabel('Score')
plt.title('IC Content over time')
plt.show()