# Plot the retrieved motifs

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import os
from os.path import isfile, join
from itertools import compress
import itertools
import pandas as pd
import seaborn as sns
import matplotlib.backends.backend_pdf

### helper functions

In [2]:
def generate_kmer_inx(k=None):
    """Map every DNA k-mer to a unique integer index.

    Bases are encoded as A=0, C=1, G=2, T=3 and a k-mer is read as a base-4
    number whose FIRST character is the least-significant digit
    (for k=2: 'CA' -> 1, 'AC' -> 4).

    Parameters
    ----------
    k : int, optional
        Length of the k-mers. When omitted, the module-level variable ``l``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    dict
        ``{kmer: index}`` with ``4**k`` entries covering ``0 .. 4**k - 1``.
    """
    if k is None:
        # Fall back to the notebook-level core length defined in a later cell.
        k = l
    vals = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    kmer_inx = {}
    for p in itertools.product(vals, repeat=k):
        # Position j contributes digit * 4**j (little-endian base-4 number).
        kmer_inx[''.join(p)] = sum(vals[base] * 4**j for j, base in enumerate(p))
    return kmer_inx

In [3]:
def read_params(files, directory=None):
    """Load parameter vectors from whitespace-delimited text files.

    Each file is read with ``np.loadtxt``; the last three entries of the
    loaded array are then replaced by their exponential (presumably they are
    stored in log space -- confirm against the code that writes the files).

    Parameters
    ----------
    files : iterable of str
        File names relative to ``directory``.
    directory : str, optional
        Folder containing the files. When omitted, the module-level variable
        ``param_dir`` is used, so the original one-argument call keeps working.

    Returns
    -------
    list of numpy.ndarray
        One array per input file, in the same order as ``files``.
    """
    if directory is None:
        # Fall back to the notebook-level default defined in a later cell.
        directory = param_dir

    params = []
    for name in files:
        param = np.loadtxt(join(directory, name))
        # Exponentiate the trailing three entries in place.
        param[-3:] = np.exp(param[-3:])
        params.append(param)

    return params

In [4]:
# Core length: both cores (l_A = l_B) span 3 nucleotides.
l = 3
# Persistence length, also 3 nucleotides.
l_p = 3

# Forward lookup (k-mer string -> index) and its inverse (index -> k-mer string).
kmer_inx = generate_kmer_inx()
inx_kmer = dict(zip(kmer_inx.values(), kmer_inx.keys()))

### read files

In [5]:
# Folder holding the fitted-parameter text files.
param_dir = 'param'

# Keyword selecting which dataset's files to load; alternatives are kept
# commented out for quick switching.
#keyw = 'TAG_TAG_old'
#keyw = 'HNRNPA0_0t4_5000'
keyw = 'KHDRBS2_0t4_5000'

# os.listdir returns entries in arbitrary order. Sort them so that the row
# order of the loaded parameters -- and therefore which runs the plotting cell
# picks by position -- is reproducible between sessions and machines.
param_files = sorted(
    f for f in os.listdir(param_dir)
    if isfile(join(param_dir, f)) and keyw in f
)

# Split the matching files by the optimizer tag embedded in the file name.
adam_files = [f for f in param_files if 'ADAM' in f]
lbfgs_files = [f for f in param_files if 'LBFGS' in f]

In [6]:
# Report how many ADAM parameter files matched, then load their vectors.
print('number of files: %d' % (len(adam_files),))
adam_params = read_params(adam_files)

number of files: 27


### plotting

In [7]:
# Structure the parameter vectors as a pandas DataFrame: one row per loaded
# file, columns laid out as [core I energies | core II energies | sf, D, sig].

# Number of k-mer energies per core, derived from the lookup table so the
# split stays correct if the core length `l` changes (64 only holds for l=3).
n_kmers = len(inx_kmer)

# K-mer labels in index order; the same labels are used for both cores.
kmer_names = [inx_kmer[i] for i in range(n_kmers)]
colnames = kmer_names + kmer_names + ['sf', 'D', 'sig']

data = pd.DataFrame(adam_params, columns=colnames)

# Per-core views with columns ordered by ascending median value across rows.
core1 = data.iloc[:, :n_kmers]
core1 = core1.loc[:, core1.median().sort_values().index]
core2 = data.iloc[:, n_kmers:2 * n_kmers]
core2 = core2.loc[:, core2.median().sort_values().index]

In [None]:
# Write a two-page PDF summary: (1) box plots of the k-mer energies of both
# cores, (2) histograms of sf / D / sig and per-run energy distributions.

# Number of k-mer energies per core; the columns of `data` are laid out as
# [core I energies | core II energies | sf, D, sig].
n_kmers = len(inx_kmer)

# Using PdfPages as a context manager guarantees the file is finalized and
# closed even if one of the plotting calls raises.
with matplotlib.backends.backend_pdf.PdfPages("summary_%s.pdf"%keyw) as pdf:

    # Page 1: motifs (one box per k-mer, spread taken across the loaded runs).
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 12))
    plt.subplots_adjust(hspace=0.3)

    for ax, core, title in ((ax1, core1, 'Core I'), (ax2, core2, 'Core II')):
        core.boxplot(ax=ax, rot=90)
        ax.set_title(title, fontsize=12)
        ax.set_ylabel('binding energy (KbT)', fontsize=12)

    pdf.savefig(fig)

    # Page 2: distance attributes (top row) and energy distributions of the
    # first few runs (middle row: core I, bottom row: core II).
    fig, axes = plt.subplots(3, 3, figsize=(10, 10))
    plt.subplots_adjust(hspace=0.3)

    for ax, name in zip(axes[0], ('sf', 'D', 'sig')):
        ax.hist(data[name])
        ax.set_title(name)

    # Show at most three runs; clamp so fewer than three loaded files does not
    # raise an IndexError (unused axes are simply left empty).
    n_shown = min(3, len(data))
    for i in range(n_shown):
        axes[1][i].hist(data.iloc[i, :n_kmers], color='#85adad')
        axes[1][i].set_title('#%d core I'%(i+1))

        axes[2][i].hist(data.iloc[i, n_kmers:2 * n_kmers], color='#ff6666')
        axes[2][i].set_title('#%d core II'%(i+1))

    pdf.savefig(fig)

In [None]:
# Display the three trailing (non-k-mer) parameter columns for every run.
data.loc[:, ['sf', 'D', 'sig']]