In [21]:
%matplotlib inline

import os as os
import json as js
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

# Apply the seaborn style first: its style presets carry their own
# 'font.family' / 'font.sans-serif' entries, so calling set_style after the
# assignments below would replace the fonts selected here.
sns.set_style('darkgrid')

# Global matplotlib text settings used by every figure created below.
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Verdana', 'Tahoma']
rcParams['font.size'] = 16
# Write text as text elements in SVG output instead of glyph outlines.
rcParams['svg.fonttype'] = 'none'

# figure size, passed to fig.set_size_inches in make_boxplot
fig_width = 7
fig_height = 5

# ======
# Cell type annotation: a mapping whose keys are cell/tissue names and whose
# values are containers of names that count as a match for that key (this is
# how the sort_* functions below query it).
# The file is opened in a context manager so the handle is closed right away.
with open('/home/pebert/work/code/mpggit/creepiest/config/annotations/encode_ctype_match.json', 'r') as ctype_file:
    ctype = js.load(ctype_file)

# Input folders holding the correlation JSON files, one sub-tree per
# comparison type.
input_dir = '/home/pebert/temp/creepiest/figures/sigcorr/input'
species_corr = os.path.join(input_dir, 'species_corr')
cell_corr = os.path.join(input_dir, 'cell_corr')
project_corr = os.path.join(input_dir, 'project_corr')

# Output folder and file name template (without extension); the template is
# filled with: target species, source species, mark, region of interest.
output_dir = '/home/pebert/temp/creepiest/figures/sigcorr/output'
outname = 'corr_histsig_{}_from_{}_{}_roi_{}'

# Human-readable names for the region-of-interest codes used in file names.
roilabels = {'cgi': 'CpG islands', 'venh': 'VISTA enhancer', 'fenh': 'FANTOM enhancer'}

# ======

def sort_same_input_data(folder):
    """
    Collect Pearson correlation values from all JSON files below ``folder``.

    The directory tree is walked recursively. For every ``*.json`` file the
    name of the directory containing it is used as the condition, and the
    underscore-separated file name supplies the region of interest (field 1),
    the mark (field 5) and the two cell names (fields 4 and 10). All
    ``c[1]['pearson']['stat']`` values found under the file's
    ``'correlations'`` entry are pooled per (condition, mark, roi).

    :param folder: path of the directory tree to scan
    :return: dict mapping ``(condition, mark, roi)`` to a flat list of
        correlation values
    :raises AssertionError: if ``folder`` is not a directory, or if one of
        the two cell names in a file name is not a key of ``ctype``
    """
    assert os.path.isdir(folder), 'Folder does not exist: {}'.format(folder)
    plotdata = dict()
    for root, _, files in os.walk(folder):
        # name of the innermost directory identifies the condition
        condition = os.path.basename(root)
        for fn in files:
            if not fn.endswith('.json'):
                continue
            # context manager closes the handle instead of leaking one per file
            with open(os.path.join(root, fn), 'r') as infile:
                corrdata = js.load(infile)['correlations']
            parts = fn.split('_')
            from_cell, to_cell = parts[4], parts[10]
            assert from_cell in ctype and to_cell in ctype, 'Unknown cell: {} or {}'.format(from_cell, to_cell)
            roi = parts[1]
            mark = parts[5]
            stats = [c[1]['pearson']['stat'] for c in corrdata]
            # create the list on first sight of the key, then pool the values
            plotdata.setdefault((condition, mark, roi), []).extend(stats)
    return plotdata
    

def sort_cross_input_data(folder):
    """
    Collect Pearson correlation values, split into matched and unmatched.

    The directory tree is walked recursively. For every ``*.json`` file the
    name of the directory containing it is used as the condition. The
    underscore-separated file name supplies the region of interest (field 1),
    source tissue (field 4), mark (field 5) and target tissue (field 10);
    the dot-separated file name supplies the mapped tissue (field 4).

    A file is skipped when its target tissue is not listed in
    ``ctype[map_tissue]``. Otherwise its ``c[1]['pearson']['stat']`` values
    go to ``'match'`` if the mapped tissue is listed in
    ``ctype[src_tissue]`` and to ``'unmatch'`` if not. The JSON content is
    only read for files that are not skipped.

    :param folder: path of the directory tree to scan
    :return: dict mapping ``(condition, mark, roi)`` to a dict with the two
        keys ``'match'`` and ``'unmatch'``, each a flat list of values; a
        key is present (possibly with empty lists) for every JSON file name
        seen, including skipped files
    :raises AssertionError: if ``folder`` is not a directory
    :raises KeyError: if a tissue taken from a file name is not in ``ctype``
    """
    assert os.path.isdir(folder), 'Folder does not exist: {}'.format(folder)
    plotdata = dict()
    for root, _, files in os.walk(folder):
        # name of the innermost directory identifies the condition
        condition = os.path.basename(root)
        for fn in files:
            if not fn.endswith('.json'):
                continue
            name_fields = fn.split('_')
            src_tissue = name_fields[4]
            trg_tissue = name_fields[10]
            mark = name_fields[5]
            roi = name_fields[1]
            # Register the key before the skip check so that skipped files
            # still yield an (empty) entry, as callers index by this key.
            groups = plotdata.setdefault((condition, mark, roi), {'match': [], 'unmatch': []})
            map_tissue = fn.split('.')[4]
            if trg_tissue not in ctype[map_tissue]:
                continue
            # Both lists always exist, so no KeyError fallback is needed.
            label = 'match' if map_tissue in ctype[src_tissue] else 'unmatch'
            with open(os.path.join(root, fn), 'r') as infile:
                corrdata = js.load(infile)['correlations']
            groups[label].extend([c[1]['pearson']['stat'] for c in corrdata])
    return plotdata


def make_boxplot(condition, mark, roi, species, match, unmatch, project):
    """
    Draw one box plot of correlation values and save it as PNG and SVG.

    One box is drawn per data set, in the order ``species``, ``match``,
    ``unmatch`` and, if given, ``project``. The figure is written to
    ``output_dir`` using the ``outname`` template and closed afterwards.

    :param condition: string of the form ``'<to_species>_from_<from_species>'``
    :param mark: mark name, used in the title and the output file name
    :param roi: region-of-interest code; must be a key of ``roilabels``
    :param species: values for the first box
    :param match: values for the second box
    :param unmatch: values for the third box
    :param project: values for an optional fourth box, or None to omit it
    :return: None
    """
    to_species, from_species = condition.split('_from_')
    fig_title = '{} cons. sig. in {} - {} from {}'.format(mark, roilabels[roi], to_species, from_species)
    tick_fontsize = 14
    label_fontsize = 16

    # Boxes and their tick labels are built together so the two can never
    # get out of step.
    plot_boxes = [species, match, unmatch]
    tick_labels = ['{} <!> {}'.format(to_species, to_species),
                   '{} <=> {}'.format(to_species, from_species),
                   '{} <!> {}'.format(to_species, from_species)]
    if project is not None:
        plot_boxes.append(project)
        # NOTE(review): the fourth box previously had no label at all; the
        # wording of this label is a suggestion - confirm or adjust.
        tick_labels.append('{} <proj> {}'.format(to_species, from_species))
    positions = list(range(1, len(plot_boxes) + 1))

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(fig_width, fig_height)

    ax.set_title(fig_title, fontsize=label_fontsize)
    ax.set_ylim(-0.5, 1.05)
    ax.set_ylabel('Pearson correlation\nper chromosome', fontsize=label_fontsize)

    boxprops = dict(linestyle='solid', linewidth=1, color='black')
    medianprops = dict(linestyle='solid', linewidth=1, color='black')
    whisprops = dict(linestyle='solid', linewidth=1, color='black')
    boxes = ax.boxplot(plot_boxes, positions=positions, widths=0.5, patch_artist=True, showfliers=False,
                       boxprops=boxprops, medianprops=medianprops, whiskerprops=whisprops, notch=False)
    for pat, col in zip(boxes['boxes'], ['darkgreen', 'red', 'blue', 'orange']):
        pat.set_facecolor(col)
        pat.set_alpha(0.5)

    # Visibility switches are given as booleans: the strings 'on'/'off' are
    # no longer interpreted by matplotlib and any non-empty string is truthy,
    # which would turn every "off" into "on".
    ax.tick_params(axis='x', direction='out', color='black', bottom=False, labelsize=tick_fontsize,
                   top=False, width=1, length=6, labelbottom=True, labeltop=False)
    ax.set_xticklabels(tick_labels, fontsize=label_fontsize)

    ax.tick_params(axis='y', direction='out', color='black', left=False, labelsize=tick_fontsize,
                   right=False, width=1, length=6, labelleft=True, labelright=False)
    #plt.show()
    # The path stem is the same for both formats, so build it once.
    outpath = os.path.join(output_dir, outname.format(to_species, from_species, mark, roi))
    try:
        for ext in ('.png', '.svg'):
            fig.savefig(outpath + ext, dpi=300, bbox_inches='tight', pad_inches=0.5)
    finally:
        # release the figure even if writing one of the files fails
        plt.close(fig)

# Gather the pooled correlation values for both comparison types.
plotsame = sort_same_input_data(species_corr)
plotcross = sort_cross_input_data(cell_corr)

# One figure per (condition, mark, roi) key found in the same-input data;
# the cross-input data is looked up under the identical key.
for (condition, mark, roi), same_values in plotsame.items():
    same_arr = np.array(same_values, dtype=np.float32)
    cross_values = plotcross[(condition, mark, roi)]
    match_arr = np.array(cross_values['match'], dtype=np.float32)
    unmatch_arr = np.array(cross_values['unmatch'], dtype=np.float32)
    # no fourth box is drawn here, hence None for the last argument
    make_boxplot(condition, mark, roi, same_arr, match_arr, unmatch_arr, None)
print('Done')



    
    

Done
