In [76]:
%matplotlib inline

import os as os
import collections as col
import itertools as itt
import pickle as pck
import time as ti

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

import numpy as np
import numpy.random as rng
import scipy.stats as stats
import pandas as pd
import seaborn as sns

# What does this do?
# Plot boxplots of HSP scores
# (not) overlapping with genes
# and regulatory regions
#
# Layout of this cell: settings and paths first, then the column
# names of the input tables, then the caching and plotting
# functions, and finally the calls guarded by the run_* switches.

# Date stamp used as prefix for the cache file names and figure file names
date = '20180403'

# Switches: set to False to skip the respective plotting routine
run_plot_hsp_gene_ovl = True
run_plot_hsp_ensreg_ovl = True

# If False, figures are created and closed again without being written to disk
save_figures = True

# Global plot styling applied to all figures created below
sns.set(style='white',
        font_scale=1.5,
        rc={'font.family': ['sans-serif'],
            'font.sans-serif': ['DejaVu Sans']})

# Root folder of the project data and folder for the HDF5 cache files
fhgfs_base = '/TL/deep/fhgfs/projects/pebert/thesis/projects/statediff'
cache_dir = os.path.join(fhgfs_base, 'caching/notebooks')

# Input folders: the two *_isect folders hold the overlap tables that are
# read by the caching functions, bed_gene_folder holds the DESeq2 BED files
# read by get_gene_counts; de_gene_folder is not used in this cell
hsp_gene_ovl_folder = os.path.join(fhgfs_base, 'bedtools/deep/gene_isect')
hsp_ensreg_ovl_folder = os.path.join(fhgfs_base, 'bedtools/deep/ensreg_isect')
de_gene_folder = os.path.join(fhgfs_base, 'deseq/deep')
bed_gene_folder = os.path.join(fhgfs_base, 'deseq/bed_out')

# Output folders for figures; only fig_collect is written to in this cell
base_out = '/TL/deep-external01/nobackup/pebert/cloudshare/mpiinf/phd/chapter_projects/statediff'
fig_supp = os.path.join(base_out, 'figures', 'pub', 'supp')
fig_main = os.path.join(base_out, 'figures', 'pub', 'main')
fig_collect = os.path.join(base_out, 'figures', 'pub', 'collection')
                   
# Column names of the HSP part of the overlap tables (prefix 'hsp_')
hsp_header = ['hsp_' + column for column in [
    'chrom', 'start_bp', 'end_bp', 'name',
    'norm_nat_score', 'norm_nat_score_std',
    'segment_pv', 'summed_pv', 'start_bin',
    'end_bin', 'group1', 'group2', 'num_bins',
    'num_bootstraps', 'num_comparisons',
    'num_merged', 'raw_score', 'score_pct', 'score_rank',
]]

# Column names of the DESeq2 gene part (prefix 'de_')
deseq_header = ['de_' + column for column in [
    'chrom', 'start', 'end', 'name',
    'log2fc', 'strand', 'symbol', 'pv_adj',
]]

# Full header of the HSP/gene overlap tables: HSP columns, file
# identifier, gene columns and the overlap column
gene_table_header = hsp_header + ['fileid'] + deseq_header + ['overlap']

# Column names of the regulatory region part (prefix 'rgb_')
ensreg_header = ['rgb_' + column for column in [
    'chrom', 'start', 'end', 'name', 'score', 'strand', 'feature',
]]

# Full header of the HSP/regulatory region overlap tables
ensreg_table_header = hsp_header + ensreg_header + ['overlap']
    

def get_gene_counts(comparison):
    """
    Count the records in the two DESeq2 gene BED files of a comparison.

    :param comparison: label of the comparison; inserted into the
     file names 'deseq2_{}_diff_body.bed' and 'deseq2_{}_stable_body.bed'
     located in bed_gene_folder
    :return: tuple (number of rows in the "diff" file,
     number of rows in the "stable" file); the first line of each
     file is read as header and is not counted
    """
    file_templates = ('deseq2_{}_diff_body.bed', 'deseq2_{}_stable_body.bed')
    row_counts = []
    for template in file_templates:
        bed_path = os.path.join(bed_gene_folder, template.format(comparison))
        gene_table = pd.read_csv(bed_path, sep='\t', header=0)
        row_counts.append(gene_table.shape[0])
    num_diff, num_stable = row_counts
    return num_diff, num_stable
    

def cache_ensreg_ovl_data(rootfolder, cache_file):
    """
    Read all HSP/regulatory region overlap tables in a folder and store
    a reduced version of each table in a (newly created) HDF5 file.

    Every file in rootfolder is read as tab-separated table without
    header line (columns named as in ensreg_table_header). Rows with an
    overlap of 0 are assigned the feature 'empty'. Per feature, each HSP
    (column 'hsp_name') is kept only once. Only the columns 'hsp_name',
    'hsp_norm_nat_score' and 'rgb_feature' are stored.

    The HDF key of a table is 'tool/comparison/scoring', all three parts
    being taken from the underscore-separated file name (before the
    first dot): part 5 is the tool, parts 6-8 form the comparison and
    the last part is the scoring.

    :param rootfolder: folder containing the overlap tables
    :param cache_file: path of the HDF5 file; an existing file is overwritten
    :return: cache_file, i.e., the same value as returned by
     cache_gene_ovl_data (previously, the HDF key of the last table
     was returned, which failed for an empty folder)
    """
    score_columns = ['hsp_name', 'hsp_norm_nat_score', 'rgb_feature']

    with pd.HDFStore(cache_file, 'w') as hdf:

        for table in os.listdir(rootfolder):
            infos = table.split('.')[0].split('_')
            tool, scoring = infos[4], infos[-1]
            comparison = '_'.join(infos[5:8])
            fpath = os.path.join(rootfolder, table)
            df = pd.read_csv(fpath, sep='\t', names=ensreg_table_header, header=None)
            # HSPs that do not overlap any region form their own category
            df.loc[df['overlap'] == 0, 'rgb_feature'] = 'empty'

            # an HSP may overlap several regions of the same type,
            # but its score must enter each category only once
            uniq = []
            for ft in df['rgb_feature'].unique():
                sub = df.loc[df['rgb_feature'] == ft, score_columns].copy()
                sub.drop_duplicates(subset=['hsp_name'], inplace=True)
                uniq.append(sub)
            uniq = pd.concat(uniq, axis=0, ignore_index=False)

            # HDF keys are always separated by '/', independent of the
            # path separator of the operating system
            cache_path = '/'.join((tool, comparison, scoring))
            hdf.put(cache_path, uniq, format='table')
    return cache_file

    
def _assign_quartile_groups(frame, value_col, group_col):
    """
    Label the rows of frame in-place according to the quartiles of value_col.

    Rows below the 25th percentile get group 1, rows above the 75th
    percentile get group 3, and all rows still carrying the initial
    label -1 afterwards get group 2.

    :param frame: DataFrame to be modified; group_col is expected
     to be initialized with -1
    :param value_col: name of the numeric column to be stratified
    :param group_col: name of the column receiving the group label
    :return: None
    """
    if frame.empty:
        # no rows to label, and percentiles of no data are not meaningful
        return
    lower, upper = np.percentile(frame[value_col], [25, 75])
    frame.loc[frame[value_col] < lower, group_col] = 1
    frame.loc[frame[value_col] > upper, group_col] = 3
    frame.loc[frame[group_col] == -1, group_col] = 2
    return


def cache_gene_ovl_data(rootfolder, cache_file):
    """
    Read all HSP/gene overlap tables in a folder and store a reduced
    and annotated version of each table in a (newly created) HDF5 file.

    Every file in rootfolder is read as tab-separated table without
    header line (columns named as in gene_table_header). The stored
    table has the columns 'hsp_name', 'hsp_norm_nat_score', 'fileid',
    'de_name', 'overlap', 'gene_length' plus

    - 'de_group': 0 for rows with fileid 'STABLE'; 1, 2 or 3 for rows
      with fileid 'DIFF' (absolute log2 fold change below the 25th
      percentile, in between, above the 75th percentile); -1 otherwise
    - 'len_group': same coding, based on 'gene_length'
    - 'total_num': number of DIFF/STABLE genes as returned by
      get_gene_counts for DIFF/STABLE rows; number of unique HSP names
      in the table for all other rows

    The HDF key of a table is 'tool/comparison/scoring', all three parts
    being taken from the underscore-separated file name (before the
    first dot): part 5 is the tool, parts 6-8 form the comparison and
    the last part is the scoring.

    :param rootfolder: folder containing the overlap tables
    :param cache_file: path of the HDF5 file; an existing file is overwritten
    :return: cache_file
    """
    keep_columns = ['hsp_name', 'hsp_norm_nat_score',
                    'fileid', 'de_name', 'de_log2fc',
                    'overlap', 'gene_length']

    with pd.HDFStore(cache_file, 'w') as hdf:

        for table in os.listdir(rootfolder):
            infos = table.split('.')[0].split('_')
            tool, scoring = infos[4], infos[-1]
            comparison = '_'.join(infos[5:8])
            num_diff, num_stable = get_gene_counts(comparison)
            fpath = os.path.join(rootfolder, table)
            df = pd.read_csv(fpath, sep='\t', names=gene_table_header, header=None)
            df['gene_length'] = df['de_end'] - df['de_start']
            df = df.loc[:, keep_columns].copy()
            df['de_group'] = -1
            df['len_group'] = -1
            df['total_num'] = df['hsp_name'].unique().size
            is_stable = df['fileid'] == 'STABLE'
            df.loc[is_stable, 'de_group'] = 0
            df.loc[is_stable, 'len_group'] = 0

            # treat DE genes separately for convenience: only for these
            # rows, the fold change is converted to a number
            diffs = df.loc[df['fileid'] == 'DIFF', :].copy()
            diffs['de_log2fc'] = diffs['de_log2fc'].astype(np.float32).abs()
            _assign_quartile_groups(diffs, 'de_log2fc', 'de_group')

            # same for gene length
            _assign_quartile_groups(diffs, 'gene_length', 'len_group')

            # merge data back together; the fold change is no longer
            # needed and has different types in both parts
            diffs.drop(['de_log2fc'], axis=1, inplace=True)
            df.drop(['de_log2fc'], axis=1, inplace=True)
            df = df.loc[df['fileid'] != 'DIFF', :].copy()
            df = pd.concat([df, diffs], axis=0, ignore_index=False)
            df.reset_index(drop=True, inplace=True)
            df.loc[df['fileid'] == 'DIFF', 'total_num'] = num_diff
            df.loc[df['fileid'] == 'STABLE', 'total_num'] = num_stable

            # HDF keys are always separated by '/', independent of the
            # path separator of the operating system
            cache_path = '/'.join((tool, comparison, scoring))
            hdf.put(cache_path, df, format='table')

    return cache_file
    

def create_boxplot(data, title, data_group, x_label):
    """
    Draw one box of HSP scores per category of the selected grouping.

    :param data: DataFrame with the column 'hsp_norm_nat_score' and
     with the column 'rgb_feature' (if data_group is 'ensreg') or
     with the column named by data_group (otherwise)
    :param title: title of the figure
    :param data_group: 'ensreg' to group by regulatory feature;
     otherwise the name of a column holding the group codes -1 to 3
    :param x_label: label of the x-axis
    :return: tuple (figure, empty list); the callers hand the list
     over to savefig as additional artists
    """
    if data_group == 'ensreg':
        column = 'rgb_feature'
        categories = ['empty', 'ctcf', 'open', 'tfbs', 'enh', 'flank', 'prom']
        tick_labels = ['HSPs\nw/o ovl.', 'CTCF', 'Open\nchromatin', 'TFBS',
                       'Enhancer', 'Promoter\nflanking', 'Promoter']
    else:
        column = data_group
        categories = [-1, 0, 1, 2, 3]
        tick_labels = ['HSPs\nw/o ovl.', 'Non-DE\ngenes', 'DE genes\n(bottom 25%)',
                       'DE genes\n(mid 50%)', 'DE genes\n(top 25%)']
    scores_per_category = [data.loc[data[column] == category, 'hsp_norm_nat_score'].values
                           for category in categories]

    # box, whiskers and caps share one line style, the median is drawn lighter
    outline = {'color': 'dimgrey', 'linewidth': 2}
    median_line = {'color': 'grey', 'linewidth': 2}

    fig, ax = plt.subplots(figsize=(10, 8))
    # sym="": do not draw outlier points
    ax.boxplot(scores_per_category, sym="", labels=tick_labels,
               medianprops=median_line, boxprops=dict(outline),
               whiskerprops=dict(outline), capprops=dict(outline))
    ax.set_xlabel(x_label)
    ax.set_ylabel('HSP scores')
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    # move the title slightly upwards
    ax.set_title(title).set_position([0.5, 1.02])
    return fig, []
    
            
def plot_hsp_gene_ovl():
    """
    Create boxplots of the scores of HSPs overlapping genes.

    The data are read from an HDF5 cache file in cache_dir. The cache is
    (re)built from the tables in hsp_gene_ovl_folder if the file does
    not exist or is smaller than 10e6 bytes. For each key of the cache
    belonging to the segmentation 'cmm18', two figures are created:
    genes stratified by fold change ('de_group') and by gene body
    length ('len_group'). If save_figures is set, each figure is saved
    as SVG, PDF and PNG (300 dpi) in fig_collect.

    :return: 0
    """
    cache_file = os.path.join(cache_dir, '{}_plot_hsp_gene_ovl.h5'.format(date))
    # a missing or very small file is treated as missing/incomplete cache
    if not os.path.isfile(cache_file) or os.stat(cache_file).st_size < 10e6:
        _ = cache_gene_ovl_data(hsp_gene_ovl_folder, cache_file)
    else:
        print('Assuming cache file is valid')

    xlabels = {'de_group': 'DE genes stratified by absolute fold change',
               'len_group': 'DE genes stratified by gene body length'}
    # file extension and format-specific savefig arguments
    output_formats = (('.svg', {}), ('.pdf', {}), ('.png', {'dpi': 300}))

    with pd.HDFStore(cache_file, 'r') as hdf:
        for k in hdf.keys():
            # keys have the form /segmentation/comparison/scoring
            _, seg, comp, scoring = k.split('/')
            if seg != 'cmm18':
                continue
            c1, c2 = comp.split('_vs_')
            fig_title = 'Scores of HSPs overlapping genes: {} vs {} ({} {} scoring)'.format(c1, c2, seg.upper(), scoring)
            # load the table only once for both groupings
            scores = hdf[k]
            for grouping in ['de_group', 'len_group']:
                fig, exart = create_boxplot(scores, fig_title, grouping, xlabels[grouping])
                if save_figures:
                    suffix = grouping.split('_')[0].upper()
                    outname = '{}_fig_X_hspscore_genes_{}_{}_{}_vs_{}_{}'.format(date, seg, scoring, c1, c2, suffix)
                    # the documented savefig keyword for additional artists
                    # is bbox_extra_artists; it is only passed if there are
                    # any, so that the default is used otherwise
                    save_args = {'bbox_inches': 'tight'}
                    if exart:
                        save_args['bbox_extra_artists'] = exart
                    for extension, format_args in output_formats:
                        out_path = os.path.join(fig_collect, outname + extension)
                        fig.savefig(out_path, **dict(save_args, **format_args))
                plt.close(fig)
    return 0


def plot_hsp_ensreg_ovl():
    """
    Create boxplots of the scores of HSPs overlapping regulatory regions.

    The data are read from an HDF5 cache file in cache_dir. The cache is
    (re)built from the tables in hsp_ensreg_ovl_folder if the file does
    not exist or is smaller than 10e6 bytes. For each key of the cache
    belonging to the segmentation 'cmm18', one figure is created. If
    save_figures is set, the figure is saved as SVG, PDF and PNG
    (300 dpi) in fig_collect.

    :return: 0
    """
    cache_file = os.path.join(cache_dir, '{}_plot_hsp_ensreg_ovl.h5'.format(date))
    # a missing or very small file is treated as missing/incomplete cache
    if not os.path.isfile(cache_file) or os.stat(cache_file).st_size < 10e6:
        _ = cache_ensreg_ovl_data(hsp_ensreg_ovl_folder, cache_file)
    else:
        print('Assuming cache file is valid')

    # file extension and format-specific savefig arguments
    output_formats = (('.svg', {}), ('.pdf', {}), ('.png', {'dpi': 300}))

    with pd.HDFStore(cache_file, 'r') as hdf:
        for k in hdf.keys():
            # keys have the form /segmentation/comparison/scoring
            _, seg, comp, scoring = k.split('/')
            if seg != 'cmm18':
                continue
            c1, c2 = comp.split('_vs_')
            fig_title = 'Scores of HSPs overlapping regulatory regions: {} vs {} ({} {} scoring)'.format(c1, c2, seg.upper(), scoring)
            fig, exart = create_boxplot(hdf[k], fig_title, 'ensreg', 'Ensembl regulatory build regions')
            if save_figures:
                outname = '{}_fig_X_hspscore_ensreg_{}_{}_{}_vs_{}'.format(date, seg, scoring, c1, c2)
                # the documented savefig keyword for additional artists
                # is bbox_extra_artists; it is only passed if there are
                # any, so that the default is used otherwise
                save_args = {'bbox_inches': 'tight'}
                if exart:
                    save_args['bbox_extra_artists'] = exart
                for extension, format_args in output_formats:
                    out_path = os.path.join(fig_collect, outname + extension)
                    fig.savefig(out_path, **dict(save_args, **format_args))
            plt.close(fig)
    return 0
     
    
# Execute the plotting routines that are switched on, genes first
for enabled, plot_routine in [(run_plot_hsp_gene_ovl, plot_hsp_gene_ovl),
                              (run_plot_hsp_ensreg_ovl, plot_hsp_ensreg_ovl)]:
    if enabled:
        plot_routine()


Assuming cache file is valid
Assuming cache file is valid
