In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [None]:
# Name of the promoter-set folder under ../../data/output/ that this notebook works on.
file_names = "non-overlapping_includingbidirectional_all_genes_newannotation"
# Prefix prepended to the file names of exported plots.
output_prefix = "czechowski_TAU"


In [None]:
#location of files
# constitutive / variable / random gene subset table
czechowski = '../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/genes/promoters_5UTR_czechowski_constitutive_variable_random.txt'
# table of all filtered genes
czechowski_all = '../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/genes/promoters_5UTR_czechowski_allfilteredgenes.txt'
# TAU table for all filtered genes (the closing quote was missing, which made this cell a SyntaxError)
tau_table = '../../data/output/non-overlapping_includingbidirectional_all_genes_newannotation/genes/tissue_specific/promoters_5UTR_czechowski_allfilteredgenes_TAU.txt'

In [None]:
#make directory for the plots to be exported to
dirName = f'../../data/output/{file_names}/genes/plots'
try:
    # Create target Directory.
    # os.makedirs (rather than os.mkdir) also creates any missing parent
    # directories, so a fresh checkout without the 'genes' folder does not
    # fail with FileNotFoundError. It still raises FileExistsError when the
    # target itself already exists, which is reported below.
    os.makedirs(dirName)
    print("Directory " , dirName ,  " created") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

In [None]:
def all_prom_distribution(df, x_variable, x_label, df2=pd.DataFrame(),df1_label='', df2_label='', labels=False,
                          min_x_constitutive=False, max_x_constitutive=False,min_x_variable=False,max_x_variable=False, save=False):
    """Plot the distribution of one column for one or two sets of promoters.

    Parameters
    ----------
    df : DataFrame containing the column `x_variable`.
    x_variable : name of the column whose distribution is plotted.
    x_label : text used for the x-axis label.
    df2 : optional second DataFrame. When it is empty (the default) or None, only
        `df` is plotted (histogram + KDE). Otherwise both frames are drawn on the
        same axes as KDE curves with rug marks, and a legend is added.
    df1_label, df2_label : legend labels for `df` and `df2`; they are the names of
        the respective gene type subset.
    labels : when truthy, annotate the plot with the 'top 100 constitutive range'
        and 'top 100 variable range' brackets, drawn at y=0.2 in data coordinates.
    min_x_constitutive, max_x_constitutive : x limits of the constitutive bracket.
    min_x_variable, max_x_variable : x limits of the variable bracket.
    save : when truthy, write the figure as a PDF to
        ../../data/output/{file_names}/genes/plots/{output_prefix}_distribution.pdf
        (`file_names` and `output_prefix` are notebook-level variables).

    Returns
    -------
    The matplotlib Figure that holds the plot.
    """
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
    # kept here so the rendered plots do not change.
    if df2 is None or df2.empty:
        #only 1 dataframe provided: create just 1 plot
        ax = sns.distplot(df[x_variable])
    else:
        #2 dataframes provided: plot them on the same axes
        ax = sns.distplot(df[x_variable], hist=False, rug=True, label=df1_label)
        sns.distplot(df2[x_variable], hist=False, rug=True, label=df2_label, ax=ax)
        #create legend
        ax.legend()

    # Label the x axis in both branches (previously x_label was ignored when two
    # dataframes were supplied).
    ax.set_xlabel(x_label)

    # Always hand back / save the Figure. sns.distplot returns an Axes, which has
    # no savefig(), so the two-dataframe branch used to fail when save=True.
    dist_plot_fig = ax.get_figure()

    if labels:
        # Annotate the axes that were actually plotted on; plt.axes() would add a
        # new, empty Axes on top of the figure instead.
        bracket_style = '|-|,widthA=0.2,widthB=0.2'
        #constitutive annotation
        ax.annotate('top 100 constitutive range', xy=(max_x_constitutive, 0.2), xycoords='data', ha='left',
                    xytext=(50, 100), textcoords='offset points',
                    arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.4',
                                    color='black'))
        ax.annotate('', xy=(max_x_constitutive, 0.2),
                    xytext=(min_x_constitutive, 0.2),
                    xycoords='data', textcoords='data',
                    arrowprops={'arrowstyle': bracket_style, 'color': 'blue'})
        #Variable annotation
        ax.annotate('top 100 variable range', xy=(max_x_variable, 0.2), xycoords='data', ha='right',
                    xytext=(0, -20), textcoords='offset points')
        ax.annotate('', xy=(max_x_variable, 0.2),
                    xytext=(min_x_variable, 0.2),
                    xycoords='data', textcoords='data',
                    arrowprops={'arrowstyle': bracket_style, 'color': 'orange'})

    #save to file
    if save:
        dist_plot_fig.savefig(f'../../data/output/{file_names}/genes/plots/{output_prefix}_distribution.pdf', format='pdf')
    return dist_plot_fig

In [None]:
# Load the three tab-separated input tables.
# Constitutive/variable/random subset: no header row in the file.
czechowski_df = pd.read_csv(czechowski, header=None, sep='\t')
# All genes: first row is the header.
czechowski_all_df = pd.read_csv(czechowski_all, header=0, sep='\t')
# TAU table: first row is the header.
tau_df = pd.read_csv(tau_table, header=0, sep='\t')