### Peak annotation results visualisation

Annotation was done with HOMER and ChIPseeker

In [None]:
from pathlib import Path
import matplotlib as mpl

mpl.rcParams['font.sans-serif'] = ['Arial']
mpl.rcParams['font.family'] = 'sans-serif'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

In [None]:
# Master switch: the figure-export cells only write files when this is True.
SAVE_FIGS = True

Drawing helper functions

In [None]:
def add_alpha(rgb_str, alpha_value):
    """Append an alpha channel to a ``#RRGGBB`` colour string.

    Parameters
    ----------
    rgb_str : str
        Colour in hex notation, e.g. ``"#3C5488"``. It is used as given
        (its case is not changed).
    alpha_value : int
        Alpha channel value; must lie within 0-255 so that it fits into
        exactly two hex digits.

    Returns
    -------
    str
        ``rgb_str`` followed by ``alpha_value`` as two upper-case hex digits.

    Raises
    ------
    ValueError
        If ``alpha_value`` is outside 0-255. Such a value cannot be encoded
        in two hex digits and would yield a malformed colour string.
    """
    if not 0 <= alpha_value <= 255:
        raise ValueError(f"alpha_value must be within 0-255, got {alpha_value!r}")
    return rgb_str + f"{alpha_value:02X}"

In [None]:
def generate_color_list(color_mapping, samples, alpha):
    """Return one alpha-augmented colour per sample, in the order of ``samples``.

    Every sample name is looked up in ``color_mapping`` and the colour found
    there is passed to ``add_alpha`` together with ``alpha``.
    """
    colors_with_alpha = []
    for sample_name in samples:
        base_color = color_mapping[sample_name]
        colors_with_alpha.append(add_alpha(base_color, alpha))
    return colors_with_alpha

In [None]:
def generate_alpha_gradient(rgb_str, steps_cnt, step_size):
    """Build ``steps_cnt`` variants of ``rgb_str`` with decreasing alpha.

    The first colour gets alpha 255; each following one is ``step_size``
    lower than its predecessor.
    """
    gradient = []
    for step_idx in range(steps_cnt):
        alpha_level = 255 - step_idx * step_size
        gradient.append(add_alpha(rgb_str, alpha_level))
    return gradient

# Test
# generate_alpha_gradient("#AABBCC", 4, 30)

#### HOMER annotation analysis summary for doughnut plots

In [None]:
# Directory the HOMER annotation file is read from (placeholder - adjust to your setup).
SRC_DIR_PATH = Path(r"/path/to/peaks_anno/homer")

In [None]:
# Directory the exported figures (PDF/JPG) are written to (placeholder - adjust to your setup).
DEST_FIG_DIR_PATH = Path(r"/path/to/peaks_anno/homer")

In [None]:
# Shorter column names without spaces; they are assigned to the
# annotation table once it has been loaded.
col_names = [
    # peak identity and location
    'PeakID',
    'Chr',
    'Start',
    'End',
    'Strand',
    'Peak_Score',
    'Focus_Ratio_Region_Size',
    # annotation
    'Annotation',
    'Detailed_Annotation',
    'Distance_to_TSS',
    # nearest gene identifiers
    'Nearest_PromoterID',
    'Entrez_ID',
    'Nearest_Unigene',
    'Nearest_Refseq',
    'Nearest_Ensembl',
    # gene description
    'Gene_Name',
    'Gene_Alias',
    'Gene_Description',
    'Gene_Type',
]

In [None]:
# HOMER annotation result file (tab-separated), looked up inside SRC_DIR_PATH
anno_file_name = "Example_peaks_homer_anno.tsv"

In [None]:
# skiprows=1 drops the file's first line (presumably HOMER's own header row)
# and header=None stops pandas from treating the next line as a header;
# the short names from col_names are assigned in the following cell.
anno_data = pd.read_csv(SRC_DIR_PATH.joinpath(anno_file_name), sep='\t', header=None, skiprows=1)

In [None]:
# Replace the default integer column labels with the short names.
# Raises if the file does not have exactly len(col_names) columns.
anno_data.columns = col_names

In [None]:
# Preview the first rows to check that column names and values line up.
anno_data.head(3)

The values in the Annotation column may be followed by an identifier in parentheses
(the code below keeps only the text before the first `(`). Let's get rid of it.

In [None]:
def split_anno(anno):
    """Return the part of an annotation label that precedes the first ``(``.

    Parameters
    ----------
    anno : object
        A single value from the annotation column.

    Returns
    -------
    object
        For strings, the text before the first ``(`` (surrounding whitespace
        is kept). Any non-string value, e.g. NaN for a missing annotation,
        is returned unchanged.
    """
    # Only strings can be split; everything else passes through untouched.
    if not isinstance(anno, str):
        return anno
    return anno.split("(")[0]

# Apply split_anno to every value of the Annotation column; the result is a
# pandas Series that may still contain NaN and trailing whitespace.
annotations = anno_data['Annotation'].apply(split_anno)

Get rid of white spaces and NA entries

In [None]:
# Keep only the non-missing labels and trim the whitespace around them.
annotations = [
    raw_label.strip()
    for raw_label in annotations
    if not pd.isna(raw_label)
]

In [None]:
# len(annotations)

Merge "intron" and "exon" into a single "Gene body" category and rename "TTS" to "TES"

In [None]:
# Lookup table for the labels that get renamed; all other labels are kept as they are.
anno_renames = {
    'intron': 'Gene body',
    'exon': 'Gene body',
    'TTS': 'TES',
}
annotations_bis = [anno_renames.get(label, label) for label in annotations]

In [None]:
# Count how often each (merged) annotation label occurs.
ct = Counter(annotations_bis)

In [None]:
# Display the counts per annotation label.
ct

In [None]:
# Plain-dict copy of the counts; its keys and values feed the pie chart.
ct_dict = dict(ct)

In [None]:
# Display the dict version of the counts.
ct_dict

In [None]:
# Color mappings, so we can assign a specific color
# to each data set across the entire project.
# Keys are data set names, values are "#RRGGBB" hex colors.

color_mappings = {
    'Example_Set_1': '#3C5488',
    'Example_Set_2': '#00A087',
    'Example_Set_3': '#E64B35',
    'Example_Set_4': '#8491B4',
}

In [None]:
# One colour per annotation category: the data set's base colour with
# gradually decreasing alpha.
n_categories = len(ct_dict)
# A step of 50 is used for up to 5 categories; for more categories the step
# shrinks so that alpha never falls below 55 (and never becomes negative).
alpha_step = min(50, 200 // max(n_categories - 1, 1))
colors = generate_alpha_gradient(color_mappings['Example_Set_1'], n_categories, alpha_step)

# plt.subplots() returns (figure, axes) - in that order.
fig, ax = plt.subplots()
ax.pie(list(ct_dict.values()), autopct='%.1f%%', colors=colors, labels=list(ct_dict.keys()),
       wedgeprops={'linewidth': 1.0, 'edgecolor': 'white'}, pctdistance=0.75,
       textprops={'size': 12})

# A white disc drawn over the centre turns the pie into a doughnut.
centre_circle = plt.Circle((0, 0), 0.55, fc='white')
ax.add_artist(centre_circle)

# `fig` stays bound to the figure so the export cell can save it.
plt.show()

In [None]:
dest_fig_file_stem = 'Example_peaks_homer_anno_piechart'

# Export the doughnut chart once per output format (skipped when SAVE_FIGS is off).
if SAVE_FIGS:
    for fig_format in ('pdf', 'jpg'):
        fig_target = DEST_FIG_DIR_PATH.joinpath(dest_fig_file_stem + "." + fig_format)
        fig.savefig(fig_target, format=fig_format, dpi=600)

<BR>

### ChIPseeker peak annotation visualisation

##### Create a file summarizing all the data first

In [None]:
# Directory that is searched for the "*_stats.tsv" files and that also receives
# the Excel summary (placeholder - adjust to your setup).
CS_SRC_PATH = Path(r"/path/to/peaks_anno/chipseeker")

In [None]:
# Reference list of feature labels. Its order also defines the order in which
# missing features are added and in which the bar segments are stacked.
features = ["Promoter (<=1kb)", "Promoter (1-2kb)", "Promoter (2-3kb)",
            "5' UTR", "3' UTR", '1st Exon', 'Other Exon', '1st Intron',
            'Other Intron', 'Downstream (<=300)', 'Distal Intergenic']

def fill_missing_features(data: pd.DataFrame) -> pd.DataFrame:
    """Add a zero-frequency row for every reference feature absent from ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Table with at least the columns ``Feature`` and ``Frequency``.

    Returns
    -------
    pd.DataFrame
        ``data`` itself when nothing is missing; otherwise a new frame (with a
        fresh 0..n-1 index) holding the original rows followed by one row per
        missing feature, in the order of ``features``, with ``Frequency`` 0.
        Columns other than ``Feature``/``Frequency`` are left empty (NaN) in
        the added rows. The input frame is never modified.
    """
    present = set(data['Feature'].unique())

    # Walk the reference list (not a set) so the added rows always come out in
    # the same order.
    missing = [feature for feature in features if feature not in present]
    if not missing:
        return data

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    filler = pd.DataFrame({"Feature": missing, "Frequency": 0})
    return pd.concat([data, filler], ignore_index=True)

Read in all the ChIPseeker-generated stats files 
(generated using a separate R script, included in the repo)

In [None]:
df_columns = ["sample", "feature", "frequency"]

# Collect one tidy frame per stats file and concatenate them once at the end.
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previously collected rows on every iteration.)
sample_frames = []

# Path.glob() yields files in arbitrary order; sorting makes the row order
# of the summary reproducible.
for file_name in sorted(CS_SRC_PATH.glob("*_stats.tsv")):
    # We always put the name of a sample
    # at the beginning of the file name.
    # NOTE(review): everything after the first "_" is dropped, so a sample
    # name that itself contains "_" would be truncated - confirm against the
    # actual file naming scheme.
    sample_name = file_name.name.split("_")[0]
    print(sample_name)

    tmp_feature_data = fill_missing_features(pd.read_csv(file_name, sep="\t"))

    sample_frames.append(
        pd.DataFrame(
            {
                "sample": sample_name,
                "feature": tmp_feature_data['Feature'],
                "frequency": tmp_feature_data['Frequency'],
            },
            columns=df_columns,
        )
    )
    del tmp_feature_data

if sample_frames:
    final_df = pd.concat(sample_frames, ignore_index=True)
else:
    # No stats files found: keep an empty frame with the expected columns.
    final_df = pd.DataFrame(columns=df_columns)

In [None]:
# Check all the sample names
final_df['sample'].unique()

Write the data to an Excel file so we do not need to go through the 
above summary again if we just tweak the visualisation part.

In [None]:
# Writing .xlsx needs an Excel writer engine (e.g. openpyxl) to be installed.
# index=False keeps the running row index out of the sheet.
final_df.to_excel(CS_SRC_PATH.joinpath("all_sets_anno_summary.xlsx"), index=False)

<BR>

#### Draw a bar plot with features distributions across all samples
##### Distribution of R-loop peaks within genomic region for enDR3-ChIP, enDR3-DRIPc, and other published approaches 

In [None]:
# List the data set names known to the colour mapping.
list(color_mappings.keys())

In [None]:
sample_order = ['Example_Set_1', 'Example_Set_2', 'Example_Set_3', 'Example_Set_4']

# Turn the sample column into a categorical whose category order is the
# reverse of sample_order, then sort the rows by it. Sample names that are
# not listed in sample_order become NaN.
# (Reversed presumably so that the first sample ends up at the top of the
# horizontal bar chart - confirm.)
final_df = (
    final_df
    .assign(sample=final_df['sample'].astype("category").cat.set_categories(sample_order[::-1]))
    .sort_values(['sample'])
)

In [None]:
fig_bar, ax = plt.subplots()

# One bar per sample, in the order the samples occur in final_df.
# NaN (a sample name without a category) cannot be plotted and is left out.
samples = [s for s in final_df['sample'].unique() if not pd.isna(s)]
bottom = np.zeros(len(samples))

# The default colour cycle has only 10 entries, so with 11 features the first
# and the last segment would share a colour; "tab20" offers 20 distinct ones.
palette = plt.get_cmap("tab20")

label_data = []

for num, feature in enumerate(features):
    selection = final_df.query("`feature` == @feature")

    # Align the values to `samples` by name rather than by row position:
    # a sample lacking this feature contributes 0 instead of shifting the stack.
    widths = (
        selection['frequency']
        .groupby(selection['sample'].astype(object))
        .sum()
        .reindex(samples, fill_value=0)
        .to_numpy(dtype=float)
    )

    ax.barh(samples, widths, left=bottom,
            label=feature, color=palette(num), edgecolor="w", linewidth=0.5)

    # The next feature's segments start where these end.
    bottom = bottom + widths
    label_data.append(selection)

ax.set_xlabel("Frequency")
ax.spines[['right', 'top']].set_visible(False)
ax.legend(loc="upper right", bbox_to_anchor=(1.4, 1),
          frameon=False, borderaxespad=0, ncol=1)

In [None]:
# Export the stacked bar chart once per output format (skipped when SAVE_FIGS is off).
if SAVE_FIGS:
    dest_bar_file_name_stem = 'All_sets_peak_anno_barchart'
    # Extra savefig keyword arguments that apply to a single format only.
    bar_export_options = {
        'pdf': {},
        'jpg': {'transparent': True},
    }
    for bar_format, extra_kwargs in bar_export_options.items():
        bar_target = DEST_FIG_DIR_PATH.joinpath(dest_bar_file_name_stem + "." + bar_format)
        fig_bar.savefig(bar_target, format=bar_format, dpi=600,
                        bbox_inches='tight', **extra_kwargs)