### Peaks genome coverage and peaks width distribution
##### Code snippets for generating and visualising peaks genome coverage and peaks width distribution

In [None]:
from pathlib import Path

import numpy as np

import matplotlib.pyplot as plt
import pandas as pd

import matplotlib as mpl
import matplotlib.ticker as mticker
import seaborn as sns

# mpl.use('Agg')

# Global Matplotlib text settings, applied in a single update:
#   - use the sans-serif family with Arial as the preferred face
#   - 'svg.fonttype' = 'none' keeps SVG text as font text instead of curves
mpl.rcParams.update({
    'font.sans-serif': ['Arial'],
    'font.family': 'sans-serif',
    'svg.fonttype': 'none',
})

#### Peaks summed length and genome coverage from peak files

In [None]:
# Directories that are searched recursively for .narrowPeak/.broadPeak files.
# Replace the placeholder paths below with the real locations of the peak sets.
SRC_DIR_LIST = [
    r"/path/to/peaks/example_set_1",
    r"/path/to/peaks/example_set_2",
    r"/path/to/peaks/example_set_3",
    r"/path/to/peaks/example_set_4",
]

In [None]:
# Output directory for the peak summary table (the Excel file is written here)
DEST_DIR_PATH = Path(r"/path/to/peaks/analysis")

In [None]:
def get_peaks_width_sum(data: pd.DataFrame) -> int:
    """Return the summed width of all peaks in ``data``.

    The width of a row is the value in the third column minus the value in
    the second column (presumably BED-style ``end`` and ``start``).
    """
    starts = data.iloc[:, 1]
    ends = data.iloc[:, 2]
    return (ends - starts).sum()

def get_peaks_mean_width(data: pd.DataFrame) -> float:
    """Return the mean peak width in ``data``.

    The width of a row is the value in the third column minus the value in
    the second column (presumably BED-style ``end`` and ``start``).
    """
    widths = data.iloc[:, 2].sub(data.iloc[:, 1])
    return widths.mean()

# Female (we are working with datasets from HeLa/HEK293 cells)
# Used as the denominator when computing the peaks genome coverage.
HUMAN_GENOME_SIZE = 3_054_815_472

def get_peaks_genome_coverage(peaks_width_sum: int, genome_size: int) -> float:
    """Return the percentage of the genome covered by peaks.

    Args:
        peaks_width_sum: Summed width of all peaks.
        genome_size: Total genome size, in the same unit as ``peaks_width_sum``.

    Returns:
        ``peaks_width_sum / genome_size * 100`` as a float.

    Raises:
        ZeroDivisionError: If ``genome_size`` is 0.
    """
    # Use the argument, not a same-purpose global variable, so the result
    # depends only on what the caller passes in.
    return (peaks_width_sum / genome_size) * 100

def get_peaks_count(data: pd.DataFrame) -> int:
    """Return the number of rows (one per peak) in ``data``."""
    n_rows = data.shape[0]
    return n_rows

In [None]:
# Build one summary record per peak file, keyed by a running integer index.
data_dict = {}
cnt = 0
for dir_path in SRC_DIR_LIST:
    # Sorted so that the row order of the summary is reproducible between runs.
    peak_files = sorted(
        p for p in Path(dir_path).glob("**/*") if p.suffix in {".narrowPeak", ".broadPeak"}
    )
    for file_name in peak_files:
        # narrowPeak/broadPeak files are normally header-less BED-like tables, so
        # the first line is a peak and must not be consumed as column names
        # (header=0 would silently drop one peak per file).
        # NOTE(review): if your files start with a `track` line, skip it explicitly.
        peaks_data = pd.read_csv(file_name, sep='\t', decimal='.', header=None)

        # Sample name = first two underscore-separated parts of the file name
        name_parts = file_name.name.split('_')
        sample_name = name_parts[0] + " " + name_parts[1]

        p_count = get_peaks_count(peaks_data)
        p_mean_width = get_peaks_mean_width(peaks_data)
        pw_sum = get_peaks_width_sum(peaks_data)
        pg_coverage = get_peaks_genome_coverage(pw_sum, HUMAN_GENOME_SIZE)

        data_dict[cnt] = {
            "sample_name": sample_name,
            "peak_count": p_count,
            "peak_mean_width": p_mean_width,
            "peaks_width_sum": pw_sum,
            "peaks_genome_coverage": pg_coverage
        }
        cnt += 1
        print(f"{sample_name} - {p_count} - {round(p_mean_width, 2)} - {pw_sum} - {round(pg_coverage, 2)}%")

In [None]:
# One row per entry of data_dict; the integer keys of data_dict become the row index
peak_data = pd.DataFrame.from_dict(data_dict, orient='index')

In [None]:
# Display the summary table
peak_data

Save the data to an Excel file so we do not need to re-do the summary every time we tweak plots

In [None]:
# Write the summary table (without the integer row index) into DEST_DIR_PATH
result_file_name = "Peak_width_summary.xlsx"
summary_path = DEST_DIR_PATH.joinpath(result_file_name)
peak_data.to_excel(summary_path, index=False)

In [None]:
# Output directory for the genome-coverage bar plot files
DEST_FIGS_DIR_PATH = Path(r"/path/to/figures/peaks_genome_coverage/")

In [None]:
# Colour per sample. It is assigned in this cell because the bar plot is drawn
# before the later cell that also assigns `color_mappings`; without it a fresh
# "Restart & Run All" stops here with a NameError. Keep both definitions in sync.
color_mappings = {
    'Example_Set_1': '#3C5488',
    'Example_Set_2': '#00A087',
    'Example_Set_3': '#E64B35',
    'Example_Set_4': '#8491B4',
}

# Horizontal bar plot: one bar per sample, bar length = genome coverage in percent
sns.set(rc={'figure.figsize': (6, 3.5)})
sns.set_style("ticks")
bp = sns.barplot(data=peak_data, x="peaks_genome_coverage",
                 y="sample_name", orient="h", palette=color_mappings)
bp.set_xlabel('Peaks genome coverage [%]')
bp.get_figure().tight_layout()

In [None]:
# Save the coverage bar plot as PDF and as JPEG (both at 600 dpi)
plot_file_name = 'Peak_coverage_all'
coverage_fig = bp.get_figure()
coverage_fig.savefig(DEST_FIGS_DIR_PATH / f"{plot_file_name}.pdf",
                     format='pdf', dpi=600)
coverage_fig.savefig(DEST_FIGS_DIR_PATH / f"{plot_file_name}.jpg",
                     format='jpg', dpi=600, transparent=True)

#### Peaks width distribution
The GC content files generated with `bedtools nuc` were used again.
These files also include information about the length of each peak.

In [None]:
# Source directory where all the files generated by `bedtools nuc` are located
SRC_DIR_PATH = Path(r"/path/to/gc_content")

# Output directory for the peak-width distribution figures.
# NOTE: this rebinds DEST_DIR_PATH, which earlier pointed at the directory of the
# summary table; re-running the Excel export cell after this one writes here instead.
DEST_DIR_PATH = Path(r"/path/to/figures/peak_width_distribution")

In [None]:
# Collect one (peak_width, sample) frame per `bedtools nuc` file and concatenate
# once at the end; concatenating inside the loop is quadratic and seeds the result
# with an empty object-dtype frame.
peak_width_frames = []

# Sorted so that the sample order is reproducible between runs.
for file_path in sorted(SRC_DIR_PATH.glob("*.txt")):
    # Sample name = first two underscore-separated parts of the file name
    name_parts = file_path.name.split('_')
    sample_name = name_parts[0] + " " + name_parts[1]
    print(sample_name)

    # glob() already yields paths that include SRC_DIR_PATH; joining them onto
    # SRC_DIR_PATH again would duplicate the directory for relative paths.
    nuc_data = pd.read_csv(file_path, sep='\t', decimal='.', header=0)

    # Find the index of the first column containing the sequence length
    seq_len_col_idx = next(
        (idx for idx, col in enumerate(nuc_data.columns) if str(col).endswith('_seq_len')),
        None,
    )
    if seq_len_col_idx is None:
        raise ValueError(f"No column ending with '_seq_len' found in {file_path}")

    peak_width_frames.append(
        nuc_data.iloc[:, seq_len_col_idx]
        .rename('peak_width')
        .to_frame()
        .assign(sample=sample_name)
    )

if peak_width_frames:
    final_df = pd.concat(peak_width_frames, axis=0, ignore_index=True)
else:
    # No input files: keep the expected columns so later cells still find them
    final_df = pd.DataFrame(columns=['peak_width', 'sample'])

In [None]:
# List the distinct sample names that were loaded
final_df['sample'].unique()

In [None]:
# Samples to include in the distribution plot, in legend order.
# Defaults to every loaded sample; replace with an explicit list of sample names
# to plot a subset. The name must exist because the plotting cell passes it as
# `hue_order`.
samples_to_plot = list(final_df['sample'].unique())

data_to_plot = final_df.loc[final_df['sample'].isin(samples_to_plot)]

In [None]:
# Colour assigned to each sample in the plots (passed as `palette`).
# The file names must match the keys in the mapping dictionary
color_mappings = {
    'Example_Set_1': '#3C5488',
    'Example_Set_2': '#00A087',
    'Example_Set_3': '#E64B35',
    'Example_Set_4': '#8491B4',
}

# NOTE(review): `colors` is not referenced by any other cell visible here — confirm it is still needed
colors = -1

In [None]:
# Per-sample density of peak widths on a log-scaled x axis.
# common_norm=False normalises every sample's curve independently.
sns.set(rc={'figure.figsize': (5, 3)})
sns.set_style("ticks")
kde_plot = sns.kdeplot(data=data_to_plot, x='peak_width', hue='sample',
                       palette=color_mappings, hue_order=samples_to_plot,
                       common_norm=False, fill=False, log_scale=True)

# Minor ticks at 0.1, 0.2, ..., 0.9 times each power of ten
locmin = mticker.LogLocator(base=10, subs=np.arange(0.1, 1, 0.1), numticks=10)
kde_plot.xaxis.set_minor_locator(locmin)

kde_plot.set_xlabel('log$_{10}$(peak width)')
# Place the legend outside the axes, to the right of the plot
sns.move_legend(kde_plot, "upper left", bbox_to_anchor=(1, 1), title="Data set")

kde_plot.get_figure().tight_layout()

In [None]:
# Save the width-distribution plot as PDF and as JPEG (both at 600 dpi),
# then close the current figure.
plot_file_name = 'Peak_width_dist_lines'
width_dist_fig = kde_plot.get_figure()
width_dist_fig.savefig(DEST_DIR_PATH / f"{plot_file_name}.pdf",
                       format='pdf', dpi=600, transparent=True)
width_dist_fig.savefig(DEST_DIR_PATH / f"{plot_file_name}.jpg",
                       format='jpg', dpi=600, transparent=True)
plt.close()