### Peaks GC content visualisation

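The GC content of the peaks was calculated beforehand using `bedtools nuc`; this notebook loads those results and plots the per-sample GC content distributions.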

In [None]:
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd

import matplotlib as mpl
import seaborn as sns

# Global matplotlib text settings, applied in one call:
#   * use the sans-serif family with Arial as the preferred face
#   * keep SVG text as font-based text instead of converting it to curves
mpl.rcParams.update({
    'font.sans-serif': ['Arial'],
    'font.family': 'sans-serif',
    'svg.fonttype': 'none',
})

In [None]:
# Switch for the export cell at the end of the notebook: when True the figure
# is written to DEST_DIR_PATH (PDF and JPG); when False nothing is saved.
SAVE_FIGS = True

In [None]:
# Source directory where all the files generated by `bedtools nuc` are located.
# Every `*.txt` file directly inside this directory is loaded as one sample.
SRC_DIR_PATH = Path(r"/path/to/gc_content")

# Destination directory for the exported figures (only used when SAVE_FIGS is True).
DEST_DIR_PATH = Path(r"/path/to/gc_content/plots")

In [None]:
# Collect the GC fraction of every sample into one long-format table with the
# columns `gc` (value read from the `*_pct_gc` column) and `sample` (label
# derived from the file name).
#
# The per-file frames are gathered in a list and concatenated once at the end:
# concatenating inside the loop copies the growing table on every iteration,
# and concatenating onto an empty frame is deprecated in recent pandas.
per_sample_frames = []

# We had our files with .txt extension.
# `sorted` makes the loading order (and thus the row order) reproducible.
for file_path in sorted(SRC_DIR_PATH.glob("*.txt")):
    # The sample label is made of the first two underscore-separated parts of
    # the file name; slicing avoids an IndexError for names without "_".
    name_components = file_path.name.split('_')
    sample_name = " ".join(name_components[:2])
    print(sample_name)

    # `glob` already yields paths that include SRC_DIR_PATH
    tmp_peaks_data = pd.read_csv(file_path, sep='\t', decimal='.', header=0)

    # Find the index of the first column containing the GC ratio
    gc_col_idx = next(
        (i for i, element in enumerate(tmp_peaks_data.columns) if element.endswith('_pct_gc')),
        None,
    )
    if gc_col_idx is None:
        raise ValueError(f"No column ending with '_pct_gc' found in {file_path}")

    tmp_df = pd.DataFrame({'gc': tmp_peaks_data.iloc[:, gc_col_idx]})
    tmp_df['sample'] = sample_name
    per_sample_frames.append(tmp_df)

    del tmp_df, tmp_peaks_data

if per_sample_frames:
    final_df = pd.concat(per_sample_frames, axis=0, ignore_index=True)
else:
    # No input files: keep an empty table with the expected columns
    final_df = pd.DataFrame(columns=['gc', 'sample'])

del per_sample_frames

In [None]:
# Descriptive statistics of the loaded values, computed separately for each sample
final_df.groupby('sample').describe()

In [None]:
# By default every loaded sample is plotted
data_to_plot = final_df

# Or select a subset to plot: list the wanted `sample` labels in
# `samples_to_plot` and uncomment the two lines below
# samples_to_plot = []
# data_to_plot = final_df.loc[final_df['sample'].isin(samples_to_plot)]

In [None]:
# Line colour (hex code) for each sample. The mapping is passed to the plot as
# the `palette` for the `sample` hue, so its keys must be identical to the
# sample labels, which are derived from the file names.
color_mappings = {
    "Example_Set_1": "#3C5488",
    "Example_Set_2": "#00A087",
    "Example_Set_3": "#E64B35",
    "Example_Set_4": "#8491B4",
}

# NOTE(review): `colors` is not referenced in the visible cells - confirm
# whether it is still needed.
colors = -1

In [None]:
# Order in which the samples are drawn and listed in the legend: the insertion
# order of `color_mappings`. Stored as a real list (a snapshot) rather than a
# live `dict_keys` view, because `hue_order` is documented to take a list.
all_sample_order = list(color_mappings)

In [None]:
# Kernel density estimate of the GC content, one line per sample.
sns.set(rc={'figure.figsize': (6, 3.25)})
sns.set_style("ticks")
kde_plot = sns.kdeplot(
    data=data_to_plot,
    # Take the values from `data_to_plot` (not `final_df`) so that x and hue
    # refer to the same rows when only a subset of samples is selected.
    # The stored value is multiplied by 100 to plot it as a percentage.
    x=data_to_plot['gc'] * 100,
    hue='sample',
    palette=color_mappings,
    # Normalise each sample's density on its own, independent of sample size
    common_norm=False,
    fill=False,
    hue_order=all_sample_order,
)
kde_plot.axvline(40.9, linestyle='--', linewidth=0.5, color='green')  # human genome average GC content
kde_plot.set_xlabel('GC content [%]')
# Place the legend outside the axes, to the right of the plot
sns.move_legend(kde_plot, "upper left", bbox_to_anchor=(1.025, 1.03))
kde_plot.get_figure().tight_layout()

In [None]:
# Export the KDE figure as PDF and JPG when saving is enabled.
# File name without extension; the extension is appended per format below.
plot_file_name_stem = 'Example_GC_content_summary_lines'
if SAVE_FIGS:
    # Make sure the destination exists, otherwise `savefig` fails
    DEST_DIR_PATH.mkdir(parents=True, exist_ok=True)

    figure = kde_plot.get_figure()
    figure.savefig(
        DEST_DIR_PATH.joinpath(plot_file_name_stem + '.pdf'), format='pdf', dpi=600
    )
    # NOTE(review): JPEG has no alpha channel, so `transparent=True` probably
    # has no visible effect here - confirm whether PNG was intended.
    figure.savefig(
        DEST_DIR_PATH.joinpath(plot_file_name_stem + '.jpg'), format='jpg', dpi=600, transparent=True
    )