In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA

# Three-stop linear colormap running yellow (#FFFF00) -> black (#000000) -> blue (#0066CC).
# The commented-out line below is an alternative set of colour stops kept for reference.
#cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ["#104e8b", "#ffdab9", "#8b0a50"])
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ["#FFFF00", "#000000", "#0066CC"])

# Global seaborn theme: 'ticks' style, Arial font, text scaled up by 15 %.
# NOTE(review): "Arial" must be installed on the machine running the notebook — confirm.
sns.set_theme(font="Arial", font_scale=1.15, style='ticks')
# Hide the top and right spines on every Axes created from here on.
plt.rc("axes.spines", top=False, right=False)
# NOTE(review): a bare `%matplotlib` asks IPython to pick its default GUI backend rather
# than inline rendering — confirm this is intended for this notebook.
%matplotlib
# NOTE(review): `%autoindent` appears to be a terminal-IPython magic; verify that it is
# available (and needed) in the kernel this notebook is run with.
%autoindent

### Reading the WhatsHap stats results

In [None]:
def _read_whatshap_stats(tsv_path, sample_label):
    """Load one `whatshap stats` TSV and retag its 'AK1' rows.

    The first column of the file is read as the index. Every index entry equal
    to 'AK1' is replaced by `sample_label`; all other entries are kept as they
    are. The index is then moved back into an ordinary column by
    `reset_index()`.

    Parameters
    ----------
    tsv_path : str
        Path of the tab-separated stats file.
    sample_label : str
        Name that replaces 'AK1' in the index.

    Returns
    -------
    pandas.DataFrame
        The table with the relabelled index turned into a regular column.
    """
    stats = pd.read_table(tsv_path, index_col=0)  # whatshap stats results
    # Assigning a plain list builds a fresh, unnamed index.
    stats.index = [sample_label if name == 'AK1' else name for name in stats.index]
    return stats.reset_index()


hifi = _read_whatshap_stats("AK1_PASS_Phased_wIndels_PS_woMXY.tsv", 'AK1_HiFi')
illumina = _read_whatshap_stats(
    "AK1_WGS.Filtered.Variants.PASS.reheader.phased_PS_woMXY_stats.tsv",
    'AK1_Illumina',
)

### Create a bar plot for each item

In [None]:
# Presentation settings shared by both panels of every figure.
sample_palette = {'AK1_HiFi': 'darkred', 'AK1_Illumina': 'pink'}
legend_labels = ['AK1 HiFi (26X)', 'AK1 Illumina (70X)']
ylabel_by_column = {
    'phased': 'The Number of Phased Variants',
    'blocks': 'The Number of Phased Blocks',
    'bp_per_block_avg': 'The Average Length of Phased Blocks (bp)',
}


def _thousands(value, _pos):
    """Format a tick value as an integer with comma thousands separators."""
    return format(int(value), ',')


def _label_value_axis(axes, text):
    """Set the y-axis label of `axes` and give it its own comma tick formatter."""
    axes.set_ylabel(text)
    axes.get_yaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(_thousands))


# One two-panel figure per metric: per-chromosome bars on the left (3/4 of the
# width) and the 'ALL' summary row on the right, saved as "<column>_barplot.pdf".
for column, ylabel in ylabel_by_column.items():
    wanted = ['index', 'chromosome', column]
    df = pd.concat([hifi[wanted], illumina[wanted]], axis=0)
    df_chr = df.loc[df['chromosome'].ne('ALL')]
    df_all = df.loc[df['chromosome'].eq('ALL')]

    fig = plt.figure(figsize=(16.5, 8), constrained_layout=True)
    grid = gridspec.GridSpec(1, 2, width_ratios=[3, 1], figure=fig)

    # Left panel: one group of bars per chromosome.
    ax1 = plt.subplot(grid[0])
    sns.barplot(data=df_chr, x='chromosome', y=column, hue='index', palette=sample_palette, ax=ax1)
    ax1.set_xlabel('GRCh38 Autosomes')
    _label_value_axis(ax1, ylabel)
    handles, _ = ax1.get_legend_handles_labels()
    # The legend sits further left for the block-length plot than for the others.
    anchor_x = 0.3 if column == 'bp_per_block_avg' else 0.95
    ax1.legend(
        handles,
        legend_labels,
        loc='upper right',
        bbox_to_anchor=(anchor_x, 0.9),
        frameon=True,
        fancybox=False,
        edgecolor='black',
        prop={'size': 14},
        title=None,
    )

    # Right panel: the 'ALL' row only, drawn with narrower bars and no legend.
    ax2 = plt.subplot(grid[1])
    sns.barplot(data=df_all, x='chromosome', y=column, hue='index', palette=sample_palette, width=0.5, ax=ax2)
    ax2.set_xlabel('')
    ax2.set_xticklabels(['All GRCh38 Autosomes'], size=15)
    _label_value_axis(ax2, ylabel)
    ax2.legend().set_visible(False)

    plt.savefig(f"{column}_barplot.pdf")


