In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

sns.set(font="Arial", font_scale=1.15, style='ticks')
plt.rc("axes.spines", top=False, right=False)
%matplotlib
%autoindent

hap_sample_palette = {'AK1_Hap1': '#CC0033', 
                      'AK1_Hap2': '#FF0033', 
                      'iPSC_Hap1': '#0066CC', 
                      'iPSC_Hap2': '#0000CC', 
                      'NPC_Hap1': '#33CC00', 
                      'NPC_Hap2': '#66FF66', 
                      'H1': '#40E0D0',
                      'HUES64': '#437299'}

### mCG inside Known ICRs from Imprintome (w/ HG002)

In [None]:
# Heatmap of per-region mCG (%) inside the known ICRs for every sample,
# HG002 included. Regions are keyed as "chrom:start-end".

# Region names: BED columns 0-2 build the key, column 3 is kept as 'Name'.
df_known = pd.read_table("Known_ICR_cleaned.bed", header=None)
df_known['ICR_region'] = df_known.iloc[:, 0] + ':' + df_known.iloc[:, 1].astype(str) + '-' + df_known.iloc[:, 2].astype(str)
df_known = df_known.set_index('ICR_region')
df_known = df_known[[3]]
df_known.columns = ['Name']

# Per-sample tables: first column is the region key, second the value.
# Haplotype-resolved samples are read as hap1/hap2 pairs and joined on the key.
df_ak1_hap1 = pd.read_table("AK1_hap1_KnownICR.tab", index_col=0, header=None)
df_ak1_hap2 = pd.read_table("AK1_hap2_KnownICR.tab", index_col=0, header=None)
df_ak1 = pd.concat([df_ak1_hap1, df_ak1_hap2], axis=1)
df_ak1.index.name = 'ICR_region'
df_ak1.columns = ['AK1_Hap1', 'AK1_Hap2']

df_hg002_hap1 = pd.read_table("HG002_hap1_KnownICR.tab", index_col=0, header=None)
df_hg002_hap2 = pd.read_table("HG002_hap2_KnownICR.tab", index_col=0, header=None)
df_hg002 = pd.concat([df_hg002_hap1, df_hg002_hap2], axis=1)
df_hg002.index.name = 'ICR_region'
df_hg002.columns = ['HG002_Hap1', 'HG002_Hap2']

df_ipsc_hap1 = pd.read_table("iPSC_hap1_KnownICR.tab", index_col=0, header=None)
df_ipsc_hap2 = pd.read_table("iPSC_hap2_KnownICR.tab", index_col=0, header=None)
df_ipsc = pd.concat([df_ipsc_hap1, df_ipsc_hap2], axis=1)
df_ipsc.index.name = 'ICR_region'
df_ipsc.columns = ['iPSC_Hap1', 'iPSC_Hap2']

df_npc_hap1 = pd.read_table("NPC_hap1_KnownICR.tab", index_col=0, header=None)
df_npc_hap2 = pd.read_table("NPC_hap2_KnownICR.tab", index_col=0, header=None)
df_npc = pd.concat([df_npc_hap1, df_npc_hap2], axis=1)
df_npc.index.name = 'ICR_region'
df_npc.columns = ['NPC_Hap1', 'NPC_Hap2']

# Haplotype-merged samples contribute a single column each.
df_h1 = pd.read_table("H1_HapMerged_KnownICR.tab", index_col=0, header=None)
df_h1.index.name = 'ICR_region'
df_h1.columns = ['H1']

df_hues64 = pd.read_table("HUES64_HapMerged_KnownICR.tab", index_col=0, header=None)
df_hues64.index.name = 'ICR_region'
df_hues64.columns = ['HUES64']

# One wide table (Name + one column per sample), ordered by the integer
# suffix after the last '_' in the region name.
df_known_met = pd.concat([df_known, df_ak1, df_hg002, df_ipsc, df_npc, df_h1, df_hues64], axis=1)
df_known_met = df_known_met.iloc[np.argsort(df_known_met['Name'].apply(lambda x: int(x.split('_')[-1])))]

# Heatmap matrix: drop 'Name' and order rows by the H1/HUES64 mean, highest
# first. assign() builds a new frame, so no column is ever written onto a
# slice of df_known_met (the original triggered SettingWithCopyWarning).
df_known_met_hm = (
    df_known_met.iloc[:, 1:]
    .assign(H1_HUES64_mean=lambda d: d[['H1', 'HUES64']].mean(axis=1))
    .sort_values(by='H1_HUES64_mean', ascending=False)
    .drop(columns='H1_HUES64_mean')
)

# Cell labels with one decimal. Column-wise Series.map is used instead of
# DataFrame.applymap, which is deprecated in recent pandas releases.
annot = df_known_met_hm.apply(lambda col: col.map(lambda f: f'{f: .1f}'))

fig, ax = plt.subplots(squeeze=False, constrained_layout=True)
# Layer 1: grey cells labelled "NA" wherever a value is missing.
heatmap1 = sns.heatmap(
    np.where(df_known_met_hm.isna(), 0, np.nan),
    ax = ax[0, 0],
    cbar = False,
    yticklabels = True,
    annot = np.full_like(df_known_met_hm, "NA", dtype = object),
    fmt = "",
    annot_kws = {"size": 10, "va": "center_baseline", "color": "black"},
    cmap = ListedColormap(['grey']),
    linewidth = 0)
# Layer 2: the measured values on a fixed 0-100 colour scale.
heatmap2 = sns.heatmap(
    df_known_met_hm,
    ax = ax[0, 0],
    xticklabels = True,
    yticklabels = True,
    annot = annot, 
    fmt = "",
    annot_kws = {"size": 10, "va": "center_baseline"},
    cmap = "coolwarm",
    linewidth = 0.5, 
    linecolor = "black",
    vmin = 0,
    vmax = 100)
ax[0, 0].xaxis.tick_top()
ax[0, 0].set_ylabel('Known ICR from Imprintome (N = 25)')
# Extend the limits slightly past the last column/row of cells.
heatmap1.set_xlim(0, df_known_met_hm.shape[1] + 0.1)
heatmap1.set_ylim(df_known_met_hm.shape[0] + 0.1, 0)
# Both layers share one Axes; collections[1] is the value layer, which owns the colorbar.
heatmap2.collections[1].colorbar.set_label(label='DNA methylation (%)', rotation=270, labelpad=10)
plt.savefig('Heatmap_mCG_KnownICRs.png')



### mCG inside Known ICRs from Imprintome (w/o HG002)

In [None]:
# Same heatmap as the known-ICR figure, but without the HG002 haplotypes.
# Relies on df_known_met built by the preceding known-ICR cell.

# Drop 'Name' and the HG002 columns by label (instead of by position), then
# order rows by the H1/HUES64 mean, highest first. assign() returns a new
# frame, so df_known_met itself is left untouched.
df_known_met_hm = (
    df_known_met
    .drop(columns=['Name', 'HG002_Hap1', 'HG002_Hap2'])
    .assign(H1_HUES64_mean=lambda d: d[['H1', 'HUES64']].mean(axis=1))
    .sort_values(by='H1_HUES64_mean', ascending=False)
    .drop(columns='H1_HUES64_mean')
)

# Cell labels with one decimal. Column-wise Series.map is used instead of
# DataFrame.applymap, which is deprecated in recent pandas releases.
annot = df_known_met_hm.apply(lambda col: col.map(lambda f: f'{f: .1f}'))

fig, ax = plt.subplots(squeeze=False, constrained_layout=True)
# Layer 1: grey cells labelled "NA" wherever a value is missing.
heatmap1 = sns.heatmap(
    np.where(df_known_met_hm.isna(), 0, np.nan),
    ax = ax[0, 0],
    cbar = False,
    yticklabels = True,
    annot = np.full_like(df_known_met_hm, "NA", dtype = object),
    fmt = "",
    annot_kws = {"size": 10, "va": "center_baseline", "color": "black"},
    cmap = ListedColormap(['grey']),
    linewidth = 0)
# Layer 2: the measured values on a fixed 0-100 colour scale.
heatmap2 = sns.heatmap(
    df_known_met_hm,
    ax = ax[0, 0],
    xticklabels = True,
    yticklabels = True,
    annot = annot, 
    fmt = "",
    annot_kws = {"size": 10, "va": "center_baseline"},
    cmap = "coolwarm",
    linewidth = 0.5, 
    linecolor = "black",
    vmin = 0,
    vmax = 100)
ax[0, 0].xaxis.tick_top()
ax[0, 0].set_ylabel('Known ICR from Imprintome (N = 25)')
# Extend the limits slightly past the last column/row of cells.
heatmap1.set_xlim(0, df_known_met_hm.shape[1] + 0.1)
heatmap1.set_ylim(df_known_met_hm.shape[0] + 0.1, 0)
# Both layers share one Axes; collections[1] is the value layer, which owns the colorbar.
heatmap2.collections[1].colorbar.set_label(label='DNA methylation (%)', rotation=270, labelpad=10)
plt.savefig('Heatmap_mCG_KnownICRs_woHG002.png')



### mCG inside Putative ICRs from Imprintome (w/ HG002)

In [None]:
# Heatmap of per-region mCG (%) inside the putative ICRs for every sample,
# HG002 included. Regions are keyed as "chrom:start-end".

# Region names: BED columns 0-2 build the key, column 3 is kept as 'Name'.
df_putative = pd.read_table("Putative_ICR_wochrX.bed", header=None)
df_putative['ICR_region'] = df_putative.iloc[:, 0] + ':' + df_putative.iloc[:, 1].astype(str) + '-' + df_putative.iloc[:, 2].astype(str)
df_putative = df_putative.set_index('ICR_region')
df_putative = df_putative[[3]]
df_putative.columns = ['Name']

# Per-sample tables: first column is the region key, second the value.
# Haplotype-resolved samples are read as hap1/hap2 pairs and joined on the key.
df_ak1_hap1 = pd.read_table("AK1_hap1_PutativeICR.tab", index_col=0, header=None)
df_ak1_hap2 = pd.read_table("AK1_hap2_PutativeICR.tab", index_col=0, header=None)
df_ak1 = pd.concat([df_ak1_hap1, df_ak1_hap2], axis=1)
df_ak1.index.name = 'ICR_region'
df_ak1.columns = ['AK1_Hap1', 'AK1_Hap2']

df_hg002_hap1 = pd.read_table("HG002_hap1_PutativeICR.tab", index_col=0, header=None)
df_hg002_hap2 = pd.read_table("HG002_hap2_PutativeICR.tab", index_col=0, header=None)
df_hg002 = pd.concat([df_hg002_hap1, df_hg002_hap2], axis=1)
df_hg002.index.name = 'ICR_region'
df_hg002.columns = ['HG002_Hap1', 'HG002_Hap2']

df_ipsc_hap1 = pd.read_table("iPSC_hap1_PutativeICR.tab", index_col=0, header=None)
df_ipsc_hap2 = pd.read_table("iPSC_hap2_PutativeICR.tab", index_col=0, header=None)
df_ipsc = pd.concat([df_ipsc_hap1, df_ipsc_hap2], axis=1)
df_ipsc.index.name = 'ICR_region'
df_ipsc.columns = ['iPSC_Hap1', 'iPSC_Hap2']

df_npc_hap1 = pd.read_table("NPC_hap1_PutativeICR.tab", index_col=0, header=None)
df_npc_hap2 = pd.read_table("NPC_hap2_PutativeICR.tab", index_col=0, header=None)
df_npc = pd.concat([df_npc_hap1, df_npc_hap2], axis=1)
df_npc.index.name = 'ICR_region'
df_npc.columns = ['NPC_Hap1', 'NPC_Hap2']

# Haplotype-merged samples contribute a single column each.
df_h1 = pd.read_table("H1_HapMerged_PutativeICR.tab", index_col=0, header=None)
df_h1.index.name = 'ICR_region'
df_h1.columns = ['H1']

df_hues64 = pd.read_table("HUES64_HapMerged_PutativeICR.tab", index_col=0, header=None)
df_hues64.index.name = 'ICR_region'
df_hues64.columns = ['HUES64']

# One wide table (Name + one column per sample), ordered by the integer
# suffix after the last '_' in the region name.
df_putative_met = pd.concat([df_putative, df_ak1, df_hg002, df_ipsc, df_npc, df_h1, df_hues64], axis=1)
df_putative_met = df_putative_met.iloc[np.argsort(df_putative_met['Name'].apply(lambda x: int(x.split('_')[-1])))]

# Heatmap matrix: drop 'Name' and order rows by the H1/HUES64 mean, highest
# first. assign() builds a new frame, so no column is ever written onto a
# slice of df_putative_met (the original triggered SettingWithCopyWarning).
df_putative_met_hm = (
    df_putative_met.iloc[:, 1:]
    .assign(H1_HUES64_mean=lambda d: d[['H1', 'HUES64']].mean(axis=1))
    .sort_values(by='H1_HUES64_mean', ascending=False)
    .drop(columns='H1_HUES64_mean')
)

# No per-cell annotation text is built here: both layers are drawn with annot=None.
fig, ax = plt.subplots(squeeze=False, constrained_layout=True)
# Layer 1: grey cells wherever a value is missing.
heatmap1 = sns.heatmap(
    np.where(df_putative_met_hm.isna(), 0, np.nan),
    ax = ax[0, 0],
    cbar = False,
    xticklabels = True,
    yticklabels = False,
    annot = None,
    fmt = "",
    cmap = ListedColormap(['grey']),
    linewidth = 0)
# Layer 2: the measured values on a fixed 0-100 colour scale.
heatmap2 = sns.heatmap(
    df_putative_met_hm,
    ax = ax[0, 0],
    xticklabels = True,
    yticklabels = False,
    annot = None, 
    fmt = "",
    cmap = "coolwarm",
    linewidth = 0.001, 
    linecolor = "black",
    vmin = 0,
    vmax = 100)
ax[0, 0].xaxis.tick_top()
ax[0, 0].set_ylabel('Putative ICR from Imprintome (N = 1,390)')
# Both layers share one Axes; collections[1] is the value layer, which owns the colorbar.
heatmap2.collections[1].colorbar.set_label(label='DNA methylation (%)', rotation=270, labelpad=10)
plt.savefig('Heatmap_mCG_PutativeICRs.png')



### mCG inside Putative ICRs from Imprintome (w/o HG002)

In [None]:
# Putative-ICR heatmap without the HG002 haplotypes.
# The HG002 columns are removed by label rather than by position, and
# errors='ignore' keeps the cell re-runnable: the frame is overwritten under
# the same name, so a second run finds the columns already gone.
df_putative_met_hm = df_putative_met_hm.drop(columns=['HG002_Hap1', 'HG002_Hap2'], errors='ignore')

fig, ax = plt.subplots(squeeze=False, constrained_layout=True)
# Layer 1: grey cells wherever a value is missing.
heatmap1 = sns.heatmap(
    np.where(df_putative_met_hm.isna(), 0, np.nan),
    ax = ax[0, 0],
    cbar = False,
    xticklabels = True,
    yticklabels = False,
    annot = None,
    fmt = "",
    cmap = ListedColormap(['grey']),
    linewidth = 0)
# Layer 2: the measured values on a fixed 0-100 colour scale.
heatmap2 = sns.heatmap(
    df_putative_met_hm,
    ax = ax[0, 0],
    xticklabels = True,
    yticklabels = False,
    annot = None, 
    fmt = "",
    cmap = "coolwarm",
    linewidth = 0.001, 
    linecolor = "black",
    vmin = 0,
    vmax = 100)
ax[0, 0].xaxis.tick_top()
ax[0, 0].set_ylabel('Putative ICR from Imprintome (N = 1,390)')
# Both layers share one Axes; collections[1] is the value layer, which owns the colorbar.
heatmap2.collections[1].colorbar.set_label(label='DNA methylation (%)', rotation=270, labelpad=10)
plt.savefig('Heatmap_mCG_PutativeICRs_woHG002.png')



### mCG inside novel AK1 candidate ICRs (w/ HG002)

In [None]:
# Heatmap of per-region mCG (%) inside the novel AK1 candidate ICRs for every
# sample, rows ordered by the H1/HUES64 mean. Regions are keyed as
# "chrom:start-end".

# Region names: BED columns 0-2 build the key, column 3 is kept as 'Name'.
df_candidate_novelICR = pd.read_table("Candidate_ICR_AK1_Gene_Association_NOVEL.bed", header=None)
df_candidate_novelICR['ICR_region'] = df_candidate_novelICR.iloc[:, 0] + ':' + df_candidate_novelICR.iloc[:, 1].astype(str) + '-' + df_candidate_novelICR.iloc[:, 2].astype(str)
df_candidate_novelICR = df_candidate_novelICR.set_index('ICR_region')
df_candidate_novelICR = df_candidate_novelICR[[3]]
df_candidate_novelICR.columns = ['Name']

# Per-sample tables: first column is the region key, second the value.
# Haplotype-resolved samples are read as hap1/hap2 pairs and joined on the key.
df_ak1_hap1 = pd.read_table("AK1_hap1_AK1ICR.tab", index_col=0, header=None)
df_ak1_hap2 = pd.read_table("AK1_hap2_AK1ICR.tab", index_col=0, header=None)
df_ak1 = pd.concat([df_ak1_hap1, df_ak1_hap2], axis=1)
df_ak1.index.name = 'ICR_region'
df_ak1.columns = ['AK1_Hap1', 'AK1_Hap2']

df_hg002_hap1 = pd.read_table("HG002_hap1_AK1ICR.tab", index_col=0, header=None)
df_hg002_hap2 = pd.read_table("HG002_hap2_AK1ICR.tab", index_col=0, header=None)
df_hg002 = pd.concat([df_hg002_hap1, df_hg002_hap2], axis=1)
df_hg002.index.name = 'ICR_region'
df_hg002.columns = ['HG002_Hap1', 'HG002_Hap2']

df_ipsc_hap1 = pd.read_table("iPSC_hap1_AK1ICR.tab", index_col=0, header=None)
df_ipsc_hap2 = pd.read_table("iPSC_hap2_AK1ICR.tab", index_col=0, header=None)
df_ipsc = pd.concat([df_ipsc_hap1, df_ipsc_hap2], axis=1)
df_ipsc.index.name = 'ICR_region'
df_ipsc.columns = ['iPSC_Hap1', 'iPSC_Hap2']

df_npc_hap1 = pd.read_table("NPC_hap1_AK1ICR.tab", index_col=0, header=None)
df_npc_hap2 = pd.read_table("NPC_hap2_AK1ICR.tab", index_col=0, header=None)
df_npc = pd.concat([df_npc_hap1, df_npc_hap2], axis=1)
df_npc.index.name = 'ICR_region'
df_npc.columns = ['NPC_Hap1', 'NPC_Hap2']

# Haplotype-merged samples contribute a single column each.
df_h1 = pd.read_table("H1_HapMerged_AK1ICR.tab", index_col=0, header=None)
df_h1.index.name = 'ICR_region'
df_h1.columns = ['H1']

df_hues64 = pd.read_table("HUES64_HapMerged_AK1ICR.tab", index_col=0, header=None)
df_hues64.index.name = 'ICR_region'
df_hues64.columns = ['HUES64']

# One wide table: Name + one column per sample.
df_ak1novel_met = pd.concat([df_candidate_novelICR, df_ak1, df_hg002, df_ipsc, df_npc, df_h1, df_hues64], axis=1)

# Heatmap matrix: drop 'Name' and order rows by the H1/HUES64 mean, highest
# first. assign() builds a new frame, so no column is ever written onto a
# slice of df_ak1novel_met (the original triggered SettingWithCopyWarning).
df_ak1novel_met_hm = (
    df_ak1novel_met.iloc[:, 1:]
    .assign(H1_HUES64_mean=lambda d: d[['H1', 'HUES64']].mean(axis=1))
    .sort_values(by='H1_HUES64_mean', ascending=False)
    .drop(columns='H1_HUES64_mean')
)

# No per-cell annotation text is built here: both layers are drawn with annot=None.
fig, ax = plt.subplots(squeeze=False, constrained_layout=True)
# Layer 1: grey cells wherever a value is missing.
heatmap1 = sns.heatmap(
    np.where(df_ak1novel_met_hm.isna(), 0, np.nan),
    ax = ax[0, 0],
    cbar = False,
    xticklabels = True,
    yticklabels = False,
    annot = None,
    fmt = "",
    cmap = ListedColormap(['grey']),
    linewidth = 0)
# Layer 2: the measured values on a fixed 0-100 colour scale.
heatmap2 = sns.heatmap(
    df_ak1novel_met_hm,
    ax = ax[0, 0],
    xticklabels = True,
    yticklabels = False,
    annot = None, 
    fmt = "",
    cmap = "coolwarm",
    linewidth = 0.001, 
    linecolor = "black",
    vmin = 0,
    vmax = 100)
ax[0, 0].xaxis.tick_top()
ax[0, 0].set_xticklabels(ax[0, 0].get_xticklabels(), rotation=45)
ax[0, 0].set_ylabel('Novel ICRs in AK1 (N = 609)')
# Both layers share one Axes; collections[1] is the value layer, which owns the colorbar.
heatmap2.collections[1].colorbar.set_label(label='DNA methylation (%)', rotation=270, labelpad=10)
plt.savefig('Heatmap_mCG_AK1novelICRs_1.png')





# Second view of the novel AK1 candidate ICRs: same samples, but rows are
# ordered by the AK1_Hap1 value (highest first).
df_ak1novel_met = pd.concat([df_candidate_novelICR, df_ak1, df_hg002, df_ipsc, df_npc, df_h1, df_hues64], axis=1)
# Column 0 is 'Name'; everything after it is one column per sample.
df_ak1novel_met_hm = df_ak1novel_met.iloc[:, 1:].sort_values(by='AK1_Hap1', ascending=False)

# No per-cell annotation text is built here: both layers are drawn with annot=None.
fig, ax = plt.subplots(squeeze=False, constrained_layout=True)
# Layer 1: grey cells wherever a value is missing.
heatmap1 = sns.heatmap(
    np.where(df_ak1novel_met_hm.isna(), 0, np.nan),
    ax = ax[0, 0],
    cbar = False,
    xticklabels = True,
    yticklabels = False,
    annot = None,
    fmt = "",
    cmap = ListedColormap(['grey']),
    linewidth = 0)
# Layer 2: the measured values on a fixed 0-100 colour scale.
heatmap2 = sns.heatmap(
    df_ak1novel_met_hm,
    ax = ax[0, 0],
    xticklabels = True,
    yticklabels = False,
    annot = None, 
    fmt = "",
    cmap = "coolwarm",
    linewidth = 0.001, 
    linecolor = "black",
    vmin = 0,
    vmax = 100)
ax[0, 0].xaxis.tick_top()
ax[0, 0].set_xticklabels(ax[0, 0].get_xticklabels(), rotation=45)
ax[0, 0].set_ylabel('Novel ICRs in AK1 (N = 609)')
# Both layers share one Axes; collections[1] is the value layer, which owns the colorbar.
heatmap2.collections[1].colorbar.set_label(label='DNA methylation (%)', rotation=270, labelpad=10)
plt.savefig('Heatmap_mCG_AK1novelICRs_2.png')



### mCG inside novel AK1 candidate ICRs (w/o HG002)

In [None]:
# Novel AK1 candidate ICRs without the HG002 haplotypes, rows ordered by the
# AK1_Hap1 value (highest first).
df_ak1novel_met = pd.concat([df_candidate_novelICR, df_ak1, df_hg002, df_ipsc, df_npc, df_h1, df_hues64], axis=1)
# 'Name' and the HG002 columns are removed by label instead of by position,
# so the selection does not depend on the column order of the concat above.
df_ak1novel_met_hm = (
    df_ak1novel_met
    .drop(columns=['Name', 'HG002_Hap1', 'HG002_Hap2'])
    .sort_values(by='AK1_Hap1', ascending=False)
)

# No per-cell annotation text is built here: both layers are drawn with annot=None.
fig, ax = plt.subplots(squeeze=False, constrained_layout=True)
# Layer 1: grey cells wherever a value is missing.
heatmap1 = sns.heatmap(
    np.where(df_ak1novel_met_hm.isna(), 0, np.nan),
    ax = ax[0, 0],
    cbar = False,
    xticklabels = True,
    yticklabels = False,
    annot = None,
    fmt = "",
    cmap = ListedColormap(['grey']),
    linewidth = 0)
# Layer 2: the measured values on a fixed 0-100 colour scale.
heatmap2 = sns.heatmap(
    df_ak1novel_met_hm,
    ax = ax[0, 0],
    xticklabels = True,
    yticklabels = False,
    annot = None, 
    fmt = "",
    cmap = "coolwarm",
    linewidth = 0.001, 
    linecolor = "black",
    vmin = 0,
    vmax = 100)
ax[0, 0].xaxis.tick_top()
ax[0, 0].set_xticklabels(ax[0, 0].get_xticklabels(), rotation=45)
ax[0, 0].set_ylabel('Novel ICRs in AK1 (N = 609)')
# Both layers share one Axes; collections[1] is the value layer, which owns the colorbar.
heatmap2.collections[1].colorbar.set_label(label='DNA methylation (%)', rotation=270, labelpad=10)
plt.savefig('Heatmap_mCG_AK1novelICRs_2_woHG002.png')

### PCA

In [None]:
# PCA of the samples (columns of df_ak1novel_met_hm) over the novel AK1 ICRs.

# Keep only regions (rows) that have a value in every sample.
# NOTE(review): the original second dropna re-read df_ak1novel_met_hm, which
# discarded this row filter and dropped whole samples instead -- confirm that
# region-wise filtering is the intended behaviour.
df_ak1novel_met_pca = df_ak1novel_met_hm.dropna(axis=0, how='any')

# Z-score each sample column, then treat samples as observations (rows).
df_ak1novel_met_pca_normalized = (df_ak1novel_met_pca - df_ak1novel_met_pca.mean()) / df_ak1novel_met_pca.std()
sample_matrix = df_ak1novel_met_pca_normalized.T.to_numpy(dtype=float)

# Two-component PCA via SVD of the feature-centred matrix. No PCA class is
# imported anywhere in this notebook, so the projection is computed with numpy.
centered = sample_matrix - sample_matrix.mean(axis=0)
left_vecs, singular_values, right_vecs = np.linalg.svd(centered, full_matrices=False)
# The sign of each component is arbitrary; pin it so the largest-magnitude
# loading is positive, which makes the plot reproducible between runs.
dominant = np.abs(right_vecs).argmax(axis=1)
signs = np.sign(right_vecs[np.arange(right_vecs.shape[0]), dominant])
signs[signs == 0] = 1.0
transformed_data = (left_vecs * singular_values * signs)[:, :2]
component_variance = singular_values ** 2
explained_variance_ratio = component_variance / component_variance.sum()

sns.set(font="Arial", font_scale=0.9, style='ticks')
plt.rc("axes.spines", top=False, right=False)
fig, ax = plt.subplots(figsize=(7, 7), constrained_layout=True)
# One labelled point per sample, coloured from the shared palette.
for sample_name, (pc1, pc2) in zip(df_ak1novel_met_pca.columns, transformed_data):
    ax.scatter(pc1, pc2,
               color = hap_sample_palette[sample_name],
               s = 200)
    ax.annotate(sample_name,
                (pc1, pc2),
                xytext = (5,5),
                textcoords = 'offset points')

ax.set_xlabel(f'PC1 ({explained_variance_ratio[0]*100:.2f}%)')
ax.set_ylabel(f'PC2 ({explained_variance_ratio[1]*100:.2f}%)')
# Restore the notebook-wide plotting defaults, including the spine setting
# that is applied separately from sns.set().
sns.set(font="Arial", font_scale=1.15, style='ticks')
plt.rc("axes.spines", top=False, right=False)
plt.savefig('PCA_mCG_AK1novelICRs_woHG002.png')