In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# Global plot styling: Arial, "ticks" style, no top/right spines.
sns.set_theme(style='ticks', font="Arial", font_scale=1.15)
plt.rc("axes.spines", top=False, right=False)
#%matplotlib inline

# Colormap used by the methylation heatmaps: yellow -> black -> blue.
cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
    "",
    ["#FFFF00", "#000000", "#0066CC"],
)

# One fixed colour per sample so every figure uses the same coding.
sample_palette = dict(
    AK1='#FF0000',
    iPSC='#154360',
    H1='#40E0D0',
    HUES64='#437299',
    NPC='#229954',
    HG002='#545454',
)

### Input File

In [None]:
#dir="/Users/mhryansohn/Desktop/01.Workspace/01.Projects/03.AK1-PacBio/01.DNA/Merged_Run/DNA_methylation/PMD_from_AK1_WGBS/"
dir="/mnt/data/Projects/phenomata/01.Projects/13.AK1_PacBio/01.DNA/Merged/Phasing_wIndels/MethPipe/PMD/"


def _read_pmd_lengths(bed_path, sample):
    """Read a tab-separated BED file and return the length of every interval.

    Parameters
    ----------
    bed_path : str
        Path to a BED file whose first three columns are chrom, start, end.
    sample : str
        Sample tag appended to each interval ID.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``ID`` (``"chrom:start-end_sample"``) with one column,
        ``Length`` (``end - start``).
    """
    ids, lengths = [], []
    with open(bed_path, 'r') as bed:
        for record in bed:
            record = record.rstrip('\r\n')
            if not record.strip():
                # A blank/trailing line would otherwise raise IndexError.
                continue
            chrom, start, end = record.split('\t')[:3]
            ids.append(f"{chrom}:{start}-{end}_{sample}")
            lengths.append(int(end) - int(start))
    return pd.DataFrame({'ID': ids, 'Length': lengths}).set_index('ID')


pmd_from_ak1_hifi = _read_pmd_lengths(f"{dir}AK1_PMD_HiFi.bed", 'AK1')
pmd_from_hg002_hifi = _read_pmd_lengths(f"{dir}HG002_PMD_HiFi.bed", 'HG002')

# Stack both samples; the sample tag is the last '_'-separated field of the ID
# (chromosome names may themselves contain underscores, hence str[-1]).
pmd_total = pd.concat([pmd_from_ak1_hifi, pmd_from_hg002_hifi], axis=0)
pmd_total = pmd_total.assign(Sample=pmd_total.index.str.split('_').str[-1])

fig, ax = plt.subplots(figsize=(8,8), constrained_layout=True)
hist = sns.histplot(data=pmd_total, x='Length', hue='Sample', kde=True, stat='count', palette=sample_palette, ax=ax)
ax.set_xlim(pmd_total['Length'].min(), pmd_total['Length'].max())
ax.set_xlabel("Length of PMD")
ax.set_ylabel("The Number of PMD")

# Rebuild the seaborn-generated legend with custom placement and frame.
# `Legend.legendHandles` was renamed to `legend_handles` in Matplotlib 3.7 and
# the old attribute was later removed, so support both spellings.
legend = ax.legend_
handles = legend.legend_handles if hasattr(legend, 'legend_handles') else legend.legendHandles
labels = [text.get_text() for text in legend.texts]

ax.legend(handles, labels, loc='upper right', bbox_to_anchor=(0.9, 0.8), frameon=True, fancybox=False, edgecolor='black', prop={'size':15}, title=None)
plt.savefig('PMD_Length_Distribution_AK1_HG002.png')


### PMD (from AK1 HiFi) mCG Merging

In [None]:
dir='/mnt/data/Projects/phenomata/01.Projects/13.AK1_PacBio/01.DNA/Merged/Phasing_wIndels/MethPipe/PMD/'

# One "<sample>_AK1_HiFi_PMD_met.txt" table per sample, each indexed by its
# first column, read in the column order wanted for the merged table.
#hg002 = pd.read_table(f"{dir}HG002_merged_AK1_WGBS_PMD_met.txt", index_col=0)
#hg002 = hg002.rename(columns={'HG002_merged': 'HG002'})
ak1, hg002, ipsc, h1, hues64, npc = [
    pd.read_table(f"{dir}{sample}_AK1_HiFi_PMD_met.txt", index_col=0)
    for sample in ('AK1', 'HG002', 'iPSC', 'H1', 'HUES64', 'NPC')
]

# Side-by-side table: one row per index label, one block of columns per sample.
merge = pd.concat([ak1, hg002, ipsc, h1, hues64, npc], axis=1)

# Discard every row whose index label starts with 'chrY'.
on_chrY = merge.index.str.startswith('chrY')
merge = merge.loc[~on_chrY]

### Analysis

In [None]:
# Main Figure

sns.set_theme(font="Arial", font_scale=1.5, style='ticks')
plt.rc("axes.spines", top=False, right=False)


def _pmd_methylation_heatmap(values, ax):
    """Draw `values` as a 0-100 heatmap on `ax` and label colourbar and y axis.

    Returns the Axes object handed back by `sns.heatmap`.
    """
    drawn = sns.heatmap(
        values,
        vmin=0,
        vmax=100,
        xticklabels=True,
        yticklabels=False,
        cbar=True,
        cbar_kws={'label':'DNA methylation (%)'},
        cmap=cmap,
        ax=ax,
    )
    # Re-set the colourbar label so it can be rotated to read top-to-bottom.
    colorbar = drawn.collections[0].colorbar
    colorbar.set_label(label='DNA methylation (%)', rotation=270, labelpad=20)
    ax.set_ylabel('PMDs from AK1 HiFi (N=2,087)', labelpad=15)
    return drawn


# Main panel: columns 0, 2, 3, 4, 5 of `merge` (column 1 is left out).
fig, ax = plt.subplots(figsize=(10,15), constrained_layout=True)
heatmap = _pmd_methylation_heatmap(merge.iloc[:, [0, 2, 3, 4, 5]], ax)
# Thin white vertical strips at x = 1, 2 and 4 visually separate column groups.
for separator in [1, 2, 4]:
    strip = plt.Rectangle((separator, 0), 0.05, merge.shape[0], fill=True, color="white", lw=0)
    heatmap.add_patch(strip)

plt.savefig('PMD_AK1_HiFi_mCG_Heatmap.png')

# Main Figure (w/ HG002)
fig, ax = plt.subplots(figsize=(10,15))
heatmap = _pmd_methylation_heatmap(merge, ax)
plt.savefig('PMD_AK1_HiFi_mCG_Heatmap_wHG002.png')

# Restore the notebook-wide font scale used before this cell.
sns.set(font="Arial", font_scale=1.15, style='ticks')
plt.rc("axes.spines", top=False, right=False)


### 100kb DNA methylation

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

sns.set(font="Arial", font_scale=2, style='ticks')

#df1 = pd.read_table('100kb_individual/AK1_100kbmet.tab', index_col=0, header=None)
#df2 = pd.read_table('100kb_individual/iPSC_100kbmet.tab', index_col=0, header=None)
#df3 = pd.read_table('100kb_individual/H1_100kbmet.tab', index_col=0, header=None)
#df4 = pd.read_table('100kb_individual/NPC_100kbmet.tab', index_col=0, header=None)
#
#df = pd.concat([df1, df2, df3, df4], axis=1)

# Header-less per-sample tables, indexed by their first column.
ak1_100kb = pd.read_table('AK1_100kbmet.tab', index_col=0, header=None)
hg002_100kb = pd.read_table('HG002_100kbmet.tab', index_col=0, header=None) # Added For Fig.S
ipsc_100kb = pd.read_table('iPSC_100kbmet.tab', index_col=0, header=None)
h1_100kb = pd.read_table('H1_100kbmet.tab', index_col=0, header=None)
hues64_100kb = pd.read_table('HUES64_100kbmet.tab', index_col=0, header=None) # Added 20230703
npc_100kb = pd.read_table('NPC_100kbmet.tab', index_col=0, header=None)

tiles_100kb = {
    'AK1': ak1_100kb,
    'HG002': hg002_100kb,
    'iPSC': ipsc_100kb,
    'H1': h1_100kb,
    'HUES64': hues64_100kb,
    'NPC': npc_100kb,
}

# Two panels, drawn in this order:
#   1. Main Figure (Fig.1 Violinplot)   -> would be 'Violin_100kb_met.png'
#   2. Suppl Figure (Fig.SX Violinplot) -> would be 'Violin_100kb_met_wHG002.png'
# (saving is currently disabled for both)
for panel in (
    ['AK1', 'iPSC', 'H1', 'HUES64', 'NPC'],
    ['AK1', 'HG002', 'iPSC', 'H1', 'HUES64', 'NPC'],
):
    df = pd.concat([tiles_100kb[sample] for sample in panel], axis=1)
    df.columns = list(panel)
    df.index.name = 'Region'
    # Keep only rows that have a value in every sample of this panel.
    df = df.dropna()

    fig, axes = plt.subplots(1, 1, figsize=(8, 5), constrained_layout=True)
    sns.violinplot(data=df, palette=sample_palette, ax=axes)
    axes.set_ylabel('100kb mCG Level (%)')
    sns.despine(ax=axes)

### 10kb DNA methylation

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(font="Arial", font_scale=2, style='ticks')
plt.rc("axes.spines", top=False, right=False)

sample_palette = {'AK1':'#FF0000', 'iPSC':'#154360', 'H1': '#40E0D0', 'HUES64': '#437299', 'NPC':'#229954', 'HG002': '#545454'}

# Header-less per-sample tables, indexed by their first column.
ak1_10kb = pd.read_table('AK1_10kbmet.tab', index_col=0, header=None)
hg002_10kb = pd.read_table('HG002_10kbmet.tab', index_col=0, header=None) # Added For Fig.S
ipsc_10kb = pd.read_table('iPSC_10kbmet.tab', index_col=0, header=None)
h1_10kb = pd.read_table('H1_10kbmet.tab', index_col=0, header=None)
hues64_10kb = pd.read_table('HUES64_10kbmet.tab', index_col=0, header=None)
npc_10kb = pd.read_table('NPC_10kbmet.tab', index_col=0, header=None)


def _violin_10kb(tiles, names, out_png):
    """Merge per-sample tables column-wise, plot one violin per sample, save.

    `tiles` and `names` are parallel lists (tables and their column labels);
    rows with a missing value in any sample are dropped before plotting.
    Returns the merged table together with the figure and axes.
    """
    table = pd.concat(tiles, axis=1)
    table.columns = names
    table.index.name = 'Region'
    table = table.dropna()

    figure, axis = plt.subplots(1, 1, figsize=(8, 5), constrained_layout=True)
    # cut=0 stops the density at the most extreme observed values.
    sns.violinplot(data=table, palette=sample_palette, cut=0, ax=axis)
    axis.set_ylabel('10kb mCG Level (%)')
    plt.savefig(out_png)
    return table, figure, axis


# Main Figure (Fig.1 Violinplot)
df, fig, axes = _violin_10kb(
    [ak1_10kb, ipsc_10kb, h1_10kb, hues64_10kb, npc_10kb],
    ['AK1', 'iPSC', 'H1', 'HUES64', 'NPC'],
    'Violin_10kb_met.png',
)

# Suppl Figure (Fig.SX Violinplot)
df, fig, axes = _violin_10kb(
    [ak1_10kb, hg002_10kb, ipsc_10kb, h1_10kb, hues64_10kb, npc_10kb],
    ['AK1', 'HG002', 'iPSC', 'H1', 'HUES64', 'NPC'],
    'Violin_10kb_met_wHG002.png',
)

### PCA using 10kb methylation

In [None]:
# Merge the six per-sample 10kb tables and keep rows present in every sample.
df_10kb = pd.concat([ak1_10kb, hg002_10kb, ipsc_10kb, h1_10kb, hues64_10kb, npc_10kb], axis=1)
df_10kb.columns = ['AK1', 'HG002', 'iPSC', 'H1', 'HUES64', 'NPC']
df_10kb.index.name = 'Region'
df_10kb = df_10kb.dropna()

# Z-score each sample (column), then run PCA with samples as observations.
df_10kb_normalized = (df_10kb - df_10kb.mean()) / df_10kb.std()
# NOTE(review): `PCA` is not imported anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. Presumably it is
# `sklearn.decomposition.PCA` -- confirm and add the import to the top cell.
pca = PCA(n_components=2)
pca.fit(df_10kb_normalized.T)
transformed_data = pca.transform(df_10kb_normalized.T)

# Row i of `transformed_data` corresponds to column i of the normalized table,
# so take the text labels from the columns. The previous hard-coded list
# (['AK1', 'HG002', 'iPSC', 'NPC', 'H1', 'HUES64']) was in a different order and
# mislabelled the H1, HUES64 and NPC points while colouring them correctly.
labels = list(df_10kb_normalized.columns)
fig, ax = plt.subplots(figsize=(7, 7), constrained_layout=True)
for label, (pc1, pc2) in zip(labels, transformed_data):
    ax.scatter(pc1, pc2,
               color = sample_palette[label],
               s = 200)
    ax.annotate(label,
                (pc1, pc2),
                xytext = (5,5),
                textcoords = 'offset points')

ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.2f}%)')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.2f}%)')

plt.savefig('PCA_mCG_10kb.png')

### Random 10kb mCG outside PMD (Supplementary Figure)

In [None]:
dir='/mnt/data/Projects/phenomata/01.Projects/13.AK1_PacBio/01.DNA/Analysis_2023/10kb_methylation/'

# One header-less "<sample>_10kb_random10k_met.tab" table per sample; the value
# column is named after the sample and the first column becomes the index.
(ak1_random10k, hg002_random10k, ipsc_random10k,
 h1_random10k, hues64_random10k, npc_random10k) = [
    pd.read_table(f"{dir}{sample}_10kb_random10k_met.tab", index_col=0, names=[sample], header=None)
    for sample in ('AK1', 'HG002', 'iPSC', 'H1', 'HUES64', 'NPC')
]
merge_random10k = pd.concat([ak1_random10k, hg002_random10k, ipsc_random10k, h1_random10k, hues64_random10k, npc_random10k], axis=1)

from matplotlib.colors import ListedColormap

fig, ax = plt.subplots(squeeze=False, figsize=(10, 11))
panel = ax[0, 0]

# Layer 1 -- missing values: 0 where the table is NaN, NaN everywhere else,
# drawn with a single-colour (grey) colormap and no colourbar.
missing_layer = np.where(merge_random10k.isna(), 0, np.nan)
heatmap1 = sns.heatmap(
    missing_layer,
    ax=panel,
    cbar=False,
    xticklabels=True,
    yticklabels=False,
    annot=None,
    fmt="",
    cmap=ListedColormap(['grey']),
    linewidth=0)

# Layer 2 -- the actual values on the same axes, fixed to the 0-100 range.
heatmap2 = sns.heatmap(
    merge_random10k,
    ax=panel,
    xticklabels=True,
    yticklabels=False,
    annot=None,
    fmt="",
    cmap=cmap,
    linewidth=0.001,
    linecolor="black",
    vmin=0,
    vmax=100)

panel.set_ylabel('Randomly selected 10kb tiles')
# collections[0] belongs to the grey layer; collections[1] is the value layer,
# which is the one that owns the colourbar.
value_colorbar = heatmap2.collections[1].colorbar
value_colorbar.set_label(label='DNA methylation (%)', rotation=270, labelpad=10)
plt.savefig(f'{dir}Randomly_Selected_10kbtiles_outside_PMD_mCG_Heatmap_wHG002.png')

### CpG Density of PMD

In [None]:
'''/mnt/mone/Project/AK1_PacBio/01.DNA/Analysis_Samples_Merged/DNA_Methylation_Analysis/PMD/CpG_Density
bedtools intersect -v -a hg38_primary_chrX_10kb.bed -b hg38.analysisSet_N.bed > hg38_primary_chrX_10kb_woN.bed
bedtools intersect -a hg38_primary_chrX_10kb_woN.bed -b hg38_primary_chrX_CpG.bed -c > hg38_primary_chrX_10kb_woN_CpGcount.bed
bedtools intersect -a hg38_primary_chrX_10kb_woN_CpGcount.bed -b AK1_PMD_HiFi.bed > 10kb_CpG_number_wAK1PMD.bed
bedtools intersect -v -a hg38_primary_chrX_10kb_woN_CpGcount.bed -b AK1_PMD_HiFi.bed > 10kb_CpG_number_woAK1PMD.bed
'''

# Header-less BED-like tables produced by the commands above.
cpg_wPMD = pd.read_table("10kb_CpG_number_wAK1PMD.bed", header=None)
cpg_woPMD = pd.read_table("10kb_CpG_number_woAK1PMD.bed", header=None)

# Keep only column 3 and tag each table with its group. `.assign` returns a
# new frame, so no column is written into a slice of the raw table (which
# triggers pandas' SettingWithCopyWarning).
cpg_wPMD = cpg_wPMD[[3]].assign(PMD='PMD')
cpg_woPMD = cpg_woPMD[[3]].assign(PMD='Non-PMD')

# ignore_index avoids duplicated row labels in the stacked table.
cpg_density = pd.concat([cpg_wPMD, cpg_woPMD], axis=0, ignore_index=True)

# Min-max scale column 3 to [0, 1] across both groups together.
cpg_count = cpg_density.iloc[:, 0]
cpg_density['Normalized_CpG'] = (cpg_count - cpg_count.min()) / (cpg_count.max() - cpg_count.min())

fig, ax = plt.subplots(figsize=(5, 5), constrained_layout=True)
sns.violinplot(data=cpg_density, x='PMD', y='Normalized_CpG', palette={'PMD': 'darkkhaki', 'Non-PMD': 'black'}, ax = ax, cut=0)
ax.set_xlabel('')
ax.set_ylabel('Normalized CpG Density of 10kb tiles')
plt.savefig('Normalized_CpG_Density_PMD_Non-PMD.pdf')


### PMD from AK1 aligned onto CHM13 genome

In [None]:
# To see which regions of PMD are not covered by H1 and HUES64
dir = '/mnt/data/Projects/phenomata/01.Projects/13.AK1_PacBio/01.DNA/CHM13/'

# Per-sample tables for the CHM13-based PMD set, indexed by their first column.
ak1_chm13_pmd = pd.read_table(f"{dir}AK1_chm13_AK1_chm13_HiFi_PMD_met.txt", index_col=0)
ipsc_chm13_pmd = pd.read_table(f"{dir}iPSC_chm13_AK1_chm13_HiFi_PMD_met.txt", index_col=0)
h1_chm13_pmd = pd.read_table(f"{dir}H1_AK1_chm13_HiFi_PMD_met.txt", index_col=0)
hues64_chm13_pmd = pd.read_table(f"{dir}HUES64_AK1_chm13_HiFi_PMD_met.txt", index_col=0)
npc_chm13_pmd = pd.read_table(f"{dir}NPC_chm13_AK1_chm13_HiFi_PMD_met.txt", index_col=0)

# Merge the CHM13 tables loaded above. The previous version concatenated the
# frames `ak1`, `ipsc`, `h1`, `hues64`, `npc` from an earlier cell, so the
# CHM13 files were read but never used.
merge_chm13_pmd = pd.concat(
    [ak1_chm13_pmd, ipsc_chm13_pmd, h1_chm13_pmd, hues64_chm13_pmd, npc_chm13_pmd],
    axis=1,
)

# Drop chrY rows from the CHM13 table (the previous version re-filtered the
# unrelated `merge` table instead).
merge_chm13_pmd = merge_chm13_pmd[~merge_chm13_pmd.index.str.startswith('chrY')]