In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib ##
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Global plot styling: seaborn "ticks" theme in Arial, with the top and right
# axes spines switched off for every figure in the notebook.
sns.set(font="Arial", font_scale=1.2, style='ticks')
plt.rcParams.update({"axes.spines.top": False, "axes.spines.right": False})

#%matplotlib inline
# Fixed colour per sample so that each sample looks the same across figures.
sample_palette = {
    'AK1': '#FF0000',
    'iPSC': '#154360',
    'H1': '#40E0D0',
    'HUES64': '#437299',
    'NPC': '#229954',
    'HG002': '#545454',
}
# Three-stop colormap (blue -> peach -> magenta).
cmap_stops = ["#104e8b", "#ffdab9", "#8b0a50"]
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", cmap_stops)

### Gain-Loss counts for AK1-iPSC-NPC relationship

In [None]:
# Expected layout of the tab-separated input table (example rows):
#   Sample     Counts   Type   PMD
#   AK1-iPSC   305570   Gain   PMD
#   AK1-iPSC   79615    Gain   non-PMD
#   AK1-iPSC   3710     Loss   NA
#   iPSC-NPC   4010     Gain   NA
#   iPSC-NPC   10422    Loss   NA
gainloss = pd.read_table("mCG_Gain_Loss_through_AK1-iPSC-NPC.tab")

# Only the AK1-iPSC gains are split by PMD status; every other row keeps its
# plain Type as the stacking category.
is_ak1_ipsc_gain = (gainloss['Sample'] == 'AK1-iPSC') & (gainloss['Type'] == 'Gain')
gainloss['Type_PMD'] = gainloss['Type'].where(
    ~is_ak1_ipsc_gain,
    gainloss['Type'].astype(str) + ' (' + gainloss['PMD'].astype(str) + ')',
)
gainloss['Sample_Type'] = gainloss['Sample'].astype(str) + ' ' + gainloss['Type'].astype(str)

# One bar per "<Sample> <Type>", stacked by Type_PMD.
gainloss_pivot = gainloss.pivot_table(index='Sample_Type', columns='Type_PMD', values='Counts', aggfunc='sum')

colors_dic = {'Gain': '#7F0622', 'Gain (PMD)': '#B2697A', 'Gain (non-PMD)': '#7F0622', 'Loss': '#370080'}
ax = gainloss_pivot.plot(kind='bar', stacked=True, rot=30, color=colors_dic, figsize=(5, 8))

# Use a formatter instead of set_yticklabels(): fixed label strings go stale
# as soon as the tick locations change (e.g. after tight_layout resizes the axes).
ax.yaxis.set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda y, _pos: f'{y / 1e4:g}')
)

ax.set_ylabel('Number of DMRs (x10,000)')
ax.set_xlabel('')
plt.tight_layout()
# savefig() requires a target path; the original call had none and raised TypeError.
# NOTE(review): output name chosen to mirror the input table -- adjust if needed.
plt.savefig('mCG_Gain_Loss_through_AK1-iPSC-NPC.png')

# Further aesthetic adjustments were made in Adobe Illustrator.

In [None]:
# Main Figure (Fig.1 Violinplot)
# Per-sample DMR methylation tables: headerless, tab-separated files whose
# first column is used as the row index (named 'Region' below).
dmr_sample_names = ['AK1', 'iPSC', 'H1', 'HUES64', 'NPC']
ak1, ipsc, h1, hues64, npc = [
    pd.read_table(f'{sample}_DMRsmet.tab', index_col=0, header=None)
    for sample in dmr_sample_names
]

# Align the samples side by side on the shared region index.
df_dmrs = pd.concat([ak1, ipsc, h1, hues64, npc], axis=1)
# Assign the labels first and name the axis afterwards: replacing `.columns`
# creates a fresh Index, so a name set beforehand is silently discarded.
df_dmrs.columns = dmr_sample_names
df_dmrs.columns.name = 'Sample'
df_dmrs.index.name = 'Region'

# Keep only regions that have a value in every sample.
df_dmrs = df_dmrs.dropna()


def row_scaling(row):
    """Standardise one row to zero mean and unit standard deviation.

    Parameters
    ----------
    row : pandas.Series
        Values of a single row (one value per column).

    Returns
    -------
    pandas.Series
        ``(row - row.mean()) / row.std()``. A row whose standard deviation is
        exactly zero is returned as all zeros instead of the NaN values that
        0/0 would produce, so it cannot inject NaNs into downstream clustering.
    """
    centered = row - row.mean()
    spread = row.std()
    if spread == 0:
        # Zero spread means every deviation from the mean is already 0.
        return centered
    return centered / spread

# Standardise every region across samples (see row_scaling) before clustering.
df_dmrs_scaled = df_dmrs.apply(row_scaling, axis=1)

# K-means starts from random centroids. Without a fixed seed the cluster
# labels (and therefore the C1..C12 numbering used in the figures) change on
# every run, so one seed is shared by the elbow scan and the final fit.
kmeans_seed = 0

# Elbow scan: within-cluster sum of squares (inertia) for K = 5..20.
elbow_ks = range(5, 21)
within_cluster_ss = [
    KMeans(n_clusters=k, random_state=kmeans_seed).fit(df_dmrs_scaled).inertia_
    for k in elbow_ks
]

fig, ax = plt.subplots()
ax.plot(list(elbow_ks), within_cluster_ss, marker='o')
ax.set_xlabel('Number of clusters (K)')
ax.set_ylabel('Within-cluster sum of squares')
plt.show()

# Final model with K = 12.
kmeans = KMeans(n_clusters=12, random_state=kmeans_seed)
kmeans.fit(df_dmrs_scaled)

# Attach the cluster assignment to the unscaled table; it becomes the sixth
# column, after the five sample columns.
df_dmrs['K-means cluster (K=12)'] = kmeans.labels_

# NOTE(review): the triple-quoted string below is disabled exploratory code
# (cluster heatmaps). It is a bare string expression and is never executed.
'''
df_dmrs_melted = pd.melt(df_dmrs, id_vars='K-means cluster (K=12)', var_name='variable', value_name='value')

fig, axes = plt.subplots(1, 1, figsize=(8, 5), constrained_layout=True)
sns.heatmap(df_dmrs_melted.pivot_table(index='K-means cluster (K=12)', columns='variable', values='value'),
            cmap=cmap)

sns.heatmap(df_dmrs.sort_values(by='K-means cluster (K=12)').iloc[:, :5], xticklabels=True, yticklabels=False, vmin=0, vmax=100, cmap=cmap)
plt.tight_layout()
'''

# Long-format table per cluster, keyed 'K0'..'K11': take the rows of one
# cluster, keep the first five columns (the samples), and melt to one row per
# (Region, Sample) pair.
cluster_col = 'K-means cluster (K=12)'
df_dmrs_dic = {
    f'K{k}': (
        df_dmrs[df_dmrs[cluster_col] == k]
        .iloc[:, :5]
        .reset_index()
        .melt(id_vars='Region', var_name='Sample', value_name='DNA methylation (%)')
    )
    for k in range(12)
}

# Same frames in cluster order (the dict preserves insertion order K0..K11).
df_dmrs_list = list(df_dmrs_dic.values())

# Panel titles 'C1'..'C12' carrying the number of distinct regions in the
# cluster, formatted with a thousands separator.
df_dmrs_titles = [
    f'C{number} (n={len(np.unique(frame["Region"].values)):,})'
    for number, frame in enumerate(df_dmrs_list, start=1)
]


# 4 x 3 grid with one panel per cluster: a point estimate per sample with
# standard-deviation error bars, all on a shared 0-100 y-range.
fig, axes = plt.subplots(4, 3, figsize=(11, 18), constrained_layout=True)
panels = zip(axes.flat, df_dmrs_list, df_dmrs_titles)
for ax, data, title in panels:
    sns.pointplot(
        data=data,
        x='Sample',
        y='DNA methylation (%)',
        color='black',
        errorbar="sd",
        scale=0.5,
        ax=ax,
    )
    ax.set_title(title, fontweight='bold')
    ax.set(xlabel='', ylim=(0, 100))

plt.savefig('DMR_K-means_clusters_C1-C12.png')


# NOTE(review): the triple-quoted string below is a disabled single-cluster
# test plot (cluster 0). It is a bare string expression and is never executed.
'''
df_dmrs_k0 = df_dmrs[df_dmrs['K-means cluster (K=12)'] == 0]
df_dmrs_k0_filtered = df_dmrs_k0.iloc[:, :5]
df_dmrs_k0_filtered_reset = df_dmrs_k0_filtered.reset_index()
df_dmrs_k0_melted = pd.melt(df_dmrs_k0_filtered_reset, id_vars='Region', var_name='Sample', value_name='DNA methylation (%)')
catplot = sns.catplot(data=df_dmrs_k0_melted, x='Sample', y='DNA methylation (%)', kind='point', estimator='median', errorbar=('ci', 95), color='black', height=7, aspect=1.2)
catplot.ax.set_ylim(0,100)
plt.savefig('test_DMR_K-means_cluster0_mCG.png')
'''





def _plot_10kb_violin(frames, labels, out_path):
    """Draw one violin per sample from 10kb-window mCG tables and save it.

    Parameters
    ----------
    frames : list of pandas.DataFrame or pandas.Series
        One table per sample, concatenated column-wise on their shared index.
    labels : list of str
        Column label for each entry of ``frames``; also used to look up the
        colour in ``sample_palette``.
    out_path : str
        File the figure is written to.

    Returns
    -------
    pandas.DataFrame
        The merged table (index named 'Region') after dropping rows that have
        a missing value in any sample.
    """
    merged = pd.concat(frames, axis=1)
    merged.columns = labels
    merged.index.name = 'Region'
    merged = merged.dropna()

    fig, ax = plt.subplots(1, 1, figsize=(8, 5), constrained_layout=True)
    sns.violinplot(data=merged, palette=sample_palette, cut=0, ax=ax)
    ax.set_ylabel('10kb mCG Level (%)')
    fig.savefig(out_path)
    return merged


# NOTE(review): the *_10kb frames are not defined anywhere in the visible part
# of this notebook -- make sure they are loaded before this point.

# Main Figure (Fig.1 Violinplot)
# NOTE(review): the original plotted `df` here before `df` was assigned
# anywhere in the notebook (NameError on a fresh kernel). The sample set below
# (all samples except HG002) is inferred from the '_wHG002' suffix of the
# supplementary output file -- confirm.
df_10kb_main = _plot_10kb_violin(
    [ak1_10kb, ipsc_10kb, h1_10kb, hues64_10kb, npc_10kb],
    ['AK1', 'iPSC', 'H1', 'HUES64', 'NPC'],
    'Violin_10kb_met.png',
)

# Suppl Figure (Fig.SX Violinplot)
df = _plot_10kb_violin(
    [ak1_10kb, hg002_10kb, ipsc_10kb, h1_10kb, hues64_10kb, npc_10kb],
    ['AK1', 'HG002', 'iPSC', 'H1', 'HUES64', 'NPC'],
    'Violin_10kb_met_wHG002.png',
)