In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib ##
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
# Global seaborn theme: Arial font, text scaled by 1.2, 'ticks' axes style.
sns.set(font="Arial", font_scale=1.2, style='ticks')
# Hide the top and right axes spines on every figure created afterwards.
plt.rc("axes.spines", top=False, right=False)
import gseapy as gp

#%matplotlib inline
# Hex colour assigned to each sample name.
sample_palette = {'AK1':'#FF0000', 'iPSC':'#154360', 'H1': '#40E0D0', 'HUES64': '#437299', 'NPC':'#229954', 'HG002': '#545454'}
# Unnamed three-stop linear colormap interpolating between the listed hex colours.
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ["#104e8b", "#ffdab9", "#8b0a50"])

### Gain-Loss counts for AK1-iPSC-NPC relationship

In [None]:
# /mnt/mone/Project/AK1_PacBio/01.DNA/Analysis_Samples_Merged/DNA_Methylation_Analysis/DMR/DSS/DSS_Outputs/Gain_Loss_Counts
# AK1-iPSC DMRs not overlapped with PMD

def count_gain_loss(bed_path):
    """Count gain and loss records in one tab-separated DMR BED file.

    For every non-blank line the fourth column is split on '/', the second
    piece has any trailing ')' removed and is parsed as a float. A value
    greater than zero counts the record as a loss, anything else as a gain.
    NOTE(review): the parsed number is presumably a methylation difference
    between the two compared samples -- confirm against the DSS output format.

    Parameters
    ----------
    bed_path : str
        Path of the BED file to read.

    Returns
    -------
    tuple of (int, int)
        ``(gain_count, loss_count)``.

    Raises
    ------
    IndexError, ValueError
        If a line does not carry the expected fourth-column layout.
    """
    gain_count, loss_count = 0, 0
    # The context manager closes the handle even when parsing fails.
    with open(bed_path, 'r') as bed:
        for record in bed:
            if not record.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = record.strip().split('\t')
            if float(fields[3].split('/')[1].strip(')')) > 0:
                loss_count += 1
            else:
                gain_count += 1
    return gain_count, loss_count


# DMRs in AK1-iPSC relationship (file without PMD-overlapping DMRs, then the full set)
gain_nonPMD, loss_nonPMD = count_gain_loss("DMRs_DSS_woPMD_AK1-iPSC_MetTh40.bed")
gain_total, loss_total = count_gain_loss("DMRs_DSS_AK1-iPSC_MetTh40.bed")

# DMRs in iPSC-NPC relationship
gain, loss = count_gain_loss("DMRs_DSS_iPSC-NPC_MetTh40.bed")
print(gain)
print(loss)

# Output table layout (# = count):
#   AK1-iPSC  #  Gain  PMD
#   AK1-iPSC  #  Gain  non-PMD
#   AK1-iPSC  #  Loss  NA
#   iPSC-NPC  #  Gain  NA
#   iPSC-NPC  #  Loss  NA
# The handle is closed on leaving the `with` block, so the table is fully on
# disk before any later step reads it back.
with open("mCG_Gain_Loss_through_AK1-iPSC-NPC.tab", 'w') as rfh:
    rfh.write('Sample\tCounts\tType\tPMD\n')
    # Gains inside PMDs = all gains minus the gains that survive PMD removal.
    rfh.write('AK1-iPSC\t' + str(gain_total - gain_nonPMD) + '\tGain\tPMD\n')
    rfh.write('AK1-iPSC\t' + str(gain_nonPMD) + '\tGain\tnon-PMD\n')
    # The loss row reports the non-PMD count; loss_total is computed but not written.
    rfh.write('AK1-iPSC\t' + str(loss_nonPMD) + '\tLoss\tNA\n')
    rfh.write('iPSC-NPC\t' + str(gain) + '\tGain\tNA\n')
    rfh.write('iPSC-NPC\t' + str(loss) + '\tLoss\tNA\n')

# /mnt/data/Projects/phenomata/01.Projects/13.AK1_PacBio/01.DNA/Analysis_2023/DMR
# Content recorded for the table that is read below:
#   Sample    Counts  Type  PMD
#   AK1-iPSC  305570  Gain  PMD
#   AK1-iPSC  79615   Gain  non-PMD
#   AK1-iPSC  3710    Loss  NA
#   iPSC-NPC  4010    Gain  NA
#   iPSC-NPC  10422   Loss  NA
gainloss = pd.read_table("mCG_Gain_Loss_through_AK1-iPSC-NPC.tab")

# Only AK1-iPSC gains are split by PMD status; every other row keeps its plain Type.
gainloss['Type_PMD'] = gainloss.apply(lambda x: f"{x['Type']} ({x['PMD']})" if x['Sample'] == 'AK1-iPSC' and x['Type'] == 'Gain' else x['Type'], axis=1)
# One bar per (sample pair, direction).
gainloss['Sample_Type'] = gainloss.apply(lambda x: f"{x['Sample']} {x['Type']}", axis=1)
gainloss_pivot = gainloss.pivot_table(index='Sample_Type', columns='Type_PMD', values='Counts', aggfunc='sum')

colors_dic = {'Gain': '#7F0622', 'Gain (PMD)': '#B2697A', 'Gain (non-PMD)': '#7F0622', 'Loss': '#370080'}
ax = gainloss_pivot.plot(kind='bar', stacked=True, rot=0, color=colors_dic, figsize=(5,8))
# Label ticks through a formatter instead of set_yticklabels(): fixed label lists
# are only safe with fixed tick positions, whereas a formatter is evaluated for
# whatever ticks the locator finally places.
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda y, pos: f'{int(y/10**4)}'))
plt.setp(ax.get_xticklabels(), rotation=30)

ax.set_ylabel('Number of DMRs (x10,000)')
ax.set_xlabel('')
plt.tight_layout()
plt.savefig("DMR_counts_gain-loss.pdf")

# Adding Further Aesthetics through Adobe Illustrator

### Distance to the nearest TSS

In [None]:
#!/mnt/mone/Project/WC300/Tools/Anaconda3/envs/pygenometracks/bin/python
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from matplotlib.ticker import NullLocator
sns.set(font="arial", font_scale=1.15, style='ticks')
plt.rc("axes.spines", top=False, right=False)

def _run_shell(command):
    """Run `command` through os.system and raise if it reports a non-zero status.

    Without this check a failed step (for example bedtools missing from PATH)
    would leave an empty or stale file that is then read as if it were valid.
    """
    status = os.system(command)
    if status != 0:
        raise RuntimeError(f'Shell command failed with status {status}: {command}')


# Annotate every merged DMR with its closest TSS record; `-d` appends the distance column.
_run_shell('bedtools closest -d -a DMRs_DSS_ALL_MetTh40.merged.bed -b TSS.gencode.v41.annotation.basic.bed > Closest_DMR.bed')
# Keep the first four columns plus column 9, which is read below as 'Distance'.
_run_shell('cut -f1-4,9 Closest_DMR.bed > temp_uniq_distance.bed')

df = pd.read_table('temp_uniq_distance.bed', header=None, names=['chrom', 'start', 'end', 'DMR', 'Distance'])

# Shift by one so that a distance of 0 can be drawn on a log axis (0 -> 1).
df['Distance_plus1'] = df['Distance'] + 1
# True for DMRs with a non-zero distance.
mask = df['Distance_plus1'] > 1

# Define the number of bins and the range of the data for the non-zero values
n_bins = 30
data_range = [1, 4000000]

# Generate logarithmically spaced bin edges for the non-zero values
bin_edges = np.logspace(np.log10(data_range[0]), np.log10(data_range[1]), n_bins + 1)

fig, ax = plt.subplots(figsize=(7,5), constrained_layout=True)

ax.hist(df.loc[mask, 'Distance_plus1'], bins=bin_edges, color='black')
ax.set_xscale('log')

# Zero-distance DMRs (all equal to 1 after the shift) are drawn as a separate histogram.
ax.hist(df.loc[~mask, 'Distance_plus1'], bins=3, color='black')
ax.xaxis.set_minor_locator(NullLocator())

def format_func(x, pos):
    """Return the tick label for position `x` on the shifted log distance axis.

    The axis plots distance + 1 in base pairs while the labels read as
    kilobases, so 1 maps to '0', 10 to '0.01', ... and 1000000 to '1,000'.
    Any other tick value gets an empty label. `pos` is accepted because
    FuncFormatter passes it, but it is not used.
    """
    labels_by_tick = {
        1: '0',
        10: '0.01',
        100: '0.1',
        1000: '1',
        10000: '10',
        100000: '100',
        1000000: '1,000',
    }
    return labels_by_tick.get(x, '')

# X ticks: translate the shifted log positions into kilobase labels.
ax.xaxis.set_major_formatter(FuncFormatter(format_func))
# Y ticks: integer counts with thousands separators.
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, p: format(int(x), ',')))
ax.set_xlabel("Distance to the nearest transcription start site (kb)")
ax.set_ylabel("Number of DMR")

plt.savefig("DMR_Distance_to_nearest_TSS.png")

# Remove the intermediate table with the standard library rather than shelling
# out to a hard-coded /usr/bin/rm. Cleanup stays best-effort: a file that is
# already gone is not an error.
try:
    os.remove('temp_uniq_distance.bed')
except FileNotFoundError:
    pass

### K-means Clustering

In [None]:
# Main Figure (Fig.1 Violinplot)
# One headerless, tab-separated table per sample; the first column becomes the index.
# NOTE(review): each file is assumed to hold exactly one value column, since five
# column names are assigned to the concatenated frame below -- confirm.
ak1 = pd.read_table('AK1_DMRsmet.tab', index_col=0, header=None)
ipsc = pd.read_table('iPSC_DMRsmet.tab', index_col=0, header=None)
h1 = pd.read_table('H1_DMRsmet.tab', index_col=0, header=None)
hues64 = pd.read_table('HUES64_DMRsmet.tab', index_col=0, header=None)
npc = pd.read_table('NPC_DMRsmet.tab', index_col=0, header=None)

# Align the samples side by side on the shared index.
df_dmrs = pd.concat([ak1, ipsc, h1, hues64, npc], axis=1)
# Assign the labels first and the axis name second: replacing `columns` creates
# a new Index, which would discard a name that had been set beforehand.
df_dmrs.columns = ['AK1', 'iPSC', 'H1', 'HUES64', 'NPC']
df_dmrs.columns.name = 'Sample'
df_dmrs.index.name = 'Region'

# Keep only regions that have a value in every sample.
df_dmrs = df_dmrs.dropna()

def row_scaling(row):
    """Z-score one row: subtract its mean and divide by its standard deviation.

    Parameters
    ----------
    row : pandas.Series
        Values of a single row (as passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    pandas.Series
        The scaled values on the same index. When the standard deviation is
        zero or undefined (constant row, single value) the centred values are
        returned instead, so a constant row becomes all zeros rather than the
        NaN produced by dividing 0 by 0.
    """
    centred = row - row.mean()
    spread = row.std()
    # `not spread > 0` is true for both 0 and NaN.
    if not spread > 0:
        return centred
    return centred / spread

# Standardise every region (row) across the samples before clustering.
df_dmrs_scaled = df_dmrs.apply(row_scaling, axis=1)

within_cluster_ss = list() # Within-cluster Sum of squares
BIC = list() # Bayesian Information Criterion (BIC)
K = range(2, 21)
for k in K:
    # Fitting K-means model (fresh, identically seeded generator for every k)
    kmeans_test = KMeans(n_clusters=k, random_state=np.random.RandomState(seed=42)).fit(df_dmrs_scaled)
    # Store Within-cluster Sum of squares
    within_cluster_ss.append(kmeans_test.inertia_)
    # Fitting Gaussian Mixture Model, started from the K-means centroids.
    # `means_init` fixes only the initial means; the seed is passed as well so
    # that whatever else the initialisation draws is reproducible between runs.
    gmm_test = GaussianMixture(
        n_components=k,
        means_init=kmeans_test.cluster_centers_,
        random_state=np.random.RandomState(seed=42),
    ).fit(df_dmrs_scaled)
    # Store BIC
    BIC.append(gmm_test.bic(df_dmrs_scaled))
    
def _plot_selection_curve(ax, k_values, scores, tick_label, ylabel):
    """Draw one model-selection curve (score against number of clusters) on `ax`.

    `tick_label` turns a y tick value into its label text. It is installed as a
    FuncFormatter rather than through set_yticklabels(), because fixed label
    lists are only safe with fixed tick positions.
    """
    sns.lineplot(x=k_values, y=scores, color='black', ax=ax)
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda y, pos: tick_label(y)))
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Number of K')


def _wcss_tick(y):
    """Label a within-cluster sum of squares tick in units of 10,000."""
    return f'{int(y/10**4)}'


def _bic_tick(y):
    """Label a BIC tick in units of 1,000,000."""
    return f'{float(y/10**6)}'


WCSS_LABEL = 'Within-Cluster Sum of Squares (x10,000)'
BIC_LABEL = 'Bayesian Information Criterion (x1,000,000)'

# For Inertia
fig, ax = plt.subplots(1, 1, figsize=(4, 5), constrained_layout=True)
_plot_selection_curve(ax, K, within_cluster_ss, _wcss_tick, WCSS_LABEL)

# For BIC
fig, ax = plt.subplots(1, 1, figsize=(4, 5), constrained_layout=True)
_plot_selection_curve(ax, K, BIC, _bic_tick, BIC_LABEL)

# Combined
fig, axes = plt.subplots(1, 2, figsize=(9, 5), constrained_layout=True)
_plot_selection_curve(axes[0], K, within_cluster_ss, _wcss_tick, WCSS_LABEL)
_plot_selection_curve(axes[1], K, BIC, _bic_tick, BIC_LABEL)


#for xticklabel in ax.get_xticklabels():
#    xticklabel.set_rotation(30)

# Final clustering with the chosen number of clusters; labels are stored as an
# extra column and the annotated table is written out for later steps.
K = 6
cluster_column = f'K-means cluster (K={K})'
kmeans_K6 = KMeans(n_clusters=K, random_state=np.random.RandomState(seed=42))
kmeans_K6.fit(df_dmrs_scaled)
df_dmrs[cluster_column] = kmeans_K6.labels_

df_dmrs.to_csv("DMRs_Kmeans_K6.tab", sep='\t')

'''
df_dmrs_melted = pd.melt(df_dmrs, id_vars='K-means cluster (K=12)', var_name='variable', value_name='value')

fig, axes = plt.subplots(1, 1, figsize=(8, 5), constrained_layout=True)
sns.heatmap(df_dmrs_melted.pivot_table(index='K-means cluster (K=12)', columns='variable', values='value'),
            cmap=cmap)

sns.heatmap(df_dmrs.sort_values(by='K-means cluster (K=12)').iloc[:, :5], xticklabels=True, yticklabels=False, vmin=0, vmax=100, cmap=cmap)
plt.tight_layout()
'''

K = 6
K_range = range(0, K)

# Long-format table (Region, Sample, DNA methylation (%)) per cluster label,
# keyed 'K0' ... 'K5'. Only the first five columns (the samples) are melted.
df_dmrs_dic = dict()
for k in K_range:
    in_cluster = df_dmrs[f'K-means cluster (K={K})'] == k
    df_dmrs_dic[f'K{k}'] = (
        df_dmrs[in_cluster]
        .iloc[:, :5]
        .reset_index()
        .melt(id_vars='Region', var_name='Sample', value_name='DNA methylation (%)')
    )

df_dmrs_list = []
df_dmrs_titles = []
for k in K_range:
    cluster_long = df_dmrs_dic[f'K{k}']
    df_dmrs_list.append(cluster_long)
    # Panel title: 1-based cluster name plus the number of distinct regions.
    n_regions = len(np.unique(cluster_long["Region"].values))
    df_dmrs_titles.append(f'C{k+1} (n={n_regions:,})')

# One panel per cluster: point per sample with a standard-deviation error bar.
fig, axes = plt.subplots(3, 2, figsize=(11, 18), constrained_layout=True)
for panel, (cluster_long, panel_title) in zip(axes.flat, zip(df_dmrs_list, df_dmrs_titles)):
    sns.pointplot(data=cluster_long, x='Sample', y='DNA methylation (%)', color='black', errorbar="sd", scale=0.5, ax=panel)
    panel.set_xlabel('')
    panel.set_title(panel_title, fontweight='bold')
    panel.set_ylim(0,100)

plt.savefig('DMR_K-means_clusters_C1-C6.png')


'''
df_dmrs_k0 = df_dmrs[df_dmrs['K-means cluster (K=12)'] == 0]
df_dmrs_k0_filtered = df_dmrs_k0.iloc[:, :5]
df_dmrs_k0_filtered_reset = df_dmrs_k0_filtered.reset_index()
df_dmrs_k0_melted = pd.melt(df_dmrs_k0_filtered_reset, id_vars='Region', var_name='Sample', value_name='DNA methylation (%)')
catplot = sns.catplot(data=df_dmrs_k0_melted, x='Sample', y='DNA methylation (%)', kind='point', estimator='median', errorbar=('ci', 95), color='black', height=7, aspect=1.2)
catplot.ax.set_ylim(0,100)
plt.savefig('test_DMR_K-means_cluster0_mCG.png')
'''

### Gene Ontology Analysis

In [None]:
K = 6
K_range = range(0, K)
# Interpolate K itself (the original interpolated a literal 6), so the file name
# always follows the configured number of clusters.
df_dmrs = pd.read_table(f"DMRs_Kmeans_K{K}.tab", index_col=0)

# Maps 'chrom:start-end' of each line in Closest_DMR.bed to
# '<column 4>/<column 8>' of that line. When the same interval occurs on
# several lines, the last line wins.
# NOTE(review): column 8 is presumably the name field of the closest TSS
# record -- confirm against the TSS BED layout.
closest_tss = dict()

with open("Closest_DMR.bed", 'r') as dfh:
    for i in dfh:
        line = i.strip('\n').split('\t')
        closest_tss[line[0] + ':' + line[1] + '-' + line[2]] = line[3] + '/' + line[7]
        
        
def get_gene_name_from_index(dataframe):
    """Return the distinct gene names for the rows of `dataframe`, in first-seen order.

    Every index label is looked up in the module-level `closest_tss` mapping
    and the fourth '/'-separated field of the stored value is taken as the name.
    NOTE(review): that this field is the gene symbol depends on how the
    `closest_tss` values are composed -- confirm.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose index labels are keys of `closest_tss`.

    Returns
    -------
    list of str
        Names without duplicates, ordered by first appearance.

    Raises
    ------
    KeyError
        If an index label is missing from `closest_tss`.
    IndexError
        If a stored value has fewer than four '/'-separated fields.
    """
    # dict.fromkeys de-duplicates while keeping insertion order and uses hashed
    # lookups, avoiding a linear `in list` scan for every row.
    names = (closest_tss[index].split('/')[3] for index in dataframe.index)
    return list(dict.fromkeys(names))
        
# Gene-set libraries fetched through gseapy.
gobp = gp.get_library(name='GO_Biological_Process_2023', organism='Human')
kegg = gp.get_library(name='KEGG_2021_Human', organism='Human')
# Background: the genes assigned to any DMR in the clustered table.
background_genes = get_gene_name_from_index(df_dmrs)

# Gene list per K-means label. The original loop indexed with
# df_dmrs['<column name>' == k+1], i.e. df_dmrs[False], which raises KeyError;
# a boolean row mask is what is needed. Labels are compared with k (not k+1)
# because the stored labels are 0-based.
cluster_column = f'K-means cluster (K={K})'
observed_genes = dict()
for k in K_range:
    observed_genes[k] = get_gene_name_from_index(df_dmrs[df_dmrs[cluster_column] == k])

# Enrichment for the cluster with label 1.
# NOTE(review): label 1 is the panel titled 'C2' in the cluster figure --
# confirm this is the intended cluster.
k1_genes = observed_genes[1]
enr_go = gp.enrichr(gene_list=k1_genes,
                 gene_sets=gobp,
                 background=background_genes,
                 outdir=None)

enr_kegg = gp.enrichr(gene_list=k1_genes,
                 gene_sets=kegg,
                 background=background_genes,
                 outdir=None)