In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
from scipy.stats import pearsonr, spearmanr
# Global plot styling. The seaborn theme is applied first and the matplotlib
# rcParams overrides afterwards, in the same order as before.
sns.set_theme(style='ticks', font="arial", font_scale=1.15)
plt.rcParams.update({
    'figure.figsize': (6, 6),
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# One fixed colour per sample; the key order (AK1, iPSC, NPC) is relied on by the plots.
sample_palette = dict(AK1='#FF0000', iPSC='#154360', NPC='#229954')


## GRCh38

### 10kb windows

In [None]:
# Per-sample heterogeneity tables for the 10kb windows; the first column of
# each file becomes the index.
ak1_10kb_hetero, ipsc_10kb_hetero, npc_10kb_hetero = (
    pd.read_table(path, index_col=0)
    for path in (
        "AK1_10kb_CG_Heterogeneity.tab",
        "iPSC_10kb_CG_Heterogeneity.tab",
        "NPC_10kb_CG_Heterogeneity.tab",
    )
)

# CG_Used expressed as a percentage of Ref_CG, row by row.
for hetero_table in (ak1_10kb_hetero, ipsc_10kb_hetero, npc_10kb_hetero):
    hetero_table['CG_Used_Prop'] = hetero_table.CG_Used * 100 / hetero_table.Ref_CG

def _regplot_by_sample(frames, x, y, xlabel, ylabel):
    """Draw one regression scatter panel per sample, side by side in one row.

    Parameters
    ----------
    frames : dict
        Maps a sample name (must be a key of ``sample_palette``) to the
        DataFrame plotted in that sample's panel.
    x, y : str
        Column names used for the x and y axes.
    xlabel, ylabel : str
        Axis labels applied to every panel.

    Returns
    -------
    tuple
        ``(fig, axes)`` where ``axes`` is a 1-D array with one Axes per sample.
    """
    fig, axes = plt.subplots(1, len(frames), figsize=(6 * len(frames), 6),
                             constrained_layout=True, squeeze=False)
    axes = axes[0]  # squeeze=False always yields a 2-D array; keep the single row
    for ax, (sample, frame) in zip(axes, frames.items()):
        colour = sample_palette[sample]
        sns.regplot(data=frame, x=x, y=y,
                    scatter_kws={"fc": colour, "ec": colour, "s": 15, "alpha": 0.2},
                    line_kws={"lw": 1, "color": "black", "alpha": 0.5}, ax=ax)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(sample)
    return fig, axes


# Same sample order as sample_palette so panel titles and colours line up.
hetero_10kb_by_sample = {
    'AK1': ak1_10kb_hetero,
    'iPSC': ipsc_10kb_hetero,
    'NPC': npc_10kb_hetero,
}
spanned_reads_label = "The number of fully spanned reads in the 10kb window"
zscore_label = "Z-score normalized mCG Heterogeneity"

# Check if there are outliers. This figure plots the raw Mean_euc column, so
# the x axis is labelled as un-normalized heterogeneity.
fig, axes = _regplot_by_sample(hetero_10kb_by_sample, x="Mean_euc", y="Spanned_Reads",
                               xlabel="mCG Heterogeneity (Mean_euc)",
                               ylabel=spanned_reads_label)

# Z-score of Mean_euc, computed separately within each sample.
for hetero_table in hetero_10kb_by_sample.values():
    hetero_table['Mean_euc_zscore'] = zscore(hetero_table['Mean_euc'])

# Check if Mean_euc is not a good representation of heterogeneity:
# z-scored heterogeneity against the number of fully spanned reads.
fig, axes = _regplot_by_sample(hetero_10kb_by_sample, x="Mean_euc_zscore", y="Spanned_Reads",
                               xlabel=zscore_label, ylabel=spanned_reads_label)

# Check if Mean_euc is not a good representation of heterogeneity => it was not
# => the more CGs in a window, the higher the heterogeneity, which is why Euc
# is low in PMDs.
fig, axes = _regplot_by_sample(hetero_10kb_by_sample, x="Mean_euc_zscore", y="CG_Used",
                               xlabel=zscore_label,
                               ylabel="The number of CG in the window")



In [None]:
# Distribution of the percentage of reference CpGs actually used, per sample.
cg_used_prop = pd.concat([ak1_10kb_hetero.CG_Used_Prop, ipsc_10kb_hetero.CG_Used_Prop, npc_10kb_hetero.CG_Used_Prop], axis=1)
cg_used_prop.columns = ['AK1', 'iPSC', 'NPC']

g = sns.displot(cg_used_prop, fill=True, alpha=0.1, kind='kde', palette=sample_palette)
g.set_xlabels("Percentage of CpG used relative to the Reference CpG in each window")

# Keep only windows with at least this many fully spanned reads.
MIN_SPANNED_READS = 5

# .copy() makes each filtered table independent of its parent, so adding a
# column below does not trigger pandas' SettingWithCopyWarning.
ak1_10kb_hetero_5 = ak1_10kb_hetero[ak1_10kb_hetero.Spanned_Reads >= MIN_SPANNED_READS].copy()
ipsc_10kb_hetero_5 = ipsc_10kb_hetero[ipsc_10kb_hetero.Spanned_Reads >= MIN_SPANNED_READS].copy()
npc_10kb_hetero_5 = npc_10kb_hetero[npc_10kb_hetero.Spanned_Reads >= MIN_SPANNED_READS].copy()

# Re-compute the z-score within the filtered windows only.
ak1_10kb_hetero_5['Mean_euc_zscore'] = zscore(ak1_10kb_hetero_5['Mean_euc'])
ipsc_10kb_hetero_5['Mean_euc_zscore'] = zscore(ipsc_10kb_hetero_5['Mean_euc'])
npc_10kb_hetero_5['Mean_euc_zscore'] = zscore(npc_10kb_hetero_5['Mean_euc'])

# Mean_euc of all windows, one column per sample (rows aligned on the index).
total_10kb_hetero = pd.concat([ak1_10kb_hetero.loc[:, ['Mean_euc']], ipsc_10kb_hetero.loc[:, ['Mean_euc']], npc_10kb_hetero.loc[:, ['Mean_euc']]], axis=1)
total_10kb_hetero.columns = ['AK1', 'iPSC', 'NPC']

# Same layout for the read-depth-filtered windows. The column rename now
# targets the filtered table; it previously re-renamed total_10kb_hetero and
# left this table with three columns all called 'Mean_euc'.
total_10kb_hetero_5 = pd.concat([ak1_10kb_hetero_5.loc[:, ['Mean_euc']], ipsc_10kb_hetero_5.loc[:, ['Mean_euc']], npc_10kb_hetero_5.loc[:, ['Mean_euc']]], axis=1)
total_10kb_hetero_5.columns = ['AK1', 'iPSC', 'NPC']



### 10kb window DNA methylation vs. 10kb window DNA methylation heterogeneity

In [None]:
# NOTE(review): absolute, machine-specific path; consider moving it to a
# configurable data directory.
metdir="/mnt/data/Projects/phenomata/01.Projects/13.AK1_PacBio/01.DNA/Analysis_2023/10kb_methylation/"


def _read_10kb_methylation(filename):
    """Read a header-less table from ``metdir``; the first column becomes the
    index and the remaining column is named ``mCG_10kb``."""
    table = pd.read_table(f'{metdir}{filename}', index_col=0, header=None)
    table.columns = ['mCG_10kb']
    return table


ak1_10kb_met = _read_10kb_methylation('AK1_10kbmet.tab')
ipsc_10kb_met = _read_10kb_methylation('iPSC_10kbmet.tab')
npc_10kb_met = _read_10kb_methylation('NPC_10kbmet.tab')

# Align AK1 z-scored heterogeneity with AK1 methylation on the shared index,
# then correlate over the rows where both values are present.
ak1_10kb_hetero_mCG = pd.concat([ak1_10kb_hetero['Mean_euc_zscore'], ak1_10kb_met['mCG_10kb']], axis=1)
ak1_complete_rows = ak1_10kb_hetero_mCG.dropna(how='any')
pearsonr(ak1_complete_rows.Mean_euc_zscore, ak1_complete_rows.mCG_10kb)


# Circular Binary Segmentation

## GRCh38

Segmentation is performed with the `linear_segment` package: https://github.com/kylessmith/linear_segment/tree/main

In [None]:
#source activate linear_segment
from linear_segment import segment
import numpy as np
import pandas as pd
'''
np.random.seed(10)
x = np.random.random(300000)
x[10000:20000] = x[10000:20000] + 0.1
x[25000:27000] = x[25000:27000] - 1
labels = np.repeat('a', len(x))

segments = segment(x, labels, method="cbs")
'''

# Reload the AK1 heterogeneity table (first column is the index).
ak1_10kb_hetero = pd.read_table("AK1_10kb_CG_Heterogeneity.tab", index_col=0)

# Trial run on a single chromosome: segment the Mean_euc signal of chr20.
test_hetero = ak1_10kb_hetero[ak1_10kb_hetero.chrom == 'chr20']
test = np.array(test_hetero.Mean_euc)
labels = np.repeat('a', len(test))
segments = segment(test, labels, method="cbs")

# Write one BED-like line per segment:
# chrom, start of first window, end of last window, mean/median of Mean_euc.
with open("test_chr20_AK1_HetEuc_Changepoint.bed", 'w') as rfh:
    for i in range(len(segments)):
        # Assumes str(segments[i]) looks like "Interval(<start>-<end>, ...)",
        # as this parsing implies - TODO confirm against linear_segment.
        # lstrip() strips a character set rather than a prefix; that is safe
        # here only because the bounds begin with a digit.
        seg = str(segments[i]).lstrip('Interval(').split(',')[0].split('-')
        # Fixed: these two lines used to index `segment` (the imported
        # function) instead of `seg`, raising TypeError on the first segment.
        seg_start = int(seg[0])
        seg_end = int(seg[1]) - 1  # segment end is exclusive; make it an inclusive row position
        chrom = test_hetero.iloc[seg_start]['chrom']
        start = test_hetero.iloc[seg_start]['start']
        end = test_hetero.iloc[seg_end]['end']
        seg_mean_euc = test_hetero.iloc[seg_start: seg_end + 1]['Mean_euc']
        MeanOfMeanEuc = seg_mean_euc.mean()
        MedianOfMeanEuc = seg_mean_euc.median()
        # No per-row flush: leaving the `with` block flushes and closes the file.
        rfh.write(f'{chrom}\t{start}\t{end}\t{round(MeanOfMeanEuc, 5)}/{round(MedianOfMeanEuc, 5)}\n')
        


In [None]:
# Split Chromosome
# Chromosome names: chr1 through chr22 followed by chrX.
chroms = [f'chr{number}' for number in range(1, 23)] + ['chrX']