In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA

# Earlier colour scheme, kept for reference:
#cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ["#104e8b", "#ffdab9", "#8b0a50"])
# Unnamed three-stop gradient: #FFFF00 (yellow) -> #000000 (black) -> #0066CC (blue).
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ["#FFFF00", "#000000", "#0066CC"])

# Global seaborn theme: Arial font, fonts scaled by 1.15, 'ticks' style.
sns.set(font="Arial", font_scale=1.15, style='ticks')
# Hide the top and right axes spines by default. Presumably placed after
# sns.set() so that the seaborn theme does not override it -- keep this order.
plt.rc("axes.spines", top=False, right=False)
# IPython magics (not plain Python): `%matplotlib` without an argument selects
# the default GUI backend.
# NOTE(review): `%autoindent` is a toggle from terminal IPython; it may not be
# available in a notebook kernel and is not idempotent on re-run -- confirm it
# is still wanted.
%matplotlib
%autoindent

### HOMER Known motifs (AK1-iPSC DMR)

In [1]:
# HOMER known-motif result tables consumed by this cell:
#   knownResults_HyperDMRs_DSS_woPMD_AK1-iPSC_MetTh40.txt
#   knownResults_HypoDMRs_DSS_woPMD_AK1-iPSC_MetTh40.txt
enrich_cutoff = 1.3          # keep motifs with pctTarget / pctBackground above this
pctTarget_cutoff = 10        # keep motifs found in at least this % of target sequences
pctBackground_cutoff = 20    # keep motifs found in at most this % of background sequences
qval_cutoff = 0.01           # keep motifs with a q-value strictly below this


def load_homer_known_motifs(path, dmr_label, enrich_cutoff, pctTarget_cutoff,
                            pctBackground_cutoff, qval_cutoff):
    """Read one HOMER knownResults table and return its tidied, filtered rows.

    Defined inside this cell so that the cell can be run on its own.

    Parameters
    ----------
    path : str
        Tab-separated HOMER known-motif results file.
    dmr_label : str
        Value stored in the ``DMR`` column of every returned row.
    enrich_cutoff : float
        Rows are kept when ``pctTarget / pctBackground`` is strictly greater.
    pctTarget_cutoff : float
        Rows are kept when ``pctTarget`` is greater than or equal to this.
    pctBackground_cutoff : float
        Rows are kept when ``pctBackground`` is less than or equal to this.
    qval_cutoff : float
        Rows are kept when ``qval`` is strictly smaller.

    Returns
    -------
    pandas.DataFrame
        Columns ``TF`` and ``BindingDomain`` first, then the remaining renamed
        input columns (without ``logpval``), ``logpval_calc`` and ``DMR``.

    Raises
    ------
    KeyError
        If no motif name in the table contains ``'('`` (there is then no
        binding-domain part to extract).
    """
    raw = pd.read_table(path)
    # The columns at positions 1, 5 and 7 are not used downstream; they are
    # dropped by position.
    motifs = raw.drop(raw.columns[[1, 5, 7]], axis=1).rename(columns={
        'Motif Name': 'Motif',
        'P-value': 'pval',
        'Log P-value': 'logpval',
        'q-value (Benjamini)': 'qval',
        '% of Target Sequences with Motif': 'pctTarget',
        '% of Background Sequences with Motif': 'pctBackground',
    })

    # 'Motif' is split on '(': the text before the first '(' is the TF name,
    # the text between it and the following ')' is the binding domain.
    name_parts = motifs['Motif'].str.split('(', expand=True)
    binding_domain = name_parts[1].str.split(')', expand=True)[0]
    motifs.insert(0, 'TF', name_parts[0])
    motifs.insert(1, 'BindingDomain', binding_domain)

    # Percentages arrive as strings with a trailing '%'.
    for pct_col in ('pctTarget', 'pctBackground'):
        motifs[pct_col] = motifs[pct_col].str.rstrip('%').astype('float')

    # -log10(p); p-values equal to 0 give +inf, which is replaced by
    # (largest finite value + 1) so those motifs can still be coloured.
    with np.errstate(divide='ignore'):
        neg_log10_p = -np.log10(motifs['pval'])
    finite_max = neg_log10_p[neg_log10_p != np.inf].max()
    motifs['logpval_calc'] = neg_log10_p.replace(np.inf, finite_max + 1)
    motifs = motifs.drop('logpval', axis=1)

    keep = (
        ((motifs['pctTarget'] / motifs['pctBackground']) > enrich_cutoff)
        & (motifs['pctTarget'] >= pctTarget_cutoff)
        & (motifs['pctBackground'] <= pctBackground_cutoff)
        & (motifs['qval'] < qval_cutoff)
    )
    # .assign returns a new frame, so no column is set on a filtered slice.
    return motifs[keep].assign(DMR=dmr_label)


# AK1-iPSC iPSC Low
hyper_ak1_ipsc = load_homer_known_motifs(
    "knownResults_HyperDMRs_DSS_woPMD_AK1-iPSC_MetTh40.txt", 'iPSC<AK1',
    enrich_cutoff, pctTarget_cutoff, pctBackground_cutoff, qval_cutoff)

# AK1-iPSC iPSC High
hypo_ak1_ipsc = load_homer_known_motifs(
    "knownResults_HypoDMRs_DSS_woPMD_AK1-iPSC_MetTh40.txt", 'iPSC>AK1',
    enrich_cutoff, pctTarget_cutoff, pctBackground_cutoff, qval_cutoff)

homer_ak1_ipsc = pd.concat([hyper_ak1_ipsc, hypo_ak1_ipsc], axis=0)
homer_ak1_ipsc['TF'] = homer_ak1_ipsc['TF'].str.upper()
# Row order decides the order of the TF names along the y axis.
homer_ak1_ipsc = homer_ak1_ipsc.sort_values(by='BindingDomain', ascending=False)

# Colour scale for -log10(P value), shared by the scatter plot and the colorbar.
vmin = homer_ak1_ipsc['logpval_calc'].min()
vmax = homer_ak1_ipsc['logpval_calc'].max()
colormap = cm.copper_r
norm = plt.Normalize(vmin, vmax)

# Marker-size scale: pctTarget is mapped linearly from its observed range onto
# these marker areas (points^2).
marker_areas = (100, 250)
pct_range = (homer_ak1_ipsc['pctTarget'].min(), homer_ak1_ipsc['pctTarget'].max())

# Each DMR class gets a fixed x position, so the tick labels cannot be swapped
# by the row order after sorting (a string x axis is ordered by first appearance).
dmr_axis = [('iPSC<AK1', 'AK1 > iPSC'), ('iPSC>AK1', 'AK1 < iPSC')]
dmr_position = {dmr: pos for pos, (dmr, _) in enumerate(dmr_axis)}
plot_data = homer_ak1_ipsc.assign(DMR_pos=homer_ak1_ipsc['DMR'].map(dmr_position))

# Main Figure
fig, ax = plt.subplots(figsize=(6,6), constrained_layout=True)
sns.scatterplot(data=plot_data, x='DMR_pos', y='TF',
                size='pctTarget', sizes=marker_areas, size_norm=pct_range,
                hue='logpval_calc', palette='copper_r', hue_norm=norm,
                legend=False, ax=ax)
ax.set_xticks([pos for pos in range(len(dmr_axis))])
ax.set_xticklabels([tick for _, tick in dmr_axis], rotation=15)
ax.set_xlabel('')
ax.set_xlim([-1,2])
ax.set_ylabel('')

# Size legend: the handles use the same pctTarget -> area mapping as the points.
# Line2D's markersize is a diameter in points while scatter sizes are areas in
# points^2, hence the square root. np.interp clamps values outside the data range.
legend_labels  = ['10', '15', '20', '25']
legend_areas = np.interp([float(lbl) for lbl in legend_labels], pct_range, marker_areas)
legend_handles = [plt.Line2D([0], [0], marker='o', color='white', markerfacecolor='black', markersize=np.sqrt(area)) for area in legend_areas]
ax.legend(handles=legend_handles, labels=legend_labels, title='% with Motif', frameon=False, fancybox=False, edgecolor='black', bbox_to_anchor=(1.05, 0), loc='lower left')
fig.savefig("HOMER_Motif_Enrichment_AK1-iPSC.pdf")

# Colorbar for -log10(P value), drawn as a separate small figure.
fig, ax = plt.subplots(figsize=(3, 2), constrained_layout=True)
colorbar = cm.ScalarMappable(cmap=colormap, norm=norm)
colorbar.set_array([])

fig.colorbar(colorbar, ax=ax, orientation='horizontal', label='-log10(P value)')
ax.axis('off')  # only the colorbar should be visible, not the host axes
fig.savefig("HOMER_Motif_Enrichment_AK1-iPSC_log10pvaluecolorbar.pdf")



NameError: name 'pd' is not defined

### HOMER Known motifs (iPSC-NPC DMR)

In [None]:
# HOMER known-motif result tables consumed by this cell:
#   knownResults_HyperDMRs_DSS_iPSC-NPC_MetTh40.txt
#   knownResults_HypoDMRs_DSS_iPSC-NPC_MetTh40.txt
enrich_cutoff = 1.3          # keep motifs with pctTarget / pctBackground above this
pctTarget_cutoff = 10        # keep motifs found in at least this % of target sequences
pctBackground_cutoff = 20    # keep motifs found in at most this % of background sequences
qval_cutoff = 0.01           # keep motifs with a q-value strictly below this


def load_homer_known_motifs(path, dmr_label, enrich_cutoff, pctTarget_cutoff,
                            pctBackground_cutoff, qval_cutoff):
    """Read one HOMER knownResults table and return its tidied, filtered rows.

    Defined inside this cell so that the cell can be run on its own.

    Parameters
    ----------
    path : str
        Tab-separated HOMER known-motif results file.
    dmr_label : str
        Value stored in the ``DMR`` column of every returned row.
    enrich_cutoff : float
        Rows are kept when ``pctTarget / pctBackground`` is strictly greater.
    pctTarget_cutoff : float
        Rows are kept when ``pctTarget`` is greater than or equal to this.
    pctBackground_cutoff : float
        Rows are kept when ``pctBackground`` is less than or equal to this.
    qval_cutoff : float
        Rows are kept when ``qval`` is strictly smaller.

    Returns
    -------
    pandas.DataFrame
        Columns ``TF`` and ``BindingDomain`` first, then the remaining renamed
        input columns (without ``logpval``), ``logpval_calc`` and ``DMR``.

    Raises
    ------
    KeyError
        If no motif name in the table contains ``'('`` (there is then no
        binding-domain part to extract).
    """
    raw = pd.read_table(path)
    # The columns at positions 1, 5 and 7 are not used downstream; they are
    # dropped by position.
    motifs = raw.drop(raw.columns[[1, 5, 7]], axis=1).rename(columns={
        'Motif Name': 'Motif',
        'P-value': 'pval',
        'Log P-value': 'logpval',
        'q-value (Benjamini)': 'qval',
        '% of Target Sequences with Motif': 'pctTarget',
        '% of Background Sequences with Motif': 'pctBackground',
    })

    # 'Motif' is split on '(': the text before the first '(' is the TF name,
    # the text between it and the following ')' is the binding domain.
    name_parts = motifs['Motif'].str.split('(', expand=True)
    binding_domain = name_parts[1].str.split(')', expand=True)[0]
    motifs.insert(0, 'TF', name_parts[0])
    motifs.insert(1, 'BindingDomain', binding_domain)

    # Percentages arrive as strings with a trailing '%'.
    for pct_col in ('pctTarget', 'pctBackground'):
        motifs[pct_col] = motifs[pct_col].str.rstrip('%').astype('float')

    # -log10(p); p-values equal to 0 give +inf, which is replaced by
    # (largest finite value + 1) so those motifs can still be coloured.
    with np.errstate(divide='ignore'):
        neg_log10_p = -np.log10(motifs['pval'])
    finite_max = neg_log10_p[neg_log10_p != np.inf].max()
    motifs['logpval_calc'] = neg_log10_p.replace(np.inf, finite_max + 1)
    motifs = motifs.drop('logpval', axis=1)

    keep = (
        ((motifs['pctTarget'] / motifs['pctBackground']) > enrich_cutoff)
        & (motifs['pctTarget'] >= pctTarget_cutoff)
        & (motifs['pctBackground'] <= pctBackground_cutoff)
        & (motifs['qval'] < qval_cutoff)
    )
    # .assign returns a new frame, so no column is set on a filtered slice.
    return motifs[keep].assign(DMR=dmr_label)


# iPSC-NPC iPSC High
hyper_ipsc_npc = load_homer_known_motifs(
    "knownResults_HyperDMRs_DSS_iPSC-NPC_MetTh40.txt", 'iPSC>NPC',
    enrich_cutoff, pctTarget_cutoff, pctBackground_cutoff, qval_cutoff)

# iPSC-NPC iPSC Low
# The label used to be 'iPSC>AK1', a leftover from the AK1-iPSC comparison;
# these rows belong to the iPSC < NPC class of this comparison.
hypo_ipsc_npc = load_homer_known_motifs(
    "knownResults_HypoDMRs_DSS_iPSC-NPC_MetTh40.txt", 'iPSC<NPC',
    enrich_cutoff, pctTarget_cutoff, pctBackground_cutoff, qval_cutoff)

homer_ipsc_npc = pd.concat([hyper_ipsc_npc, hypo_ipsc_npc], axis=0)
homer_ipsc_npc['TF'] = homer_ipsc_npc['TF'].str.upper()
# Row order decides the order of the TF names along the y axis.
homer_ipsc_npc = homer_ipsc_npc.sort_values(by='BindingDomain', ascending=False)

# Colour scale for -log10(P value), shared by the scatter plot and the colorbar.
vmin = homer_ipsc_npc['logpval_calc'].min()
vmax = homer_ipsc_npc['logpval_calc'].max()
colormap = cm.copper_r
norm = plt.Normalize(vmin, vmax)

# Marker-size scale: pctTarget is mapped linearly from its observed range onto
# these marker areas (points^2).
marker_areas = (100, 600)
pct_range = (homer_ipsc_npc['pctTarget'].min(), homer_ipsc_npc['pctTarget'].max())

# Each DMR class gets a fixed x position, so the tick labels cannot be swapped
# by the row order after sorting (a string x axis is ordered by first appearance).
dmr_axis = [('iPSC>NPC', 'iPSC > NPC'), ('iPSC<NPC', 'iPSC < NPC')]
dmr_position = {dmr: pos for pos, (dmr, _) in enumerate(dmr_axis)}
plot_data = homer_ipsc_npc.assign(DMR_pos=homer_ipsc_npc['DMR'].map(dmr_position))

# Main Figure
fig, ax = plt.subplots(figsize=(6.5,12), constrained_layout=True)
sns.scatterplot(data=plot_data, x='DMR_pos', y='TF',
                size='pctTarget', sizes=marker_areas, size_norm=pct_range,
                hue='logpval_calc', palette='copper_r', hue_norm=norm,
                legend=False, ax=ax)
ax.set_xticks([pos for pos in range(len(dmr_axis))])
ax.set_xticklabels([tick for _, tick in dmr_axis], rotation=15)
ax.set_xlabel('')
ax.set_xlim([-1,2])
ax.set_ylabel('')

# Size legend: the handles use the same pctTarget -> area mapping as the points.
# Line2D's markersize is a diameter in points while scatter sizes are areas in
# points^2, hence the square root. np.interp clamps values outside the data range.
legend_labels  = ['15', '25', '35', '45']
legend_areas = np.interp([float(lbl) for lbl in legend_labels], pct_range, marker_areas)
legend_handles = [plt.Line2D([0], [0], marker='o', color='white', markerfacecolor='black', markersize=np.sqrt(area)) for area in legend_areas]
ax.legend(handles=legend_handles, labels=legend_labels, title='% with Motif', frameon=False, fancybox=False, edgecolor='black', bbox_to_anchor=(1.05, 0), loc='lower left')
fig.savefig("HOMER_Motif_Enrichment_iPSC-NPC.pdf")

# Colorbar for -log10(P value), drawn as a separate small figure.
fig, ax = plt.subplots(figsize=(3, 2), constrained_layout=True)
colorbar = cm.ScalarMappable(cmap=colormap, norm=norm)
colorbar.set_array([])

fig.colorbar(colorbar, ax=ax, orientation='horizontal', label='-log10(P value)')
ax.axis('off')  # only the colorbar should be visible, not the host axes
fig.savefig("HOMER_Motif_Enrichment_iPSC-NPC_log10pvaluecolorbar.pdf")

