### Modules

In [1]:
# basic
import os, sys, glob, pybedtools
import numpy as np, pandas as pd
from Bio import SeqIO
import matplotlib.pyplot as plt, seaborn as sns
from matplotlib.cm import ScalarMappable
from matplotlib.ticker import FormatStrFormatter
from scipy.stats import fisher_exact

In [2]:
# Put the local m6ATM preprocessing directory on the module search path, then star-import ReadClass.
# NOTE(review): the star import hides which names are pulled in; helpers used later without an
# explicit import (get_ref_dict, tx_to_gn) presumably come from here or from ModelData — confirm.
sys.path.insert(1, '/home/bo-yi/package/m6atm/m6atm/preprocess')
from ReadClass import *

In [3]:
# Put the local m6ATM training directory on the module search path, then star-import ModelData.
# NOTE(review): names imported here can silently shadow earlier ones; prefer explicit imports once
# the set of helpers actually used by this notebook is confirmed.
sys.path.insert(1, '/home/bo-yi/package/m6atm/m6atm/train')
from ModelData import *

In [4]:
# Output directory for every figure / BED / bedGraph file written by this notebook.
out_dir = '/home/bo-yi/paper'
# Directory holding the preprocessed run; 'results.csv' is read from here.
data_dir = '/home/bo-yi/data/DRS/20211102_HepG2-WT/preprocessed'

# Tab-separated transcript annotation table (loaded into tx_df).
tx_file = '/home/bo-yi/data/ref/enst_hg38Tables.tsv'
# Transcriptome FASTA path. NOTE(review): not referenced by any visible cell.
ref_tx = '/home/bo-yi/data/ref/GRCh38_rna_ensembl.fa'
# Genome FASTA path, passed to get_ref_dict.
ref_gn = '/home/bo-yi/data/ref/hg38.fa'

In [5]:
# Export CUDA_VISIBLE_DEVICES='1' for this process.
# NOTE(review): presumably pins GPU-aware libraries to device 1; no visible cell uses a GPU — confirm it is still needed.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

### Fig 6 supplementary

#### m6A peaks 

In [6]:
# Transcript annotation table (tab-separated). Each entry of the 'name' column is cut at its
# first '.', keeping only the leading part (presumably dropping a version suffix — confirm).
tx_df = pd.read_csv(tx_file, sep = '\t')
tx_df['name'] = tx_df['name'].map(lambda tx_id: tx_id.split('.')[0])

# Reference lookup built from the genome FASTA path; get_ref_dict is not defined in the visible cells.
ref_dict_gn = get_ref_dict(ref_gn)

In [7]:
# Per-site results table; the first CSV column is used as the index.
all_table_tx = pd.read_csv(os.path.join(data_dir, 'results.csv'), index_col = 0)

# Keep only the rows flagged m6a == 'yes', run them through tx_to_gn (helper defined outside the
# visible cells), and collect the resulting 'gn_site' values as a plain list.
m6a_table_tx = all_table_tx.loc[all_table_tx['m6a'] == 'yes']
m6a_table_gn = tx_to_gn(m6a_table_tx, tx_df, ref_dict_gn)
m6a_sites = m6a_table_gn['gn_site'].tolist()

In [8]:
# Run every row of the results table (not just the m6a == 'yes' subset) through tx_to_gn.
# `all_table_tx` was already loaded from results.csv in the previous cell and has not been
# modified since (the m6A subset is a separate filtered frame), so it is reused here instead of
# reading the same file from disk a second time.
all_table_gn = tx_to_gn(all_table_tx, tx_df, ref_dict_gn)

In [None]:
# Candidate gene symbols.
# NOTE(review): the trailing '' looks like an unfinished placeholder, and gene_list is not
# referenced by any visible cell (the cells below hard-code 'PEG10') — complete it or remove it.
gene_list = ['CENPB', 'FOXA1', 'PAQR8', '']

In [None]:
# BED-like table of every PEG10 row: chrom, gn_pos, gn_pos_1 (used as the interval columns),
# gene symbol, probability, strand, plus a boolean 'm6a' flag that is True when the row's
# gn_site is in m6a_sites.
peg10_df = (
    all_table_gn
    .loc[all_table_gn['name2'] == 'PEG10',
         ['chrom', 'gn_site', 'gn_pos', 'gn_pos_1', 'name2', 'probability', 'strand']]
    .assign(m6a = lambda sites: sites['gn_site'].isin(m6a_sites))
    .loc[:, ['chrom', 'gn_pos', 'gn_pos_1', 'name2', 'probability', 'strand', 'm6a']]
)

# Wrap the table as a BedTool so it can be intersected with eCLIP peaks.
peg10_bedtools = pybedtools.BedTool.from_dataframe(peg10_df)

#### eCLIP data

In [None]:
# Column names for the headerless, tab-separated RBP binding-site table.
names = ['chrom', 'chromStart', 'chromEnd', 'id', 'strand', 'name', 'exp', 'sample','accession']
rbp_bed = pd.read_csv('/home/bo-yi/data/ref/rbp38.txt', sep = '\t', header = None, names = names)

# Keep only rows whose sample is 'HepG2' and whose accession is 'ENCODE'.
rbp_bed_hepg2_encode = rbp_bed.loc[(rbp_bed['sample'] == 'HepG2') & (rbp_bed['accession'] == 'ENCODE')]

In [None]:
# For every RBP present in the HepG2/ENCODE table, build a 2x2 table of PEG10 sites
# (m6a flag x overlaps a peak of that RBP) and run a one-sided Fisher's exact test.
rbp_list = sorted(set(rbp_bed_hepg2_encode['name']))

pval_list, or_list, val_list = [], [], []
for rbp in rbp_list:

    # eCLIP peaks of this RBP as a 4-column BED (chrom, start, end, name)
    eclip_bed = rbp_bed_hepg2_encode.loc[rbp_bed_hepg2_encode['name'] == rbp,
                                         ['chrom', 'chromStart', 'chromEnd', 'name']]
    eclip_bedtools = pybedtools.BedTool.from_dataframe(eclip_bed)

    # intersect with wao = True, then keep one row per PEG10 interval (first occurrence wins)
    overlapping = peg10_bedtools.intersect(eclip_bedtools, wao = True).to_dataframe()
    overlapping = overlapping.drop_duplicates(subset = ['chrom', 'start', 'end'])

    # to_dataframe() applies default BED column names, so by position 'thickStart' is the
    # 7th PEG10 column (the m6a flag) and 'blockSizes' is the eCLIP peak name; '.' is
    # counted as "no overlapping peak"
    flagged = overlapping['thickStart'] == True
    unflagged = overlapping['thickStart'] == False
    in_peak = overlapping['blockSizes'] == rbp
    no_peak = overlapping['blockSizes'] == '.'

    # frequency table
    val1 = int((flagged & in_peak).sum())
    val2 = int((flagged & no_peak).sum())
    val3 = int((unflagged & in_peak).sum())
    val4 = int((unflagged & no_peak).sum())

    # fisher test (alternative = 'greater')
    table = np.array([[val1, val2], [val3, val4]])
    odds_ratio, p_value = fisher_exact(table, alternative = 'greater')

    pval_list.append(p_value)
    or_list.append(odds_ratio)
    val_list.append([val1, val2, val3, val4])

In [None]:
# Transpose the per-RBP [val1, val2, val3, val4] lists into four per-column tuples, then
# assemble one row per RBP: name, odds ratio, p-value and the four cell counts.
val_list_zip = list(zip(*val_list))

result_columns = {'rbp': rbp_list, 'odds': or_list, 'p_val': pval_list}
result_columns.update({f'val{k + 1}': val_list_zip[k] for k in range(4)})
result_table = pd.DataFrame(result_columns)

In [None]:
# Show the ten rows with the smallest p-values.
result_table.sort_values('p_val').head(10)

In [None]:
##### main
# Scatter of odds ratio (x) against val1 (y) for every RBP, colored by Fisher p-value.
sns.set_theme(style = 'whitegrid') # theme
tab_color = sns.color_palette() # color palette
fig, ax = plt.subplots(figsize = (12, 4)) # figure size

### plot
# rows containing NaN (e.g. an undefined odds ratio) are excluded from the figure
scatter_table = result_table.dropna()
pval = scatter_table['p_val']
scatter = ax.scatter(scatter_table['odds'], scatter_table['val1'], c = pval, s = 80, cmap = 'viridis_r', alpha = 0.8,
                     linewidth = 1, edgecolors = 'black')

# cmap
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9.
my_cmap = plt.get_cmap('viridis_r')

# A separate mappable (same colormap, same min/max range as the points) drives the colorbar,
# so the bar is drawn opaque rather than inheriting the scatter's alpha.
sm = ScalarMappable(cmap = my_cmap, norm = plt.Normalize(vmin = min(pval), vmax = max(pval)))
sm.set_array([])

# `sm` is not attached to any Axes, so the target Axes must be passed explicitly;
# newer Matplotlib raises instead of guessing.
cbar = fig.colorbar(sm, ax = ax, pad = 0.01, format = FormatStrFormatter('%g'))
cbar.ax.tick_params(labelsize = 12)
cbar.set_label('$\\mathit{P}$ value', rotation = 270, labelpad = 30, fontsize = 20)
cbar.set_ticks(ticks = [0.05, 0.1, 0.2, 0.5])

# axis
ax.set_xlabel('Odds ratio', fontsize = 16)
ax.set_ylabel('Number of overlapping', fontsize = 16)
ax.tick_params(labelsize = 14)

plt.savefig(os.path.join(out_dir, 'fig5g.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
### pie chart
# Two-wedge pie: number of result_table rows containing a NaN vs. number of rows in scatter_table.
sns.set_theme(style = 'white') # theme
tab_color = sns.color_palette('Set2') # color palette
fig, ax = plt.subplots(figsize = (8, 8)) # figure size

y1 = int(result_table.isna().any(axis = 1).sum())
y2 = len(scatter_table)

labels = ['No overlap', '≥1 overlap']
sizes = [y1, y2]


def _wedge_count(pct):
    """Turn the wedge percentage back into a rounded absolute count."""
    return int(np.round(pct / 100 * sum(sizes), 0))


# labeldistance = None hides the per-wedge labels; they appear in the figure legend instead
ax.pie(sizes,
       labels = labels,
       explode = [0, 0.05],
       autopct = _wedge_count,
       shadow = False,
       startangle = 80,
       labeldistance = None,
       colors = [tab_color[0], tab_color[6]],
       textprops = {'fontsize': 26, 'weight': 'bold'},
       wedgeprops = {'linewidth': 0.5})

fig.legend(labels, loc = 'upper right', fontsize = 20)

plt.savefig(os.path.join(out_dir, 'figS6.tif'), dpi = 300, bbox_inches = 'tight')

#### Overlapping sites

In [None]:
# Rebuild the PEG10 BED-like table with 'ratio' (instead of 'probability') as the 5th column,
# keeping the boolean 'm6a' flag (gn_site present in m6a_sites) as the last column.
# This overwrites the earlier peg10_df / peg10_bedtools.
peg10_df = (
    all_table_gn
    .loc[all_table_gn['name2'] == 'PEG10',
         ['chrom', 'gn_site', 'gn_pos', 'gn_pos_1', 'name2', 'ratio', 'strand']]
    .assign(m6a = lambda sites: sites['gn_site'].isin(m6a_sites))
    .loc[:, ['chrom', 'gn_pos', 'gn_pos_1', 'name2', 'ratio', 'strand', 'm6a']]
)

peg10_bedtools = pybedtools.BedTool.from_dataframe(peg10_df)

In [None]:
# Collect, for IGF2BP1 and IGF2BP3 separately, the m6a-flagged PEG10 sites that overlap a peak
# of that RBP, then keep the sites common to both.
table_list = []
for rbp in ['IGF2BP1', 'IGF2BP3']:

    # eCLIP peaks of this RBP as a 4-column BED
    eclip_bed = rbp_bed_hepg2_encode.loc[rbp_bed_hepg2_encode['name'] == rbp,
                                         ['chrom', 'chromStart', 'chromEnd', 'name']]
    eclip_bedtools = pybedtools.BedTool.from_dataframe(eclip_bed)

    # intersect with wao = True, then keep one row per PEG10 interval
    overlapping = peg10_bedtools.intersect(eclip_bedtools, wao = True).to_dataframe()
    overlapping = overlapping.drop_duplicates(subset = ['chrom', 'start', 'end'])

    # rows where the m6a flag ('thickStart') is True and the peak name ('blockSizes') is this RBP
    flagged_in_peak = (overlapping['thickStart'] == True) & (overlapping['blockSizes'] == rbp)
    site_table = overlapping[flagged_in_peak]
    table_list.append(site_table)

# inner join on interval and score: only sites found for both RBPs remain
site_table_i = pd.merge(*table_list, on = ['chrom', 'start', 'end', 'score'])

In [None]:
### to bedGraph
# Four-column table (interval + score) for the sites shared by both RBPs.
bedgraph = site_table_i[['chrom', 'start', 'end', 'score']].rename(
    columns = {'start': 'chromStart', 'end': 'chromEnd'})

# Write the track header line first, then the headerless, index-free tab-separated rows.
with open(os.path.join(out_dir, 'peg10_overlap.bedGraph'), 'w') as f:
    f.write('track type=bedGraph name="ratio" description="m6ATM" color=238,31,137\n')
    bedgraph.to_csv(f, sep = '\t', index = False, header = False)

In [None]:
# Repeat the one-sided Fisher's exact test for IGF2BP1 and IGF2BP3 only, against the current
# peg10_bedtools. This overwrites rbp_list / pval_list / or_list / val_list.
# NOTE(review): no visible cell rebuilds result_table from these lists afterwards.
rbp_list = ['IGF2BP1', 'IGF2BP3']

pval_list, or_list, val_list = [], [], []
for rbp in rbp_list:

    # eCLIP peaks of this RBP as a 4-column BED (chrom, start, end, name)
    eclip_bed = rbp_bed_hepg2_encode.loc[rbp_bed_hepg2_encode['name'] == rbp,
                                         ['chrom', 'chromStart', 'chromEnd', 'name']]
    eclip_bedtools = pybedtools.BedTool.from_dataframe(eclip_bed)

    # intersect with wao = True, then keep one row per PEG10 interval (first occurrence wins)
    overlapping = peg10_bedtools.intersect(eclip_bedtools, wao = True).to_dataframe()
    overlapping = overlapping.drop_duplicates(subset = ['chrom', 'start', 'end'])

    # 'thickStart' carries the m6a flag and 'blockSizes' the eCLIP peak name ('.' is counted
    # as "no overlapping peak")
    flagged = overlapping['thickStart'] == True
    unflagged = overlapping['thickStart'] == False
    in_peak = overlapping['blockSizes'] == rbp
    no_peak = overlapping['blockSizes'] == '.'

    # frequency table
    val1 = int((flagged & in_peak).sum())
    val2 = int((flagged & no_peak).sum())
    val3 = int((unflagged & in_peak).sum())
    val4 = int((unflagged & no_peak).sum())

    # fisher test (alternative = 'greater')
    table = np.array([[val1, val2], [val3, val4]])
    odds_ratio, p_value = fisher_exact(table, alternative = 'greater')

    pval_list.append(p_value)
    or_list.append(odds_ratio)
    val_list.append([val1, val2, val3, val4])

In [None]:
##### main
# Single stacked bar: result_table rows containing a NaN (bottom) and rows without any NaN (top).
sns.set_theme(style = 'white') # theme
tab_color = sns.color_palette('Set2') # color palette
fig, ax = plt.subplots(figsize = (2, 6)) # figure size

### plot
scatter_table = result_table.dropna()
x = ['RBPs']
y1 = int(result_table.isna().any(axis = 1).sum())
y2 = len(scatter_table)

# draw the segments bottom-up, each one starting where the previous one ended
segments = [(y1, 'No overlapping', tab_color[0]),
            (y2, '≥1 DRACH site \noverlapping', tab_color[6])]
stack_base = 0
for seg_height, seg_label, seg_color in segments:
    ax.bar(x, seg_height, bottom = stack_base, label = seg_label, align = 'center', width = 0.6, color = seg_color)
    stack_base += seg_height

ax.legend(bbox_to_anchor = (1, 1), fontsize = 14)

# text
# print each segment's height at the segment's center
for bar in ax.patches:
    center_x = bar.get_x() + bar.get_width()/2
    center_y = bar.get_y() + bar.get_height()/2
    ax.text(center_x, center_y, bar.get_height(),
            ha = 'center', color = 'black', weight = 'bold', size = 20)

# axis
ax.set_xlim(-0.5, 0.5)
ax.set_ylabel('Count', fontsize = 20)
ax.tick_params(labelsize = 16)

plt.savefig(os.path.join(out_dir, 'fig5g_supp.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# Select HepG2 rows whose RBP name is in `m6abp`.
# NOTE(review): neither `bed_rbp` nor `m6abp` is defined in any visible cell, so this raises
# NameError on a fresh kernel. `bed_rbp` is presumably the `rbp_bed` frame loaded earlier (same
# column names); the contents of `m6abp` cannot be recovered from here — define it or drop this cell.
bed_rbp_hepg2 = bed_rbp[(bed_rbp['sample'].isin(['HepG2'])) & (bed_rbp['name'].isin(m6abp))]
# bed_rbp_hepg2 = bed_rbp_hepg2.loc[:,['chrom', 'chromStart', 'chromEnd', 'name']]
# bed_rbp_hepg2.to_csv(os.path.join(out_dir, 'rbp_class1_hepg2.bed'), sep = '\t', index = None, header = None)

In [None]:
# Export the HepG2 peaks of IGF2BP1 as a headerless 4-column BED file (chrom, start, end, name).
# The peak table is loaded as `rbp_bed`; the original `bed_rbp` is never defined in this notebook
# and raised NameError. As before, only the sample is filtered here (no accession filter).
bed_rbp_igf2bp1 = rbp_bed[(rbp_bed['sample'].isin(['HepG2'])) & (rbp_bed['name'].isin(['IGF2BP1']))]
bed_rbp_igf2bp1 = bed_rbp_igf2bp1.loc[:,['chrom', 'chromStart', 'chromEnd', 'name']]
bed_rbp_igf2bp1.to_csv(os.path.join(out_dir, 'eclip_igf2bp1_hepg2.bed'), sep = '\t', index = False, header = False)

In [None]:
# Export the HepG2 peaks of IGF2BP3 as a headerless 4-column BED file (chrom, start, end, name).
# The peak table is loaded as `rbp_bed`; the original `bed_rbp` is never defined in this notebook
# and raised NameError. As before, only the sample is filtered here (no accession filter).
bed_rbp_igf2bp3 = rbp_bed[(rbp_bed['sample'].isin(['HepG2'])) & (rbp_bed['name'].isin(['IGF2BP3']))]
bed_rbp_igf2bp3 = bed_rbp_igf2bp3.loc[:,['chrom', 'chromStart', 'chromEnd', 'name']]
bed_rbp_igf2bp3.to_csv(os.path.join(out_dir, 'eclip_igf2bp3_hepg2.bed'), sep = '\t', index = False, header = False)

In [None]:
# Export the HepG2 peaks of HNRNPC as a headerless 4-column BED file (chrom, start, end, name).
# The peak table is loaded as `rbp_bed`; the original `bed_rbp` is never defined in this notebook
# and raised NameError. As before, only the sample is filtered here (no accession filter).
bed_rbp_hnrnpc = rbp_bed[(rbp_bed['sample'].isin(['HepG2'])) & (rbp_bed['name'].isin(['HNRNPC']))]
bed_rbp_hnrnpc = bed_rbp_hnrnpc.loc[:,['chrom', 'chromStart', 'chromEnd', 'name']]
bed_rbp_hnrnpc.to_csv(os.path.join(out_dir, 'eclip_hnrnpc_hepg2.bed'), sep = '\t', index = False, header = False)