### Modules

In [None]:
# basic
import os, sys, glob, pybedtools
import numpy as np, pandas as pd
from Bio import SeqIO
import matplotlib.pyplot as plt, seaborn as sns
from matplotlib.cm import ScalarMappable
from matplotlib.ticker import FormatStrFormatter
from scipy.stats import fisher_exact

### Figure 5A

In [None]:
### 'hepg2tx_m6atm_biotype.csv' contains transcript-level m6A predictions labeled by biotype
### reference transcript file: 'GRCh38_rna_ensembl.fa'

In [None]:
# Count entries per biotype (non-null 'transcript' values), largest group first.
pred_type = pd.read_csv('../data/hepg2tx_m6atm_biotype.csv')
site_counts = (
    pred_type
    .groupby('type')['transcript']
    .count()
    .reset_index(name = 'site')
    .sort_values('site', ascending = False)
)

# Keep the four largest biotypes as they are and pool everything else into one 'others' row.
top_types, minor_types = site_counts.iloc[:4], site_counts.iloc[4:]
others = pd.DataFrame({'type': ['others'], 'site': [minor_types['site'].sum()]})
type_table = pd.concat([top_types, others])
values = type_table['site'].tolist()

In [None]:
### pie chart
sns.set_theme() # theme
fig, ax = plt.subplots(figsize = (8, 8)) # figure size

# Take labels and sizes from the same table so they cannot drift apart. The global
# `values` is reassigned by the Figure 5B cells, so relying on it here would plot the
# wrong numbers (or fail on a length mismatch) when cells are run out of order.
labels = type_table['type'].tolist()
sizes = type_table['site'].tolist()

# Offset every wedge except the first and the last; for five wedges this is
# [0, 0.1, 0.1, 0.1, 0]. Deriving it from len(sizes) keeps ax.pie from raising when
# the table has fewer than five rows.
explode = [0.1 if 0 < i < len(sizes) - 1 else 0 for i in range(len(sizes))]

# labeldistance = None hides the wedge labels; they are shown in the legend instead.
ax.pie(sizes, labels = labels, autopct = '%1.1f%%', shadow = False, startangle = 30, explode = explode, pctdistance = 1.2, labeldistance = None,
       colors = sns.color_palette('Set2')[0:5], wedgeprops = {'linewidth': 0.5})

fig.legend(labels, loc = 'upper right');

### Figure 5B

In [None]:
### 'hepg2_m6atm_region.csv' contains m6A predictions labeled by region
### Ensembl-annotated reference region files (3'UTR, exons, 5'UTR) were downloaded from the UCSC Table Browser

In [None]:
# Tally rows per annotated region, most frequent region first.
pred_region = pd.read_csv('../data/hepg2_m6atm_region.csv')
region_counts = pred_region.groupby('region')['region'].count()
region_table = (
    region_counts
    .reset_index(name = 'count')
    .sort_values('count', ascending = False)
)
values = region_table['count'].tolist()

In [None]:
### pie chart
sns.set_theme() # theme
fig, ax = plt.subplots(figsize = (8, 8)) # figure size

# Read labels and sizes straight from region_table instead of going through the global
# `values`, which the Figure 5A cells also assign; this keeps the two in step no matter
# which cell ran last.
labels = region_table['region'].tolist()
sizes = region_table['count'].tolist()

# One colour per wedge, so no colour repeats if more than three regions are present.
palette = sns.color_palette()[:len(sizes)]

# labeldistance = None hides the wedge labels; they are shown in the legend instead.
ax.pie(sizes, labels = labels, autopct = '%1.1f%%', shadow = False, startangle = 30, pctdistance = 1.2, labeldistance = None,
       colors = palette, wedgeprops = {'linewidth': 0.5})

fig.legend(labels, loc = 'upper right');

### Figure 5E

In [None]:
# Load the predictions; keep rows whose 'm6a' value contains 'yes' and whose coverage is at least 100.
pred = pd.read_csv('../data/hepg2_m6atm.csv')
is_m6a = pred['m6a'].str.contains('yes')
is_covered = pred['coverage'] >= 100
pred_m6a = pred[is_m6a & is_covered]

In [None]:
### gene list
# One unique gene name per line. Missing names are skipped (NaN cannot be concatenated
# with '\n'), and the names are sorted so the file is identical from run to run; iterating
# over a bare set() of strings gives an arbitrary order.
gene_names = sorted(set(pred_m6a['name2'].dropna()))

# The context manager closes the file even if a write fails.
with open('../data/GOgene_hepg2.csv', 'w') as file:
    file.writelines(gene + '\n' for gene in gene_names)

In [None]:
### bar plot
sns.set_theme(style = 'white') # theme
fig, ax = plt.subplots(figsize = (25, 6)) # figure size

### data
# Tab-separated GO term table; only its first 10 rows are plotted, in file order.
GO_table = pd.read_csv('../data/GOtable_hepg2.txt', sep = '\t')
top_terms = GO_table.iloc[:10]
pval = top_terms['PValue']
# Terms are expected to look like '<id>~<description>'; keep the description part.
# Only the plotted rows are parsed, and a term without '~' is shown unchanged rather
# than raising IndexError.
terms = [term.split('~', 1)[-1] for term in top_terms['Term']]
counts = top_terms['Count']

### cmap
# A single Normalize instance drives both the bar colours and the colorbar, so each
# bar's colour can be read off the colorbar directly.
norm = plt.Normalize(vmin = pval.min(), vmax = pval.max())
my_cmap = plt.get_cmap('viridis_r')  # plt.cm.get_cmap was removed in matplotlib 3.9
colors = my_cmap(norm(pval.to_numpy()))

x = np.arange(len(terms))  # the label locations

rects = ax.barh(x, counts, color = colors)
sm = ScalarMappable(cmap = my_cmap, norm = norm)
sm.set_array([])

# Name the Axes to take space from: for a ScalarMappable that is not attached to any
# Axes, recent matplotlib raises ValueError unless ax (or cax) is given.
cbar = fig.colorbar(sm, ax = ax)
cbar.ax.tick_params(labelsize = 20)
# NOTE(review): the label hard-codes a 10^-5 scale while the ticks are the raw p-values — confirm it matches the data.
cbar.set_label('p-value ($10^{-5}$)', rotation = 270, labelpad = 40, fontsize = 28)

ax.set_xlabel('Counts', fontsize = 36)
ax.set_yticks(x)
ax.set_yticklabels(terms)
ax.invert_yaxis()  # labels read top-to-bottom
ax.tick_params(labelsize = 25)

### Figure 5E

In [None]:
# Reload the predictions and keep every row whose 'm6a' value contains 'yes'.
pred = pd.read_csv('../data/hepg2_m6atm.csv')
m6a_mask = pred['m6a'].str.contains('yes')
pred_m6a = pred.loc[m6a_mask]

In [None]:
# Write one motif per line, with every 'T' replaced by 'U'.
with open('../data/motif_hepg2.csv', 'w') as f:
    f.writelines(seq.replace('T', 'U') + '\n' for seq in pred_m6a['motif'])

### Figure 5F

In [None]:
# Reload the predictions; rows whose 'm6a' value contains 'yes' feed the per-gene summary.
pred = pd.read_csv('../data/hepg2_m6atm.csv')
has_m6a = pred['m6a'].str.contains('yes')
pred_m6a = pred[has_m6a]

In [None]:
# Per-gene summary: mean 'ratio', number of non-null 'gn_site' rows, and mean 'coverage'.
gene_stats = pred_m6a.groupby('name2', as_index = False).agg(
    {'ratio': 'mean', 'gn_site': 'count', 'coverage': 'mean'}
)

# Keep genes with at least 15 sites and a mean coverage of at least 50, highest mean ratio first.
enough_sites = gene_stats['gn_site'] >= 15
enough_coverage = gene_stats['coverage'] >= 50
gene_table = gene_stats[enough_sites & enough_coverage].sort_values('ratio', ascending = False)

In [None]:
##### main
sns.set_theme(style = 'white') # theme
fig, ax = plt.subplots(figsize = (20, 5)) # figure size

# data
# First 10 rows of gene_table (the cell that builds it sorts by mean ratio, highest first).
top_genes = gene_table['name2'].iloc[:10].tolist()
# Indexing by gene and selecting with .loc[top_genes] arranges the rows in top_genes
# order, which is the order the genes appear along the x axis; drop = False keeps the
# 'name2' column available for plotting.
top_df = (
    pred_m6a[pred_m6a['name2'].isin(top_genes)]
    .set_index('name2', drop = False)
    .loc[top_genes]
)

# plot
# Both layers are drawn on `ax` explicitly instead of relying on the current Axes.
mean_marker = {'marker': 'P', 'markerfacecolor': 'yellow', 'markeredgecolor': 'black', 'markersize': 10}
sns.boxplot(x = 'name2', y = 'ratio', data = top_df, width = 0.2,
            showmeans = True, meanprops = mean_marker, ax = ax)
sns.stripplot(x = 'name2', y = 'ratio', data = top_df,
              hue = 'name2', size = 4, edgecolor = 'black', linewidth = 0.5, legend = False, ax = ax)

# Set transparency for all boxes
for patch in ax.patches:
    r, g, b, _ = patch.get_facecolor()
    patch.set_facecolor((r, g, b, .6))

ax.set_ylabel('m6A ratio', fontsize = 25)
ax.set(xlabel = None)
ax.tick_params(labelsize = 20)
ax.tick_params(axis = 'x', labelrotation = 45)