In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
import matplotlib.ticker as ticker
import seaborn as sns

# Global figure styling. set_theme() goes first because it resets rcParams;
# the notebook-specific overrides are applied afterwards.
sns.set_theme(style='ticks', font="arial", font_scale=1.15)
plt.rcParams.update({
    'figure.figsize': (6, 6),
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# One fixed colour per sample, looked up by sample name when plotting.
sample_palette = dict(AK1='#FF0000', iPSC='#154360', NPC='#229954')

### GRCh38

In [None]:
# Long structural_category labels -> short display labels, and the order in
# which the short labels are drawn on the category axis.
name_replace = {
    'novel_not_in_catalog': 'NNC',
    'incomplete-splice_match': 'ISM',
    'full-splice_match': 'FSM',
    'novel_in_catalog': 'NIC',
    'intergenic': 'Intergenic',
    'antisense': 'Antisense',
    'fusion': 'Fusion',
    'genic': 'Genic',
    'moreJunctions': 'Others',
}
name_order = ['NNC', 'NIC', 'FSM', 'ISM', 'Genic', 'Intergenic', 'Antisense', 'Fusion', 'Others']

# Per-sample classification tables (GRCh38), relabelled with the short names.
ak1 = pd.read_table('./AK1_collapsed_classification.filtered_lite_classification.txt')
ak1 = ak1.assign(structural_category=ak1['structural_category'].replace(name_replace))

ipsc = pd.read_table('./iPSC_collapsed_classification.filtered_lite_classification.txt')
ipsc = ipsc.assign(structural_category=ipsc['structural_category'].replace(name_replace))

npc = pd.read_table('./NPC_collapsed_classification.filtered_lite_classification.txt')
npc = npc.assign(structural_category=npc['structural_category'].replace(name_replace))



In [None]:
# Number of isoforms per structural category, and the same expressed as a
# percentage of all rows in the sample's table (GRCh38).
ak1_total = ak1.shape[0]
ak1_category_counts = ak1['structural_category'].value_counts()
ak1_category_percentages = ak1_category_counts.div(ak1_total).mul(100)

ipsc_total = ipsc.shape[0]
ipsc_category_counts = ipsc['structural_category'].value_counts()
ipsc_category_percentages = ipsc_category_counts.div(ipsc_total).mul(100)

npc_total = npc.shape[0]
npc_category_counts = npc['structural_category'].value_counts()
npc_category_percentages = npc_category_counts.div(npc_total).mul(100)

In [None]:
# One horizontal bar chart per sample: percentage of isoforms in each
# structural category (GRCh38 classification), written to
# '<sample>_isoform_category_GRCh38.png'.
for sample, percentages in (
    ('AK1', ak1_category_percentages),
    ('iPSC', ipsc_category_percentages),
    ('NPC', npc_category_percentages),
):
    fig, ax = plt.subplots(figsize=(7, 5), constrained_layout=True)
    sns.barplot(
        x=percentages.values,
        y=percentages.index,
        color=sample_palette[sample],
        order=name_order,
        ax=ax,
    )
    ax.set_xlabel('Percentage of Isoforms')
    ax.set_ylabel('Structural Category of Isoforms')
    ax.set_title(f'{sample} MAS-Iso-Seq (GRCh38)')
    # Same fixed x-range in every panel.
    # NOTE(review): a category above 35 % would be clipped — confirm the range.
    ax.set_xlim(0, 35)
    fig.savefig(f'{sample}_isoform_category_GRCh38.png')
    # Close the figure once saved: plt.clf() only cleared it and left it open,
    # so open figures accumulated with every plot.
    plt.close(fig)

### T2T-CHM13v2.0

In [None]:
# Long structural_category labels -> short display labels, and the order in
# which the short labels are drawn on the category axis.
name_replace = {
    'novel_not_in_catalog': 'NNC',
    'incomplete-splice_match': 'ISM',
    'full-splice_match': 'FSM',
    'novel_in_catalog': 'NIC',
    'intergenic': 'Intergenic',
    'antisense': 'Antisense',
    'fusion': 'Fusion',
    'genic': 'Genic',
    'moreJunctions': 'Others',
}
name_order = ['NNC', 'NIC', 'FSM', 'ISM', 'Genic', 'Intergenic', 'Antisense', 'Fusion', 'Others']

# Per-sample classification tables (T2T-CHM13v2.0), relabelled with the short
# names. These rebind ak1 / ipsc / npc.
ak1 = pd.read_table('./AK1_chm13_collapsed_classification.filtered_lite_classification.txt')
ak1 = ak1.assign(structural_category=ak1['structural_category'].replace(name_replace))

ipsc = pd.read_table('./iPSC_chm13_collapsed_classification.filtered_lite_classification.txt')
ipsc = ipsc.assign(structural_category=ipsc['structural_category'].replace(name_replace))

npc = pd.read_table('./NPC_chm13_collapsed_classification.filtered_lite_classification.txt')
npc = npc.assign(structural_category=npc['structural_category'].replace(name_replace))

In [None]:
# Number of isoforms per structural category, and the same expressed as a
# percentage of all rows in the sample's table (T2T-CHM13v2.0).
ak1_total = ak1.shape[0]
ak1_category_counts = ak1['structural_category'].value_counts()
ak1_category_percentages = ak1_category_counts.div(ak1_total).mul(100)

ipsc_total = ipsc.shape[0]
ipsc_category_counts = ipsc['structural_category'].value_counts()
ipsc_category_percentages = ipsc_category_counts.div(ipsc_total).mul(100)

npc_total = npc.shape[0]
npc_category_counts = npc['structural_category'].value_counts()
npc_category_percentages = npc_category_counts.div(npc_total).mul(100)

In [None]:
# One horizontal bar chart per sample: percentage of isoforms in each
# structural category (T2T-CHM13v2.0 classification), written to
# '<sample>_isoform_category_CHM13.png'.
for sample, percentages in (
    ('AK1', ak1_category_percentages),
    ('iPSC', ipsc_category_percentages),
    ('NPC', npc_category_percentages),
):
    fig, ax = plt.subplots(figsize=(7, 5), constrained_layout=True)
    sns.barplot(
        x=percentages.values,
        y=percentages.index,
        color=sample_palette[sample],
        order=name_order,
        ax=ax,
    )
    ax.set_xlabel('Percentage of Isoforms')
    ax.set_ylabel('Structural Category of Isoforms')
    ax.set_title(f'{sample} MAS-Iso-Seq (T2T-CHM13v2.0)')
    # Same fixed x-range in every panel.
    # NOTE(review): a category above 35 % would be clipped — confirm the range.
    ax.set_xlim(0, 35)
    fig.savefig(f'{sample}_isoform_category_CHM13.png')
    # Close the figure once saved: plt.clf() only cleared it and left it open,
    # so open figures accumulated with every plot.
    plt.close(fig)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Consolidated pipeline: for each reference assembly, load the three
# per-sample classification tables, shorten the structural-category labels,
# compute per-category percentages and save one bar chart per sample.
# The two assembly passes were copy-pasted; they now share a single loop body.
# Variable names and input/output file names are unchanged, so after the loop
# ak1 / ipsc / npc and the *_counts / *_percentages series hold the
# T2T-CHM13v2.0 results, as before.
sns.set_theme(font="arial", font_scale=1.15, style='ticks')
plt.rcParams['figure.figsize'] = (6,6)
plt.rc("axes.spines", top=False, right=False)

# Long structural_category labels -> short display labels, and the order in
# which the short labels are drawn on the category axis.
name_replace = {'novel_not_in_catalog': 'NNC', 'incomplete-splice_match': 'ISM', 'full-splice_match': 'FSM', 'novel_in_catalog': 'NIC', 'intergenic': 'Intergenic', 'antisense': 'Antisense', 'fusion': 'Fusion', 'genic': 'Genic', 'moreJunctions': 'Others'}
name_order = ['NNC', 'NIC', 'FSM', 'ISM', 'Genic', 'Intergenic', 'Antisense', 'Fusion', 'Others']

# (label shown in the title, infix in the input file name, tag in the PNG name)
reference_builds = (
    ('GRCh38', '', 'GRCh38'),
    ('T2T-CHM13v2.0', '_chm13', 'CHM13'),
)

for build_label, input_infix, output_tag in reference_builds:
    table_suffix = f'{input_infix}_collapsed_classification.filtered_lite_classification.txt'
    ak1 = pd.read_table(f'./AK1{table_suffix}')
    ipsc = pd.read_table(f'./iPSC{table_suffix}')
    npc = pd.read_table(f'./NPC{table_suffix}')

    ak1['structural_category'] = ak1['structural_category'].replace(name_replace)
    ipsc['structural_category'] = ipsc['structural_category'].replace(name_replace)
    npc['structural_category'] = npc['structural_category'].replace(name_replace)

    # Percentages are relative to all rows of the sample's table.
    ak1_category_counts = ak1['structural_category'].value_counts()
    ak1_category_percentages = ak1_category_counts / len(ak1) * 100

    ipsc_category_counts = ipsc['structural_category'].value_counts()
    ipsc_category_percentages = ipsc_category_counts / len(ipsc) * 100

    npc_category_counts = npc['structural_category'].value_counts()
    npc_category_percentages = npc_category_counts / len(npc) * 100

    for sample, percentages in (
        ('AK1', ak1_category_percentages),
        ('iPSC', ipsc_category_percentages),
        ('NPC', npc_category_percentages),
    ):
        fig, ax = plt.subplots(figsize=(7, 5), constrained_layout=True)
        sns.barplot(x=percentages.values, y=percentages.index, color=sample_palette[sample], order=name_order, ax=ax)
        ax.set_xlabel('Percentage of Isoforms')
        ax.set_ylabel('Structural Category of Isoforms')
        ax.set_title(f'{sample} MAS-Iso-Seq ({build_label})')
        # Same fixed x-range in every panel.
        # NOTE(review): a category above 35 % would be clipped — confirm the range.
        ax.set_xlim(0, 35)
        fig.savefig(f'{sample}_isoform_category_{output_tag}.png')
        # Close the figure once saved: plt.clf() only cleared it and left it
        # open, so open figures accumulated with every plot.
        plt.close(fig)

### Coding Probability

In [None]:
# The triple-quoted string below is a disabled, AK1-only version of the plots
# that follow. It is a bare string literal, so none of it executes.
'''
ak1_prob = pd.read_table("AK1.ORF_prob.best.tsv", index_col=0)

fig, ax = plt.subplots(figsize=(10,8), constrained_layout=True)
sns.scatterplot(data=ak1_prob, x='mRNA', y='Coding_prob', alpha=0.1, ax=ax)
sns.histplot(ak1_prob['mRNA'], ax=ax, fill=True, bins=30)
ax2 = ax.twinx()
sns.histplot(ak1_prob['Coding_prob'], ax=ax2, fill=True, bins=30)
plot = sns.jointplot(data=ak1_prob, x='mRNA', y='Coding_prob', alpha=0.01, marginal_kws=dict(bins=30, fill=False), s=2)
plot.set_axis_labels('Isoform Length (bp)', 'Coding Probability')
'''
# Load the merged ORF probability table, using its first column as the index.
pb_prob = pd.read_table("Merged.ORF_prob.best.filtered.tsv", index_col=0)
# Label each row 'Coding' when Coding_prob >= 0.364, otherwise 'Noncoding'.
# The cutoff comes from the CPAT documentation linked at the end of the line.
pb_prob['CPAT_Prediction'] = np.where(pb_prob['Coding_prob'] >= 0.364, 'Coding', 'Noncoding') #https://cpat.readthedocs.io/en/latest/#how-to-choose-cutoff

# Exploratory figure: scatter of Coding_prob against mRNA plus a histogram of
# mRNA on the same axes, and a histogram of Coding_prob on a twin y-axis.
# NOTE(review): this figure is not saved or closed anywhere in this cell.
fig, ax = plt.subplots(figsize=(10,8), constrained_layout=True)
sns.scatterplot(data=pb_prob, x='mRNA', y='Coding_prob', alpha=0.1, ax=ax)
sns.histplot(pb_prob['mRNA'], ax=ax, fill=True, bins=30)
ax2 = ax.twinx()
sns.histplot(pb_prob['Coding_prob'], ax=ax2, fill=True, bins=30)
# jointplot is a figure-level seaborn function: it builds its own figure
# rather than drawing on `ax`.
plot = sns.jointplot(data=pb_prob, x='mRNA', y='Coding_prob', alpha=0.01, marginal_kws=dict(bins=30, fill=False), s=2)
plot.set_axis_labels('Isoform Length (bp)', 'Coding Probability')

# 2-D density of Coding_prob against mRNA.
# NOTE(review): no ax= is passed, so kdeplot draws on pyplot's current axes,
# presumably an axes of the jointplot figure created just above. Confirm that
# overlaying the density there (rather than on a fresh figure) is intended.
p = sns.kdeplot(data=pb_prob, x='mRNA', y='Coding_prob', cmap="afmhot", fill=True, thresh=0)
p.set_xlabel("Isoform Length (bp)")
p.set_ylabel("Coding Probability")
# Thousands separators on the length axis, no decimals.
p.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
p.set_xlim((pb_prob['mRNA'].min(), pb_prob['mRNA'].max()))
p.set_ylim((0,1))
# plt.savefig / plt.clf act on pyplot's current figure only.
# NOTE(review): the output file name spells 'Lenght'; left unchanged because
# other tooling may reference this exact name.
plt.savefig('Isoform_Lenght_Coding_Probability.pdf')
plt.clf()

In [None]:
# Re-key pb_prob by 'pbid', the part of seq_ID before the first '|'.
# The former index is kept as an ordinary column.
pb_prob = pb_prob.reset_index()
pb_prob = pb_prob.assign(pbid=pb_prob['seq_ID'].str.split('|').str[0]).set_index('pbid')

In [None]:
# Classification table for the merged isoform set, indexed by its first column.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
pb_class = pd.read_table("/mnt/data/Projects/phenomata/01.Projects/13.AK1_PacBio/02.RNA/New_2024/NEW/Merged_mapped_modified_classification.filtered_lite_classification.txt", index_col=0)

# Recorded output of pb_class['structural_category'].value_counts() (Out[10]):
#   incomplete-splice_match    525501
#   full-splice_match          259452
#   novel_not_in_catalog       245964
#   novel_in_catalog           172559
#   intergenic                  14586
#   antisense                   10554
#   fusion                       7715
#   genic                        3839
#   moreJunctions                 187
#   Name: structural_category, dtype: int64

# Keys taken from dict.fromkeys(set(pb_class['structural_category'].value_counts().index)).
# Full label -> abbreviation, one abbreviation per category.
st_category = dict([
    ('antisense', 'AS'),
    ('full-splice_match', 'FSM'),
    ('intergenic', 'Intergenic'),
    ('incomplete-splice_match', 'ISM'),
    ('fusion', 'Fusion'),
    ('genic', 'Genic'),
    ('novel_in_catalog', 'NIC'),
    ('moreJunctions', 'mJ'),
    ('novel_not_in_catalog', 'NNC'),
])
st_category_order = ['FSM', 'ISM', 'NIC', 'NNC', 'Intergenic', 'AS', 'Fusion', 'Genic', 'mJ']

# Coarser grouping: FSM / ISM / NIC / NNC keep their own label, every other
# category is pooled into 'Others'.
main_classes = st_category_order[:4]
st_category_brief = {
    full_label: (abbrev if abbrev in main_classes else 'Others')
    for full_label, abbrev in st_category.items()
}
st_category_brief_order = main_classes + ['Others']

# Add both abbreviation columns; labels missing from the maps become NaN.
full_labels = pb_class['structural_category']
pb_class['sc_abbrev'] = full_labels.map(st_category)
pb_class['sc_abbrev_brief'] = full_labels.map(st_category_brief)

In [None]:
# Attach the CPAT call to every classified isoform by looking up each
# pb_class index value in pb_prob['CPAT_Prediction'].
pb_class['CPAT_Prediction'] = pb_class.index.map(pb_prob['CPAT_Prediction'])

# Rows left as NaN are pbids for which no ORF was found when CPAT was run;
# they are counted as 'Noncoding'.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the column selection. That chained in-place form acts on an intermediate
# object: pandas 2.x emits a FutureWarning for it, and with Copy-on-Write it
# leaves pb_class unchanged.
pb_class['CPAT_Prediction'] = pb_class['CPAT_Prediction'].fillna('Noncoding')

In [None]:
def plot_coding_potential_counts(classification, category_col, category_order):
    """Draw grouped bars of isoform counts per category, split by CPAT call.

    Parameters
    ----------
    classification : pandas.DataFrame
        Table containing ``category_col`` and a ``CPAT_Prediction`` column.
    category_col : str
        Column holding the category label of each isoform.
    category_order : list of str
        Category labels in the order they should appear on the x-axis.

    Returns
    -------
    tuple
        ``(counts, grid)``: the count table that was plotted (one row per
        category / coding-potential pair) and the seaborn grid returned by
        ``sns.catplot``.
    """
    counts = (
        classification
        .groupby([category_col, 'CPAT_Prediction'])
        .size()
        .reset_index(name='count')
    )
    # An ordered categorical makes sort_values follow category_order.
    counts[category_col] = pd.Categorical(counts[category_col], categories=category_order, ordered=True)
    counts = counts.rename(columns={"CPAT_Prediction": "Coding Potential"})

    grid = sns.catplot(data=counts.sort_values(by=category_col),
                       x=category_col,
                       y='count',
                       hue='Coding Potential',
                       kind='bar',
                       palette={'Coding':'#00FFFF', 'Noncoding':'#FFD700'},
                       height=7,
                       aspect=1.5
                       )
    grid.set_axis_labels("Isoform Category", "Transcript ($\\times10^4$)")
    # Ticks every 50,000 transcripts from 0 up to (not including) the largest
    # count. Tick text comes only from the formatter below, which prints the
    # value in units of 10^4; the explicit tick labels previously set here were
    # immediately replaced by that formatter, so they are no longer set.
    grid.set(yticks=list(range(0, counts['count'].max(), 50000)))
    grid.ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:.0f}'.format(x/10000)))
    return counts, grid


# Structural Category
count_table, g = plot_coding_potential_counts(pb_class, 'sc_abbrev', st_category_order)

# Brief Structural Category
count_table_br, g = plot_coding_potential_counts(pb_class, 'sc_abbrev_brief', st_category_brief_order)