In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
import seaborn as sns
# Notebook-wide plotting theme: seaborn "ticks" style with Arial at 1.15x font scale.
sns.set(style='ticks', font="arial", font_scale=1.15)
# Default figure size, and no top/right spines on any axes.
plt.rcParams.update({
    'figure.figsize': (6,6),
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# One fixed colour per sample, looked up by sample name when plotting.
sample_palette = dict(AK1='#FF0000', iPSC='#154360', NPC='#229954')

### GRCh38

In [None]:
# Short display labels for the values of the `structural_category` column.
name_replace = {
    'novel_not_in_catalog': 'NNC',
    'incomplete-splice_match': 'ISM',
    'full-splice_match': 'FSM',
    'novel_in_catalog': 'NIC',
    'intergenic': 'Intergenic',
    'antisense': 'Antisense',
    'fusion': 'Fusion',
    'genic': 'Genic',
    'moreJunctions': 'Others',
}
# Order in which the short labels are laid out when passed as `order=` to a plot.
name_order = ['NNC', 'NIC', 'FSM', 'ISM', 'Genic', 'Intergenic', 'Antisense', 'Fusion', 'Others']

# One classification table per sample (GRCh38 section of the notebook).
ak1 = pd.read_table('./AK1_collapsed_classification.filtered_lite_classification.txt')
ipsc = pd.read_table('./iPSC_collapsed_classification.filtered_lite_classification.txt')
npc = pd.read_table('./NPC_collapsed_classification.filtered_lite_classification.txt')

# Relabel the category column of every table in place.
for classification in (ak1, ipsc, npc):
    classification['structural_category'] = classification['structural_category'].replace(name_replace)


In [None]:
# Per sample: rows per structural category, then that count as a percentage
# of all rows in the sample's table.
ak1_category_counts = ak1['structural_category'].value_counts()
ipsc_category_counts = ipsc['structural_category'].value_counts()
npc_category_counts = npc['structural_category'].value_counts()

ak1_category_percentages = ak1_category_counts.div(len(ak1)).mul(100)
ipsc_category_percentages = ipsc_category_counts.div(len(ipsc)).mul(100)
npc_category_percentages = npc_category_counts.div(len(npc)).mul(100)

In [None]:
# One horizontal bar chart per sample: percentage of isoforms in each
# structural category (GRCh38 tables), written to
# '<sample>_isoform_category_GRCh38.png'.
grch38_percentages = (
    ('AK1', ak1_category_percentages),
    ('iPSC', ipsc_category_percentages),
    ('NPC', npc_category_percentages),
)
for sample, percentages in grch38_percentages:
    fig, ax = plt.subplots(figsize=(7, 5), constrained_layout=True)
    sns.barplot(
        x=percentages.values,
        y=percentages.index,
        color=sample_palette[sample],
        order=name_order,  # categories absent from name_order are not drawn
        ax=ax,
    )
    ax.set_xlabel('Percentage of Isoforms')
    ax.set_ylabel('Structural Category of Isoforms')
    ax.set_title(f'{sample} MAS-Iso-Seq (GRCh38)')
    # Every chart uses the same fixed x-range; a bar above 35 % would be cut off.
    ax.set_xlim(0, 35)
    # Save through the figure handle instead of pyplot's implicit current figure.
    fig.savefig(f'{sample}_isoform_category_GRCh38.png')
    # Close rather than clear: plt.clf() would leave the figure registered with pyplot.
    plt.close(fig)

### T2T-CHM13v2.0

In [None]:
# Short display labels for the values of the `structural_category` column.
name_replace = {
    'novel_not_in_catalog': 'NNC',
    'incomplete-splice_match': 'ISM',
    'full-splice_match': 'FSM',
    'novel_in_catalog': 'NIC',
    'intergenic': 'Intergenic',
    'antisense': 'Antisense',
    'fusion': 'Fusion',
    'genic': 'Genic',
    'moreJunctions': 'Others',
}
# Order in which the short labels are laid out when passed as `order=` to a plot.
name_order = ['NNC', 'NIC', 'FSM', 'ISM', 'Genic', 'Intergenic', 'Antisense', 'Fusion', 'Others']

# One classification table per sample (T2T-CHM13v2.0 section of the notebook).
# These rebind ak1 / ipsc / npc, replacing any tables loaded earlier.
ak1 = pd.read_table('./AK1_chm13_collapsed_classification.filtered_lite_classification.txt')
ipsc = pd.read_table('./iPSC_chm13_collapsed_classification.filtered_lite_classification.txt')
npc = pd.read_table('./NPC_chm13_collapsed_classification.filtered_lite_classification.txt')

# Relabel the category column of every table in place.
for classification in (ak1, ipsc, npc):
    classification['structural_category'] = classification['structural_category'].replace(name_replace)

In [None]:
# Per sample: rows per structural category, then that count as a percentage
# of all rows in the sample's table.
ak1_category_counts = ak1['structural_category'].value_counts()
ipsc_category_counts = ipsc['structural_category'].value_counts()
npc_category_counts = npc['structural_category'].value_counts()

ak1_category_percentages = ak1_category_counts.div(len(ak1)).mul(100)
ipsc_category_percentages = ipsc_category_counts.div(len(ipsc)).mul(100)
npc_category_percentages = npc_category_counts.div(len(npc)).mul(100)

In [None]:
# One horizontal bar chart per sample: percentage of isoforms in each
# structural category (T2T-CHM13v2.0 tables), written to
# '<sample>_isoform_category_CHM13.png'.
chm13_percentages = (
    ('AK1', ak1_category_percentages),
    ('iPSC', ipsc_category_percentages),
    ('NPC', npc_category_percentages),
)
for sample, percentages in chm13_percentages:
    fig, ax = plt.subplots(figsize=(7, 5), constrained_layout=True)
    sns.barplot(
        x=percentages.values,
        y=percentages.index,
        color=sample_palette[sample],
        order=name_order,  # categories absent from name_order are not drawn
        ax=ax,
    )
    ax.set_xlabel('Percentage of Isoforms')
    ax.set_ylabel('Structural Category of Isoforms')
    ax.set_title(f'{sample} MAS-Iso-Seq (T2T-CHM13v2.0)')
    # Every chart uses the same fixed x-range; a bar above 35 % would be cut off.
    ax.set_xlim(0, 35)
    # Save through the figure handle instead of pyplot's implicit current figure.
    fig.savefig(f'{sample}_isoform_category_CHM13.png')
    # Close rather than clear: plt.clf() would leave the figure registered with pyplot.
    plt.close(fig)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font="arial", font_scale=1.15, style='ticks')
plt.rcParams['figure.figsize'] = (6,6)
plt.rc("axes.spines", top=False, right=False)

# Single-cell version of the structural-category analysis: for each reference
# assembly, load the three per-sample classification tables, compute the
# percentage of isoforms per structural category, and save one bar chart per
# sample.
# NOTE: `sample_palette` is not defined in this cell; it is read from the
# notebook's first cell.

# Short display labels for the values of the `structural_category` column.
name_replace = {'novel_not_in_catalog': 'NNC', 'incomplete-splice_match': 'ISM', 'full-splice_match': 'FSM', 'novel_in_catalog': 'NIC', 'intergenic': 'Intergenic', 'antisense': 'Antisense', 'fusion': 'Fusion', 'genic': 'Genic', 'moreJunctions': 'Others'}
# Order of the categories along the bar-chart axis.
name_order = ['NNC', 'NIC', 'FSM', 'ISM', 'Genic', 'Intergenic', 'Antisense', 'Fusion', 'Others']

# Shared tail of every classification file name.
classification_suffix = '_collapsed_classification.filtered_lite_classification.txt'


def _load_classification(path):
    """Read one classification table and relabel its `structural_category` column.

    Parameters
    ----------
    path : str
        Path of the tab-separated classification file.

    Returns
    -------
    pandas.DataFrame
        The table with category names replaced according to `name_replace`.
    """
    table = pd.read_table(path)
    table['structural_category'] = table['structural_category'].replace(name_replace)
    return table


def _save_category_barplot(percentages, sample, assembly_label, output_png):
    """Draw the per-category percentage bar chart for one sample and save it.

    Parameters
    ----------
    percentages : pandas.Series
        Percentage of isoforms, indexed by structural-category label.
    sample : str
        Sample name; used in the title and as the key into `sample_palette`.
    assembly_label : str
        Reference-assembly name shown in parentheses in the title.
    output_png : str
        Path of the image file to write.
    """
    fig, ax = plt.subplots(figsize=(7, 5), constrained_layout=True)
    sns.barplot(
        x=percentages.values,
        y=percentages.index,
        color=sample_palette[sample],
        order=name_order,  # categories absent from name_order are not drawn
        ax=ax,
    )
    ax.set_xlabel('Percentage of Isoforms')
    ax.set_ylabel('Structural Category of Isoforms')
    ax.set_title(f'{sample} MAS-Iso-Seq ({assembly_label})')
    # Every chart uses the same fixed x-range; a bar above 35 % would be cut off.
    ax.set_xlim(0, 35)
    fig.savefig(output_png)
    # Close rather than clear: plt.clf() would leave the figure registered with pyplot.
    plt.close(fig)


# (title label, input-file infix, output-file tag) per reference assembly.
# GRCh38 is processed first, so after the loop the module-level tables and
# percentages hold the T2T-CHM13v2.0 results.
for assembly_label, file_infix, output_tag in (
    ('GRCh38', '', 'GRCh38'),
    ('T2T-CHM13v2.0', '_chm13', 'CHM13'),
):
    ak1 = _load_classification(f'./AK1{file_infix}{classification_suffix}')
    ipsc = _load_classification(f'./iPSC{file_infix}{classification_suffix}')
    npc = _load_classification(f'./NPC{file_infix}{classification_suffix}')

    # Rows per category, then that count as a percentage of all rows in the table.
    ak1_category_counts = ak1['structural_category'].value_counts()
    ak1_category_percentages = ak1_category_counts / len(ak1) * 100

    ipsc_category_counts = ipsc['structural_category'].value_counts()
    ipsc_category_percentages = ipsc_category_counts / len(ipsc) * 100

    npc_category_counts = npc['structural_category'].value_counts()
    npc_category_percentages = npc_category_counts / len(npc) * 100

    for sample, percentages in (
        ('AK1', ak1_category_percentages),
        ('iPSC', ipsc_category_percentages),
        ('NPC', npc_category_percentages),
    ):
        _save_category_barplot(
            percentages,
            sample,
            assembly_label,
            f'{sample}_isoform_category_{output_tag}.png',
        )

### Coding Probability

In [None]:
# Coding probability versus isoform length for the merged ORF predictions.
# An earlier AK1-only variant of this cell (input "AK1.ORF_prob.best.tsv",
# same scatter/histogram overview and jointplot calls) used to be parked here
# inside a string literal; to reproduce it, point read_table at that file.
pb_prob = pd.read_table("Merged.ORF_prob.best.filtered.tsv", index_col=0)

# Overview figure: scatter of the two columns with a histogram of each
# overlaid. It is never saved in this cell and is left open.
# NOTE(review): the mRNA histogram shares the scatter's y-axis and the
# Coding_prob histogram shares its x-axis, so the scales are mixed - confirm
# this figure is still wanted.
fig, ax = plt.subplots(figsize=(10,8), constrained_layout=True)
sns.scatterplot(data=pb_prob, x='mRNA', y='Coding_prob', alpha=0.1, ax=ax)
sns.histplot(pb_prob['mRNA'], ax=ax, fill=True, bins=30)
ax2 = ax.twinx()
sns.histplot(pb_prob['Coding_prob'], ax=ax2, fill=True, bins=30)

# Joint figure: faint scatter with unfilled marginal histograms.
plot = sns.jointplot(data=pb_prob, x='mRNA', y='Coding_prob', alpha=0.01, marginal_kws=dict(bins=30, fill=False), s=2)
plot.set_axis_labels('Isoform Length (bp)', 'Coding Probability')

# Filled 2-D density drawn on the joint axes. The target is passed explicitly
# (ax=plot.ax_joint) instead of depending on whichever axes pyplot considers
# current after jointplot returns.
p = sns.kdeplot(data=pb_prob, x='mRNA', y='Coding_prob', cmap="afmhot", fill=True, thresh=0, ax=plot.ax_joint)
p.set_xlabel("Isoform Length (bp)")
p.set_ylabel("Coding Probability")
# Thousands separators on the length axis, e.g. 12,000.
p.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
p.set_xlim((pb_prob['mRNA'].min(), pb_prob['mRNA'].max()))
p.set_ylim((0,1))
# NOTE(review): "Lenght" in the file name is a typo; kept as-is because other
# steps may look for this exact path - rename once that is confirmed.
p.figure.savefig('Isoform_Lenght_Coding_Probability.pdf')
# Close the saved joint figure rather than clearing it; plt.clf() would leave
# it registered with pyplot.
plt.close(p.figure)