# Protein SQANTI Classification

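Generate summary statistics on the SQANTI classification of PacBio transcripts (counts, fractions and median expression per category) and write them to the `stats/` directory.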

In [1]:
import os

import pandas as pd
import os
import matplotlib
from huvec_analysis import huvec_config

# Apply the font settings defined in huvec_config to all matplotlib figures.
matplotlib.rc('font', **huvec_config.font)

# All statistics files are written into ./stats. exist_ok=True makes this
# safe to re-run and avoids the check-then-create race of testing
# os.path.exists() before calling os.makedirs().
os.makedirs('stats', exist_ok=True)

In [2]:

# Input locations: the SQANTI summary table and a gene list kept in the
# project's reference directory.
sqanti_info_path = '../00_pre_analysis/sqanti_info/sqanti_info.tsv'
human_ec_gene_path = f'{huvec_config.REFERENCE_DIRECTORY}/human_ec_genes_from_karen.txt'

# The gene list file has no header row; its first column becomes a plain list.
human_ec_genes = pd.read_table(human_ec_gene_path, header=None)[0].to_list()

# Load the SQANTI table and keep only the rows whose 'cpm' value exceeds 1.
sqanti_info = pd.read_table(sqanti_info_path).loc[lambda table: table['cpm'] > 1]

###################
# MAIN FIGURES
##################
# Record how many transcripts and how many distinct genes remain in the
# filtered SQANTI table, then echo the file to the notebook output.
# A single path variable is used for both the write and the read: previously
# the file was written with a '01_' prefix but read back without it, so a
# fresh run either failed or printed a stale file from an earlier run.
gene_count_stats_path = 'stats/01_number_of_protein_coding_genes_w_pacbio_reads.txt'
with open(gene_count_stats_path, 'w') as ofile:
    ofile.write(f'Number of pacbio transcripts aligned to protein-coding genes: {len(sqanti_info)}\n')
    ofile.write(f'Number of protein-coding genes with a pacbio transcript: {len(sqanti_info["gene"].unique())}')

# Read the file back inside a context manager so the handle is closed.
with open(gene_count_stats_path) as ifile:
    for line in ifile:
        print(line.strip('\n'))

Number of pacbio transcripts aligned to protein-coding genes: 53863
Number of protein-coding genes with a pacbio transcript: 10426


In [3]:

# Split the filtered SQANTI table by structural category: full-splice match
# (FSM), novel in catalog (NIC) and novel not in catalog (NNC).
category_column = sqanti_info['cat']
fsm = sqanti_info[category_column == 'FSM']
nnc = sqanti_info[category_column == 'NNC']
nic = sqanti_info[category_column == 'NIC']

# Median of the 'log2cpm' column within each category.
fsm_median = fsm['log2cpm'].median()
nic_median = nic['log2cpm'].median()
nnc_median = nnc['log2cpm'].median()

# Share of all transcripts that falls in each category. These are fractions
# (count / total), not values multiplied by 100.
# When the filtered table is empty the denominator becomes NaN, so the
# fractions are NaN instead of raising ZeroDivisionError.
total = len(sqanti_info)
denominator = total if total else float('nan')
fsm_percentage = len(fsm) / denominator
nic_percentage = len(nic) / denominator
nnc_percentage = len(nnc) / denominator

# Novel isoforms are the NNC and NIC transcripts combined.
novel_isoforms = len(nnc) + len(nic)

# Share of novel isoforms among all transcripts.
novel_percentage = novel_isoforms / denominator

# Write the per-category counts, fractions and medians to a text file.
# NOTE(review): the values labelled "Percentage" are fractions of the total
# (count / total), and the values labelled "Median CPM" are medians of the
# 'log2cpm' column. The labels are left unchanged so the file stays
# comparable with earlier runs -- confirm whether they should be reworded.
sqanti_stats_path = 'stats/number_and_percent_sqanti_trans_detected.txt'
with open(sqanti_stats_path, 'w') as ofile:
    ofile.write(f'Number of total isoforms: {total}\n')
    ofile.write(f'Number of those isoforms that are known (full-splice matches (FSMs): {len(fsm)}\n')
    ofile.write(f'Number of those isoforms that are novel (novel in catalog (NICs): {len(nic)}\n')
    ofile.write(f'Number of those isoforms that are novel (novel not in catalog (NNCs): {len(nnc)}\n')
    ofile.write(f'Percentage of those isoforms that are known (full-splice matches (FSMs): {fsm_percentage}\n')
    ofile.write(f'Percentage of those isoforms that are novel (novel in catalog (NICs): {nic_percentage}\n')
    ofile.write(f'Percentage of those isoforms that are novel (novel not in catalog: {nnc_percentage}\n')
    ofile.write(f'Percentage of transcripts classified as  novel:{novel_percentage}\n')
    ofile.write(f'Percentage of transcripts classified as known: {fsm_percentage}\n')
    ofile.write(f'Median CPM for the FSMs: {fsm_median}\n')
    ofile.write(f'Median CPM for the NICs:{nic_median}\n')
    ofile.write(f'Median CPM for the NNCs: {nnc_median}\n')

# Echo the file that was just written to the notebook output. The context
# manager closes the read handle, which the bare open() call did not.
with open(sqanti_stats_path) as ifile:
    for line in ifile:
        print(line.strip('\n'))

Number of total isoforms: 53863
Number of those isoforms that are known (full-splice matches (FSMs): 31668
Number of those isoforms that are novel (novel in catalog (NICs): 13746
Number of those isoforms that are novel (novel not in catalog (NNCs): 8449
Percentage of those isoforms that are known (full-splice matches (FSMs): 0.5879360600040844
Percentage of those isoforms that are novel (novel in catalog (NICs): 0.255203015056718
Percentage of those isoforms that are novel (novel not in catalog: 0.1568609249391976
Percentage of transcripts classified as  novel:0.41206393999591556
Percentage of transcripts classified as known: 0.5879360600040844
Median CPM for the FSMs: 2.3603986261578846
Median CPM for the NICs:1.5258689386576283
Median CPM for the NNCs: 1.3240506545760629
