# Enhanced protein isoform characterization through long-read proteogenomics 
## Supplementary Table 1
---
Ben Jordan

In [1]:
import pandas as pd 
import config
# Path of the filtered protein classification table, built from the results
# directory and experiment name supplied by the config module.
filtered_protein_file = (
    f'{config.PIPELINE_RESULTS_DIRECTORY}/protein_filter/'
    f'{config.EXPERIMENT_NAME}.classification_filtered.tsv'
)

# Load the tab-separated table into a DataFrame.
protein_db = pd.read_csv(filtered_protein_file, sep='\t')

In [2]:
def get_nterm_status(psubclass):
    """Return the N-terminus status encoded in a psubclass label.

    The result is 'Novel' when the substring 'novel_nterm' occurs anywhere
    in *psubclass*; every other label yields 'Known'.
    """
    return 'Novel' if 'novel_nterm' in psubclass else 'Known'
def get_splice_status(psubclass):
    """Return the splicing status encoded in a psubclass label.

    Substring checks run first, in this order: 'known_splice' -> 'Known',
    'combo_splice' -> 'Combo', 'novel_splice' -> 'Novel'. If none of them
    occurs, two exact labels are recognised: 'combo_nterm_cterm' -> 'Known'
    and 'mono-exon' -> '-'. Any other label yields None.
    """
    # Substring markers, checked in priority order.
    substring_rules = (
        ('known_splice', 'Known'),
        ('combo_splice', 'Combo'),
        ('novel_splice', 'Novel'),
    )
    for marker, status in substring_rules:
        if marker in psubclass:
            return status

    # Labels that must match the whole psubclass string.
    exact_rules = (
        ('combo_nterm_cterm', 'Known'),
        ('mono-exon', '-'),
    )
    for label, status in exact_rules:
        if label == psubclass:
            return status

    # Unrecognised label: no splicing status is assigned.
    return None

def get_cterm_status(psubclass):
    """Return the C-terminus status encoded in a psubclass label.

    The result is 'Novel' when the substring 'novel_cterm' occurs anywhere
    in *psubclass*; every other label yields 'Known'.
    """
    return 'Novel' if 'novel_cterm' in psubclass else 'Known'

# Derive the three status columns from each row's psubclass label.
psubclass_labels = protein_db['psubclass']
for column_name, classifier in (
    ('N-terminus', get_nterm_status),
    ('Splicing', get_splice_status),
    ('C-terminus', get_cterm_status),
):
    protein_db[column_name] = psubclass_labels.apply(classifier)



In [3]:
# Count the protein isoforms for every observed combination of protein class
# and the three status columns.
group_columns = ['pclass', 'N-terminus', 'Splicing', 'C-terminus']
isoform_counts = protein_db.groupby(group_columns).size()
supp_table_1 = isoform_counts.reset_index(name='Number of protein isoforms')

# Every row starts with a placeholder note; the row labelled 3 gets a
# specific one.
# NOTE(review): the label 3 and the row order below are hard-coded; they
# presumably match the grouping produced by the original dataset. Confirm
# before reusing with other data.
supp_table_1['Note'] = '-'
supp_table_1.at[3, 'Note'] = 'Novel combo of N/C-term'

# Swap rows 0/1 and 2/3, then keep rows 4 through 13 in their existing order.
row_order = [1, 0, 3, 2] + list(range(4, 14))
supp_table_1 = supp_table_1.reindex(row_order)

# Give the class column its display name and fix the output column order.
column_order = [
    'SQANTI Protein class',
    'N-terminus',
    'Splicing',
    'C-terminus',
    'Note',
    'Number of protein isoforms',
]
supp_table_1 = supp_table_1.rename(
    columns={'pclass': 'SQANTI Protein class'}
)[column_order]

# Write the table as a tab-separated file without the index column.
supp_table_1.to_csv('stats/SuppTable1.tsv', sep='\t', index=False)