### Long-read evidenced isoform

How many genes have an isoform confirmed by mass spectrometry (a unique peptide found for the isoform)
and at least 1 more isoform confirmed through transcript abundance and shared peptides?

Transcript abundance cutoff: >= 25 CPM

In [1]:
import pandas as pd

import os
# All statistics go into the 'stats' directory.
# exist_ok=True makes this a no-op when the directory already exists, so no
# separate existence check is needed (and nothing can create the directory
# between a check and the makedirs call).
os.makedirs('stats', exist_ok=True)


In [2]:
# Minimum CPM for an isoform to count as confirmed through transcript abundance.
cpm_cutoff = 25

# Isoform-level table; the cells below read its 'gene', 'accession',
# 'is_confirmed_ms' and 'CPM' columns.
isoforms_with_peptides_path = './stats/huvec_isoforms_with_found_peptides.tsv'
huvec_isoforms = pd.read_table(isoforms_with_peptides_path)

def is_confirmed_through_transcript_abundance(row, cpm_cutoff):
    """Tell whether an isoform counts as confirmed through transcript abundance.

    Parameters
    ----------
    row : mapping-like
        One isoform record exposing 'is_confirmed_ms' and 'CPM' entries
        (a DataFrame row when used with ``DataFrame.apply(axis=1)``).
    cpm_cutoff : number
        Minimum CPM (inclusive) required for confirmation.

    Returns
    -------
    bool
        False when the isoform is already confirmed by mass spec; otherwise
        the result of ``row['CPM'] >= cpm_cutoff``.
    """
    # Mass-spec confirmation takes precedence: such isoforms are never counted here.
    if row['is_confirmed_ms']:
        return False
    return row['CPM'] >= cpm_cutoff


# Evaluate the predicate once per isoform row; `args` forwards the cutoff as the
# predicate's second positional argument.
huvec_isoforms['is_confirmed_through_transcript_abundance'] = huvec_isoforms.apply(
    is_confirmed_through_transcript_abundance,
    axis=1,
    args=(cpm_cutoff,),
)

# Persist the annotated isoform table as a tab-separated file without the index.
huvec_isoforms.to_csv(
    './stats/huvec_transcript_confirmed_isoforms.tsv',
    sep='\t',
    index=False,
)


## Generate gene table

The gene table captures the breakdown of isoform confirmation 
- confirmed through mass spec : isoform has a unique peptide
- confirmed through transcript abundance: not confirmed through mass spec, but has at least 25 CPM and has shared peptides
- not confirmed: isoform has only shared peptides and has less than 25 CPM


In [3]:


def _isoforms_per_gene(isoforms, list_column):
    """Collapse an isoform-level table to one row per gene.

    Parameters
    ----------
    isoforms : pandas.DataFrame
        Isoform rows with 'gene' and 'accession' columns.
    list_column : str
        Name of the output column that holds the gene's accessions joined
        with '|'. The matching count column is named 'number_' + list_column.

    Returns
    -------
    pandas.DataFrame
        Columns 'gene', `list_column` and 'number_' + `list_column`.
    """
    count_column = 'number_' + list_column
    per_gene = (
        isoforms
            .groupby('gene')['accession']
            .apply(list)
            .reset_index(name=list_column)
    )
    # Count while the cell is still a list; after the join it is a single string.
    per_gene[count_column] = per_gene[list_column].apply(len)
    per_gene[list_column] = per_gene[list_column].apply('|'.join)
    return per_gene


# Isoforms flagged as confirmed by mass spec.
msconfirm = huvec_isoforms[huvec_isoforms['is_confirmed_ms']]
msconfirm_gene = _isoforms_per_gene(msconfirm, 'isoforms_confirmed_through_mass_spec')

# Isoforms flagged as confirmed through transcript abundance.
tconfirm = huvec_isoforms[huvec_isoforms['is_confirmed_through_transcript_abundance']]
tconfirm_gene = _isoforms_per_gene(tconfirm, 'isoforms_confirmed_through_transcript_abundance')

# Isoforms carrying neither flag.
not_confirmed = huvec_isoforms[(~huvec_isoforms['is_confirmed_ms']) & (~huvec_isoforms['is_confirmed_through_transcript_abundance'])]
not_confirmed_gene = _isoforms_per_gene(not_confirmed, 'isoforms_not_confirmed')

# Columns produced by the three per-gene summaries.
isoform_list_columns = [
    'isoforms_confirmed_through_mass_spec',
    'isoforms_confirmed_through_transcript_abundance',
    'isoforms_not_confirmed',
]
isoform_count_columns = [
    'number_isoforms_confirmed_through_mass_spec',
    'number_isoforms_confirmed_through_transcript_abundance',
    'number_isoforms_not_confirmed',
]

# A gene missing from one summary gets NaN after the outer merges:
# use an empty accession string and a zero count instead.
missing_value_fill = {column: '' for column in isoform_list_columns}
missing_value_fill.update({column: 0 for column in isoform_count_columns})

# Fill on the DataFrame itself rather than `huvec_gene[col].fillna(..., inplace=True)`:
# that chained in-place form acts on a temporary Series, so it warns in recent
# pandas and leaves the frame unchanged under Copy-on-Write, which would then
# make the integer cast below fail on NaN.
huvec_gene = (
    msconfirm_gene
        .merge(tconfirm_gene, on='gene', how='outer')
        .merge(not_confirmed_gene, on='gene', how='outer')
        .fillna(missing_value_fill)
        # Counts become float when NaN is introduced by the merge; restore integers.
        .astype({column: 'int' for column in isoform_count_columns})
)

# Boolean per-gene flags: True when the gene has at least one isoform in the category.
flag_to_count_column = {
    'has_isoforms_confirmed_mass_spec': 'number_isoforms_confirmed_through_mass_spec',
    'has_isoforms_confirmed_transcript_abundance': 'number_isoforms_confirmed_through_transcript_abundance',
}
for flag_column, count_column in flag_to_count_column.items():
    huvec_gene[flag_column] = huvec_gene[count_column] > 0

# Persist the gene table as a tab-separated file without the index.
huvec_gene.to_csv(
    './stats/huvec_transcript_confirmed_genes.tsv',
    sep='\t',
    index=False,
)


#### Breakdown of how many genes have isoforms confirmed through mass spec, transcript abundance, and not confirmed

In [4]:
# Number of genes for each combination of the two confirmation flags.
confirmation_flag_columns = [
    'has_isoforms_confirmed_mass_spec',
    'has_isoforms_confirmed_transcript_abundance',
]
huvec_gene.groupby(confirmation_flag_columns).size()

has_isoforms_confirmed_mass_spec  has_isoforms_confirmed_transcript_abundance
False                             False                                          3713
                                  True                                           2280
True                              False                                          4271
                                  True                                            180
dtype: int64

#### Breakdown of how many additional isoforms are confirmed through transcript abundance, given that at least one isoform was confirmed through mass-spec


In [5]:
# Genes with at least one mass-spec confirmed isoform AND at least one
# isoform confirmed through transcript abundance.
ms_and_transcript_mask = (
    huvec_gene['has_isoforms_confirmed_mass_spec']
    & huvec_gene['has_isoforms_confirmed_transcript_abundance']
)
has_ms_has_transcript = huvec_gene[ms_and_transcript_mask]

# Number of such genes per count of transcript-abundance confirmed isoforms.
(
    has_ms_has_transcript
        .groupby(['number_isoforms_confirmed_through_transcript_abundance'])
        .size()
        .reset_index(name='number_of_genes')
)

Unnamed: 0,number_isoforms_confirmed_through_transcript_abundance,number_of_genes
0,1,151
1,2,20
2,3,7
3,4,2


### Isoforms with only shared peptides confirmed with long-read info
How many genes have only shared peptides, and thus no isoforms confirmed through mass spectrometry,
but have at least 1 isoform confirmed through a combination of shared peptides and transcript abundance?

Transcript abundance cutoff: >= 25 CPM

In [6]:
# Genes with no mass-spec confirmed isoform but at least one isoform
# confirmed through transcript abundance.
transcript_only_mask = (
    ~huvec_gene['has_isoforms_confirmed_mass_spec']
    & huvec_gene['has_isoforms_confirmed_transcript_abundance']
)
no_ms_has_transcript = huvec_gene[transcript_only_mask]

# Number of such genes per count of transcript-abundance confirmed isoforms.
(
    no_ms_has_transcript
        .groupby(['number_isoforms_confirmed_through_transcript_abundance'])
        .size()
        .reset_index(name='number_of_genes')
)

Unnamed: 0,number_isoforms_confirmed_through_transcript_abundance,number_of_genes
0,1,2033
1,2,202
2,3,30
3,4,11
4,5,3
5,8,1
