# Isoform protein identification
 The purpose of this analysis is to extract information on the number of genes with multiple isoforms.
### Mass-spectrometry information only
- Number of genes where isoforms were only identified with shared peptides
- Number of genes where a single isoform was identified with unique peptides
- Number of genes where multiple isoforms were identified with unique peptides


In [1]:
import pandas as pd
from huvec_analysis import data_loader, huvec_config
from Bio import SeqIO

import os
# all statistics go into a directory
if not os.path.exists('stats'):
    os.makedirs('stats')


In [2]:

protein_sequences = [] 
for record in SeqIO.parse(f'{huvec_config.PIPELINE_RESULTS_DIRECTORY}/hybrid_protein_database/huvec_hybrid.fasta', 'fasta'):
    acc = record.description.split('|')[1].strip()
    gene_name = record.description.split('GN=')[1]
    val = {
        'accession': acc, 
        'gene' : gene_name,
        'sequence': str(record.seq)
    }
    protein_sequences.append(val)

transcript_abundance = pd.read_table(f'{huvec_config.PIPELINE_RESULTS_DIRECTORY}/hybrid_protein_database/huvec_refined_high_confidence.tsv', usecols=['base_acc', 'CPM'])


In [3]:
huvec_peptides = data_loader.read_peptide_file(data_loader.pacbio_hybrid_peptide_file, data_loader.hybrid_gene_map)
huvec_peptides['is_high_confidence'] =huvec_peptides['accs'].apply(lambda accs: data_loader.is_high_confidence(accs, data_loader.accs_in_hiconf_space))

# huvec_peptides['is_isoform_distinct'] = huvec_peptides['accs'].apply(lambda accs: len(accs) == 1)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
def find_accessions_and_genes(peptide, protein_sequences):
    accessions = []
    genes = []
    if '|' in peptide:
        all_peptides = set(peptide.split('|'))
        for val in protein_sequences:
            for pep in all_peptides:
                if pep in val['sequence']:
                    accessions.append(val['accession'])
                    genes.append(val['gene'])
                    continue
    else:
        for val in protein_sequences:
            if peptide in val['sequence']:
                accessions.append(val['accession'])
                genes.append(val['gene'])
    return [accessions, genes]



#huvec_peptides = huvec_peptides.head().copy()
huvec_peptides['accessions_genes'] = huvec_peptides['Base Sequence'].apply(lambda peptide: find_accessions_and_genes(peptide, protein_sequences))
huvec_peptides['accessions'] = huvec_peptides['accessions_genes'].apply(lambda x : x[0])
huvec_peptides['genes'] = huvec_peptides['accessions_genes'].apply(lambda x: x[1])
huvec_peptides['num_accessions'] = huvec_peptides['accessions'].apply(len)
huvec_peptides['is_peptide_uniquely_mapping'] = huvec_peptides['num_accessions'] == 1 

huvec_peptides['is_isoform_distinct'] = huvec_peptides['is_peptide_uniquely_mapping']



### Make peptide table
Peptide table contains columns in AllPeptides.tsv
- filtered to only include target peptides
- filtered to only include peptides with QValue < 0.01
- map peptide Base Sequence to fasta database


In [5]:
huvec_peptides['accessions'] = huvec_peptides['accessions_genes'].apply(lambda x : x[0])
huvec_peptides['genes'] = huvec_peptides['accessions_genes'].apply(lambda x: x[1])
huvec_peptides['num_accessions'] = huvec_peptides['accessions'].apply(len)
huvec_peptides['is_peptide_uniquely_mapping'] = huvec_peptides['num_accessions'] == 1 

huvec_peptides['is_isoform_distinct'] = huvec_peptides['is_peptide_uniquely_mapping']
huvec_peptides['accessions'] = huvec_peptides['accessions'].apply(lambda x: '|'.join(x))
huvec_peptides['genes'] = huvec_peptides['genes'].apply(lambda x: '|'.join(x))
huvec_peptides.to_csv('./stats/huvec_peptides_mapped_accessions.tsv', sep='\t', index=False)

In [6]:



huvec_peptides = pd.read_table('./stats/huvec_peptides_mapped_accessions.tsv')
huvec_peptides['accessions'] = huvec_peptides['accessions'].apply(lambda x: str(x).split('|'))
huvec_peptides['genes'] = huvec_peptides['genes'].apply(lambda x: str(x).split('|'))

huvec_peptides['accessions_genes'] = huvec_peptides.apply(lambda row: [row['accessions'], row['genes']] ,axis=1)
huvec_peptides['num_accessions'] = huvec_peptides['accessions'].apply(len)
huvec_peptides['is_peptide_uniquely_mapping'] = huvec_peptides['num_accessions'] == 1 

huvec_peptides['is_isoform_distinct'] = huvec_peptides['is_peptide_uniquely_mapping']



In [7]:
huvec_peptides = pd.read_table('./stats/huvec_peptides_mapped_accessions.tsv')
huvec_peptides['accessions'] = huvec_peptides['accessions'].apply(lambda x: str(x).split('|'))
huvec_peptides['genes'] = huvec_peptides['genes'].apply(lambda x: str(x).split('|'))

huvec_peptides['accessions_genes'] = huvec_peptides.apply(lambda row: [row['accessions'], row['genes']] ,axis=1)



### Make isoform table
The isoform table contains information of the unique and shared peptides identified through mass-spectrometry for each isoform. Also included is the transcript abundance for PacBio isoforms.

In [8]:

huvec_peptides['num_isoform_multimap'] = huvec_peptides['accessions'].apply(lambda x: [len(x)] * len(x))
huvec_peptides['Sequence'] = huvec_peptides['Base Sequence']
exploded = huvec_peptides.set_index(['Sequence'])[['accessions', 'genes', 'num_isoform_multimap']].apply(pd.Series.explode).reset_index()
isoform_unique_peptides = exploded[exploded['num_isoform_multimap'] == 1].groupby(['accessions', 'genes'])['Sequence'].apply(list).reset_index()
isoform_unique_peptides.rename(columns = {'Sequence' : 'unique_peptides'}, inplace=True)
isoform_shared_peptides = exploded[exploded['num_isoform_multimap'] > 1].groupby(['accessions', 'genes'])['Sequence'].apply(list).reset_index()
isoform_shared_peptides.rename(columns = {'Sequence' : 'shared_peptides'}, inplace=True)

huvec_isoforms = isoform_unique_peptides.merge(isoform_shared_peptides, how = 'outer', on = ['accessions','genes'])
huvec_isoforms['is_confirmed_ms'] = ~huvec_isoforms['unique_peptides'].isna()
huvec_isoforms.rename(columns={'accessions' : 'accession', 'genes': 'gene'}, inplace=True)
huvec_isoforms['unique peptides'] = huvec_isoforms['unique_peptides'].apply(lambda x: '|'.join(x) if type(x) is list else '')
huvec_isoforms['shared peptides'] = huvec_isoforms['shared_peptides'].apply(lambda x: '|'.join(x) if type(x) is list else '')
huvec_isoforms = huvec_isoforms.merge(transcript_abundance[['base_acc', 'CPM']], left_on='accession', right_on='base_acc', how = 'left')

huvec_isoforms.drop(columns=['unique_peptides', 'shared_peptides', 'base_acc'], inplace=True)
huvec_isoforms.to_csv('./stats/huvec_isoforms_with_found_peptides.tsv', sep='\t', index=False)

### Make gene table
The gene table contains information on isoforms confirmed with mass spectrometry as well as isoforms that only contain shared peptides.

In [9]:
confirmed_isoforms = huvec_isoforms[huvec_isoforms['is_confirmed_ms']][['accession', 'gene']]
confirmed_gene_isoforms = confirmed_isoforms.groupby('gene')['accession'].apply(list).reset_index(name = 'isoforms_with_unique_peptides')
confirmed_gene_isoforms['number_of_isoforms_with_unique_peptides'] = confirmed_gene_isoforms['isoforms_with_unique_peptides'].apply(len)
confirmed_gene_isoforms['isoforms_with_unique_peptides'] = confirmed_gene_isoforms['isoforms_with_unique_peptides'].apply(lambda x: '|'.join(x))

shared_isoforms = huvec_isoforms[~huvec_isoforms['is_confirmed_ms']][['accession', 'gene']]
shared_gene_isoforms = shared_isoforms.groupby('gene')['accession'].apply(list).reset_index(name = 'isoforms_with_only_shared_peptides')
shared_gene_isoforms['number_of_isoforms_with_shared_only_peptides'] = shared_gene_isoforms['isoforms_with_only_shared_peptides'].apply(len)
shared_gene_isoforms['isoforms_with_only_shared_peptides'] = shared_gene_isoforms['isoforms_with_only_shared_peptides'].apply(lambda x: '|'.join(x))


huvec_genes = pd.merge(confirmed_gene_isoforms, shared_gene_isoforms, on='gene', how = 'outer')
huvec_genes['isoforms_with_unique_peptides'].fillna('', inplace=True)
huvec_genes['isoforms_with_only_shared_peptides'].fillna('', inplace=True)
huvec_genes['number_of_isoforms_with_unique_peptides'].fillna(0, inplace=True)
huvec_genes['number_of_isoforms_with_shared_only_peptides'].fillna(0, inplace=True)
huvec_genes = huvec_genes.astype({"number_of_isoforms_with_unique_peptides":'int', "number_of_isoforms_with_shared_only_peptides":'int'}) 
isoforms_in_database = pd.DataFrame(protein_sequences)
genes_database_sizes = isoforms_in_database.groupby('gene').size().reset_index(name='number_isoforms_in_database')
huvec_genes = huvec_genes.merge(genes_database_sizes, on = 'gene', how = 'left' )
huvec_genes.to_csv('./stats/huvec_genes_found_in_mass_spec.tsv', sep='\t', index=False)


### Statistics on genes based on isoform specific peptide evidence

In [10]:
print("Number of genes found through mass spec")
print(len(huvec_genes))


Number of genes found through mass spec
10444


In [11]:
print('All genes: PB high confidence + Gencode')
print('The number of genes where there are no unique peptides mapping to any isoform')
print(len(huvec_genes[huvec_genes['isoforms_with_unique_peptides'] == '']))


All genes: PB high confidence + Gencode
The number of genes where there are no unique peptides mapping to any isoform
5993


In [12]:
print("Number of genes with at least one unique isoform")
print(len(
    huvec_genes[
        (huvec_genes['number_of_isoforms_with_unique_peptides'] > 0)]
    ))
print("Number of genes with confirmed isoforms where only one isoform is in database")
print(len(
    huvec_genes[
        (huvec_genes['number_of_isoforms_with_unique_peptides'] > 0) &
        (huvec_genes['number_isoforms_in_database'] == 1)]
    ))
print("Number of genes with unique isoforms where  multiple isoforms are in the database")
print(len(
    huvec_genes[
        (huvec_genes['number_of_isoforms_with_unique_peptides'] > 0) &
        (huvec_genes['number_isoforms_in_database']  > 1)]
    ))



Number of genes with at least one unique isoform
4451
Number of genes with confirmed isoforms where only one isoform is in database
1748
Number of genes with unique isoforms where  multiple isoforms are in the database
2703


In [13]:
print("Breakdown of number of isoforms confirmed per gene")
huvec_genes.groupby('number_of_isoforms_with_unique_peptides').size().reset_index(name='number_of_genes')

Breakdown of number of isoforms confirmed per gene


Unnamed: 0,number_of_isoforms_with_unique_peptides,number_of_genes
0,0,5993
1,1,4345
2,2,97
3,3,8
4,7,1
