# Enhanced protein isoform characterization through long-read proteogenomics 
## Supplementary Table 3
---
Ben Jordan

In [1]:
import pandas as pd
import config, data_loader


In [2]:

# Instantiate the project's peptide-level and protein-group-level result holders.
# NOTE(review): the per-database attributes used in later cells (e.g. .gencode_peptide,
# .gencode_protein_group) are presumably populated by these constructors — confirm in data_loader.
peptide = data_loader.Peptide()
protein_group = data_loader.ProteinGroup()

In [3]:
def get_genes(pg):
    """Return the union of all gene identifiers listed in ``pg['genes']``.

    Parameters
    ----------
    pg : mapping or DataFrame-like
        Object whose ``'genes'`` entry is an iterable of gene collections
        (one collection per protein group).

    Returns
    -------
    set
        Every distinct element found in any of the collections; an empty set
        when ``pg['genes']`` is empty.

    Notes
    -----
    Each entry is consumed as an iterable, so a bare string entry would be
    split into single characters. This assumes the caller supplies
    list/tuple/set entries — TODO confirm against data_loader.
    """
    genes = set()
    for group_genes in pg['genes']:
        # In-place update avoids building a brand-new set on every iteration.
        genes.update(group_genes)
    return genes


#### All protein-coding genes

In [4]:
# Supplementary Table 3, part 1: counts over all protein-coding genes.
# One (label, peptide table, protein-group table) entry per search database.
# The rows are collected first and the frame is built once, because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
all_gene_sources = [
    ('GENCODE', peptide.gencode_peptide, protein_group.gencode_protein_group),
    ('UniProt', peptide.uniprot_peptide, protein_group.uniprot_protein_group),
    ('PacBio Hybrid', peptide.pacbio_hybrid_peptide, protein_group.pacbio_hybrid_protein_group),
]

all_gene_rows = [
    {
        'Protein database' : database,
        'Peptides' : len(peptides),
        'Protein Groups' : len(protein_groups),
        'Genes' : len(get_genes(protein_groups)),
        'Gene space' : 'All protein-coding genes',
    }
    for database, peptides, protein_groups in all_gene_sources
]

# Passing `columns` pins the column order regardless of dict ordering.
supp_table_3 = pd.DataFrame(
    all_gene_rows,
    columns = ['Protein database', 'Peptides', 'Protein Groups', 'Genes', 'Gene space'],
)


#### High confidence space (HC space)

In [5]:
# Supplementary Table 3, part 2: the same counts restricted to the high-confidence space.
# Rows are gathered in a loop and added with a single pd.concat, because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
HC_GENE_SPACE = 'High confidence space (HC space)'

hc_sources = [
    ('GENCODE', peptide.gencode_peptide, protein_group.gencode_protein_group),
    ('UniProt', peptide.uniprot_peptide, protein_group.uniprot_protein_group),
    ('PacBio Hybrid', peptide.pacbio_hybrid_peptide, protein_group.pacbio_hybrid_protein_group),
]

hc_rows = []
for database, peptides, protein_groups in hc_sources:
    # Keep only the rows selected by each table's 'is_high_confidence' mask.
    hc_pep = peptides[peptides['is_high_confidence']]
    hc_pg = protein_groups[protein_groups['is_high_confidence']]
    # Count only genes that also appear in data_loader.genes_in_hiconf_space.
    hc_genes = data_loader.genes_in_hiconf_space.intersection(get_genes(hc_pg))
    row = {
        'Protein database' : database,
        'Peptides' : len(hc_pep),
        'Protein Groups' : len(hc_pg),
        'Genes' : len(hc_genes),
        'Gene space' : HC_GENE_SPACE,
    }
    hc_rows.append(row)

# Drop HC rows left by an earlier run of this cell, so re-running it does not
# duplicate them; on a first run this filter removes nothing.
supp_table_3 = pd.concat(
    [supp_table_3[supp_table_3['Gene space'] != HC_GENE_SPACE], pd.DataFrame(hc_rows)],
    ignore_index=True,
)
supp_table_3.to_csv('stats/SuppTable3.tsv', sep = '\t', index=False)


In [6]:
# Display the assembled table (the same frame the previous cell writes to stats/SuppTable3.tsv).
supp_table_3

Unnamed: 0,Protein database,Peptides,Protein Groups,Genes,Gene space
0,GENCODE,76255,7717,7666,All protein-coding genes
1,UniProt,76718,7623,7524,All protein-coding genes
2,PacBio Hybrid,75750,7702,7641,All protein-coding genes
3,GENCODE,52341,5120,4942,High confidence space (HC space)
4,UniProt,52494,5049,4927,High confidence space (HC space)
5,PacBio Hybrid,51754,5100,4914,High confidence space (HC space)


In [12]:
# Sanity check: re-select the GENCODE peptide rows picked out by the 'is_high_confidence' mask.
hc_pep = peptide.gencode_peptide[peptide.gencode_peptide['is_high_confidence']]

In [8]:
# Row count of the current hc_pep selection.
len(hc_pep)

52341

In [13]:
# Collect the distinct 'Full Sequence' values among the high-confidence GENCODE peptide rows.
# Comparing the set size with the row count shows whether any sequence is listed more than once.
gencode_peptides = peptide.gencode_peptide[peptide.gencode_peptide['is_high_confidence']]
gencode_peptide_set = set(gencode_peptides['Full Sequence'].unique())

In [14]:
# Number of distinct 'Full Sequence' values collected in gencode_peptide_set.
len(gencode_peptide_set)

52341