In [1]:
import polars as pl

In [2]:
# Lazily scan the `uniprot_sprot` per-organism taxonomy summary; nothing is
# read from disk until `.collect()` is called.
sprot_summary_path = "../data/uniprot_sprot_taxonomy_summary_with_domains.parquet"
unprot_sprot = pl.scan_parquet(sprot_summary_path)

# Materialise only the first five rows as a quick preview of the schema.
unprot_sprot.head(5).collect()

organism,reviewed_count,unreviewed_count,pdb_structures_count,organism_id,lineage,type,type_merge_microbes,superdomain,domain,kingdom
str,u32,u32,u32,i64,str,str,str,str,str,str
"""Homo sapiens""",20417,0,8511,9606,"""Catarrhini; Chordata; Craniata…","""Animal""","""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Mus musculus""",17228,0,2458,10090,"""Chordata; Craniata; Euarchonto…","""Animal""","""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Arabidopsis thaliana""",16396,0,1057,3702,"""Arabidopsis; Brassicaceae; Bra…","""Plant""","""Plant""","""Cellular Life""","""Eukaryota""","""Plant"""
"""Rattus norvegicus""",8209,0,724,10116,"""Chordata; Craniata; Euarchonto…","""Animal""","""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Saccharomyces cerevisiae (stra…",6733,0,2114,559292,"""Ascomycota; Dikarya; Eukaryota…","""Fungi""","""Fungi""","""Cellular Life""","""Eukaryota""","""Fungi"""


In [3]:
# Lazily scan the `uniprot_trembl` per-organism taxonomy summary; nothing is
# read from disk until `.collect()` is called.
trembl_summary_path = "../data/uniprot_trembl_taxonomy_summary.parquet"
unprot_trembl = pl.scan_parquet(trembl_summary_path)

# Materialise only the first five rows as a quick preview of the schema.
unprot_trembl.head(5).collect()

organism,organism_id,lineage,type,reviewed_count,unreviewed_count,pdb_structures_count,type_merge_microbes,superdomain,domain,kingdom
str,i64,str,str,u32,u32,u32,str,str,str,str
"""Influenza A virus""",1416034,"""Viruses; Riboviria; Orthornavi…","""Viruses""",0,4,0,"""Microbial""","""Non-cellular Life""","""Viruses""","""Viruses"""
"""Rhodomicrobium sp. MT211""",1053477,"""Bacteria; Pseudomonadota; Alph…","""Bacteria""",0,2,0,"""Microbial""","""Cellular Life""","""Bacteria""","""Monera"""
"""Cordyla sp. BIOUG28871-D07""",2360621,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""",0,1,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Neanuridae sp. NAMIB320""",2710230,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""",0,1,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Chloroflexus sp. WC5-1""",1276149,"""Bacteria; Bacillati; Chlorofle…","""Bacteria""",0,2,0,"""Microbial""","""Cellular Life""","""Bacteria""","""Monera"""


In [4]:
"organism	organism_id	lineage	type type_merge_microbes	superdomain	domain	kingdom".split()

['organism',
 'organism_id',
 'lineage',
 'type',
 'type_merge_microbes',
 'superdomain',
 'domain',
 'kingdom']

In [5]:
%%time

uniprot_combined = (
    pl.concat([unprot_sprot, unprot_trembl.select(unprot_sprot.columns)])
    .group_by(
        [
            "organism",
            "organism_id",
            "lineage",
            "type",
            "type_merge_microbes",
            "superdomain",
            "domain",
            "kingdom",
        ]
    )
    .agg(
        pl.col("pdb_structures_count").sum(),
        pl.col("reviewed_count").sum(),
        pl.col("unreviewed_count").sum(),
    )
    .sort("pdb_structures_count", descending=True)
    .collect()
)
uniprot_combined

CPU times: user 472 ms, sys: 380 ms, total: 852 ms
Wall time: 176 ms




organism,organism_id,lineage,type,type_merge_microbes,superdomain,domain,kingdom,pdb_structures_count,reviewed_count,unreviewed_count
str,i64,str,str,str,str,str,str,u32,u32,u32
"""Homo sapiens""",9606,"""Catarrhini; Chordata; Craniata…","""Animal""","""Animal""","""Cellular Life""","""Eukaryota""","""Animal""",8511,20417,0
"""Mus musculus""",10090,"""Chordata; Craniata; Euarchonto…","""Animal""","""Animal""","""Cellular Life""","""Eukaryota""","""Animal""",2458,17228,0
"""Saccharomyces cerevisiae (stra…",559292,"""Ascomycota; Dikarya; Eukaryota…","""Fungi""","""Fungi""","""Cellular Life""","""Eukaryota""","""Fungi""",2114,6733,0
"""Escherichia coli (strain K12)""",83333,"""Bacteria; Enterobacterales; En…","""Bacteria""","""Microbial""","""Cellular Life""","""Bacteria""","""Monera""",1732,4531,0
"""Arabidopsis thaliana""",3702,"""Arabidopsis; Brassicaceae; Bra…","""Plant""","""Plant""","""Cellular Life""","""Eukaryota""","""Plant""",1057,16396,0
…,…,…,…,…,…,…,…,…,…,…
"""HIV-1 M:G_K1184""",1243510,"""Viruses; Riboviria; Pararnavir…","""Viruses""","""Microbial""","""Non-cellular Life""","""Viruses""","""Viruses""",0,0,1
"""Sennertia sp. BIOUG03935-H09""",2444981,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""","""Animal""","""Cellular Life""","""Eukaryota""","""Animal""",0,0,1
"""Influenza A virus""",1593641,"""Viruses; Riboviria; Orthornavi…","""Viruses""","""Microbial""","""Non-cellular Life""","""Viruses""","""Viruses""",0,0,12
"""Orthocladiinae sp. BIOUG20642-…",2198333,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""","""Animal""","""Cellular Life""","""Eukaryota""","""Animal""",0,0,1


### How many total proteins with a PDB structure, across both reviewed and unreviewed?

In [6]:
# Grand total of the `pdb_structures_count` column over every organism row.
pdb_counts = uniprot_combined["pdb_structures_count"]
pdb_counts.sum()

66871

In [7]:
# Persist the combined per-organism table next to the input summaries.
combined_output_path = "../data/uniprot_combined_sprot_trembl_taxonomy_summary.parquet"
uniprot_combined.write_parquet(combined_output_path)

In [8]:
# Column-wise totals of reviewed and unreviewed entry counts, shown as a
# one-row frame.
count_totals = pl.col("reviewed_count", "unreviewed_count").sum()
uniprot_combined.select(count_totals)

reviewed_count,unreviewed_count
u32,u32
572970,252633201


### How many from human? -> Human has 8511 proteins with PDB structures

In [9]:
# Percentage of all PDB-structure counts that belong to human.
# The human count is looked up from the table instead of being retyped from an
# earlier output, so it cannot go stale when the input data is refreshed.
# 9606 is the `organism_id` shown for "Homo sapiens" in the tables above;
# summing covers the case where human occupies more than one row.
human_organism_id = 9606
human_pdb_structures = uniprot_combined.filter(
    pl.col("organism_id") == human_organism_id
)["pdb_structures_count"].sum()

100 * human_pdb_structures / uniprot_combined["pdb_structures_count"].sum()

12.727490242407022

## What percent of all proteins does UniProt represent?

In [10]:
# Total number of UniProt entries = reviewed + unreviewed, summed over all
# organisms. Derived from `uniprot_combined` rather than copying the two
# totals by hand from an earlier cell's output.
total_uniprot = (
    uniprot_combined["reviewed_count"].sum()
    + uniprot_combined["unreviewed_count"].sum()
)
f"{total_uniprot:,}"

'253,206,171'

In [11]:
# Assumed total number of proteins used as the denominator below (14 trillion).
# NOTE(review): the source of this estimate is not recorded in the notebook.
n_predicted_proteins = 14.0 * 10**12
format(n_predicted_proteins, ",")

'14,000,000,000,000.0'

In [12]:
# Fraction (not percent) of the predicted protein total that UniProt covers.
uniprot_fraction = total_uniprot / n_predicted_proteins
uniprot_fraction

1.808615507142857e-05

### Show as a percent

In [13]:
f"{100*total_uniprot /n_predicted_proteins:.5f}"

'0.00181'