In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import polars as pl
import seaborn as sns
from ipywidgets import widgets

In [4]:
# Load the taxonomy summary table and report its (rows, columns) shape.
uniprot_sprot = pd.read_parquet(
    path="../data/uniprot_sprot_taxonomy_summary.parquet"
)
print(uniprot_sprot.shape)


def merge_microbes(species_type):
    """Collapse Bacteria, Archaea and Viruses into one "Microbial" label.

    Any other label is returned title-cased via ``str.title()``
    (for example ``"other Eukaryota"`` becomes ``"Other Eukaryota"``).
    """
    is_microbe = species_type in {"Bacteria", "Archaea", "Viruses"}
    return "Microbial" if is_microbe else species_type.title()


def assign_superdomain(species_type):
    """Return "Non-cellular Life" for "Viruses" and "Cellular Life" otherwise."""
    return "Non-cellular Life" if species_type == "Viruses" else "Cellular Life"


def assign_domain(species_type):
    """Map a species type to its domain label.

    "Bacteria", "Archaea" and "Viruses" are returned unchanged; every other
    label is reported as "Eukaryota".
    """
    if species_type not in {"Bacteria", "Archaea", "Viruses"}:
        return "Eukaryota"
    return species_type


def assign_kingdom(species_type):
    """Map a species type to its kingdom label.

    "Bacteria" and "Archaea" are grouped under "Monera"; all other labels
    (including "Viruses") pass through unchanged.
    """
    return "Monera" if species_type in {"Bacteria", "Archaea"} else species_type


# Derive the four taxonomy groupings from the "type" column in one step;
# keyword order fixes the order in which the new columns are appended.
uniprot_sprot = uniprot_sprot.assign(
    type_merge_microbes=lambda frame: frame["type"].apply(merge_microbes),
    superdomain=lambda frame: frame["type"].apply(assign_superdomain),
    domain=lambda frame: frame["type"].apply(assign_domain),
    kingdom=lambda frame: frame["type"].apply(assign_kingdom),
)
uniprot_sprot.head()

(14755, 7)


Unnamed: 0,organism,reviewed_count,unreviewed_count,pdb_structures_count,organism_id,lineage,type,type_merge_microbes,superdomain,domain,kingdom
0,Homo sapiens,20417,0,8511,9606,Catarrhini; Chordata; Craniata; Euarchontoglir...,Animal,Animal,Cellular Life,Eukaryota,Animal
1,Mus musculus,17228,0,2458,10090,Chordata; Craniata; Euarchontoglires; Eukaryot...,Animal,Animal,Cellular Life,Eukaryota,Animal
2,Arabidopsis thaliana,16396,0,1057,3702,Arabidopsis; Brassicaceae; Brassicales; Cameli...,Plant,Plant,Cellular Life,Eukaryota,Plant
3,Rattus norvegicus,8209,0,724,10116,Chordata; Craniata; Euarchontoglires; Eukaryot...,Animal,Animal,Cellular Life,Eukaryota,Animal
4,Saccharomyces cerevisiae (strain ATCC 204508 /...,6733,0,2114,559292,Ascomycota; Dikarya; Eukaryota; Fungi; Sacchar...,Fungi,Fungi,Cellular Life,Eukaryota,Fungi


### What percentage of the PDB structures come from human (*Homo sapiens*)?

In [5]:
# Total number of PDB structures summed over every organism row.
uniprot_sprot["pdb_structures_count"].sum()

np.uint64(36768)

In [6]:
# Fraction of all PDB structures that belong to "Homo sapiens".
# The human count is looked up from the frame rather than typed in by hand,
# so the ratio stays correct if the underlying data is refreshed.
(
    uniprot_sprot.loc[
        uniprot_sprot["organism"] == "Homo sapiens", "pdb_structures_count"
    ].sum()
    / uniprot_sprot["pdb_structures_count"].sum()
)

np.float64(0.2314784595300261)

In [9]:
# Write the frame, with its derived taxonomy columns, to a new parquet file.
uniprot_sprot.to_parquet(
    path="../data/uniprot_sprot_taxonomy_summary_with_domains.parquet"
)

In [10]:
# Ten organisms with the most reviewed entries, largest first.
uniprot_sprot.sort_values(by="reviewed_count", ascending=False).iloc[:10]

Unnamed: 0,organism,reviewed_count,unreviewed_count,pdb_structures_count,organism_id,lineage,type,type_merge_microbes,superdomain,domain,kingdom
0,Homo sapiens,20417,0,8511,9606,Catarrhini; Chordata; Craniata; Euarchontoglir...,Animal,Animal,Cellular Life,Eukaryota,Animal
1,Mus musculus,17228,0,2458,10090,Chordata; Craniata; Euarchontoglires; Eukaryot...,Animal,Animal,Cellular Life,Eukaryota,Animal
2,Arabidopsis thaliana,16396,0,1057,3702,Arabidopsis; Brassicaceae; Brassicales; Cameli...,Plant,Plant,Cellular Life,Eukaryota,Plant
3,Rattus norvegicus,8209,0,724,10116,Chordata; Craniata; Euarchontoglires; Eukaryot...,Animal,Animal,Cellular Life,Eukaryota,Animal
4,Saccharomyces cerevisiae (strain ATCC 204508 /...,6733,0,2114,559292,Ascomycota; Dikarya; Eukaryota; Fungi; Sacchar...,Fungi,Fungi,Cellular Life,Eukaryota,Fungi
5,Bos taurus,6048,0,560,9913,Artiodactyla; Bos; Bovidae; Bovinae; Chordata;...,Animal,Animal,Cellular Life,Eukaryota,Animal
6,Schizosaccharomyces pombe (strain 972 / ATCC 2...,5121,0,425,284812,Ascomycota; Dikarya; Eukaryota; Fungi; Schizos...,Fungi,Fungi,Cellular Life,Eukaryota,Fungi
7,Escherichia coli (strain K12),4531,0,1732,83333,Bacteria; Enterobacterales; Enterobacteriaceae...,Bacteria,Microbial,Cellular Life,Bacteria,Monera
8,Caenorhabditis elegans,4489,0,278,6239,Caenorhabditis; Chromadorea; Ecdysozoa; Eukary...,Animal,Animal,Cellular Life,Eukaryota,Animal
9,Oryza sativa subsp. japonica,4191,0,103,39947,BOP clade; Embryophyta; Eukaryota; Liliopsida;...,Plant,Plant,Cellular Life,Eukaryota,Plant


In [11]:
# Ten organisms with the most PDB structures, largest first.
uniprot_sprot.sort_values(by="pdb_structures_count", ascending=False).iloc[:10]

Unnamed: 0,organism,reviewed_count,unreviewed_count,pdb_structures_count,organism_id,lineage,type,type_merge_microbes,superdomain,domain,kingdom
0,Homo sapiens,20417,0,8511,9606,Catarrhini; Chordata; Craniata; Euarchontoglir...,Animal,Animal,Cellular Life,Eukaryota,Animal
1,Mus musculus,17228,0,2458,10090,Chordata; Craniata; Euarchontoglires; Eukaryot...,Animal,Animal,Cellular Life,Eukaryota,Animal
4,Saccharomyces cerevisiae (strain ATCC 204508 /...,6733,0,2114,559292,Ascomycota; Dikarya; Eukaryota; Fungi; Sacchar...,Fungi,Fungi,Cellular Life,Eukaryota,Fungi
7,Escherichia coli (strain K12),4531,0,1732,83333,Bacteria; Enterobacterales; Enterobacteriaceae...,Bacteria,Microbial,Cellular Life,Bacteria,Monera
2,Arabidopsis thaliana,16396,0,1057,3702,Arabidopsis; Brassicaceae; Brassicales; Cameli...,Plant,Plant,Cellular Life,Eukaryota,Plant
10,Bacillus subtilis (strain 168),4191,0,753,224308,Bacillaceae; Bacillales; Bacillati; Bacilli; B...,Bacteria,Microbial,Cellular Life,Bacteria,Monera
15,Mycobacterium tuberculosis (strain ATCC 25618 ...,2325,0,742,83332,Actinomycetes; Actinomycetota; Bacillati; Bact...,Bacteria,Microbial,Cellular Life,Bacteria,Monera
3,Rattus norvegicus,8209,0,724,10116,Chordata; Craniata; Euarchontoglires; Eukaryot...,Animal,Animal,Cellular Life,Eukaryota,Animal
5,Bos taurus,6048,0,560,9913,Artiodactyla; Bos; Bovidae; Bovinae; Chordata;...,Animal,Animal,Cellular Life,Eukaryota,Animal
12,Drosophila melanogaster,3816,0,455,7227,Arthropoda; Brachycera; Diptera; Drosophila; D...,Animal,Animal,Cellular Life,Eukaryota,Animal


In [12]:
# Number of organism rows in each (superdomain, domain, kingdom) combination.
uniprot_sprot.groupby(by=["superdomain", "domain", "kingdom"]).size()

superdomain        domain     kingdom        
Cellular Life      Archaea    Monera              236
                   Bacteria   Monera             3353
                   Eukaryota  Animal             4967
                              Fungi               871
                              Plant              2194
                              other Eukaryota     368
Non-cellular Life  Viruses    Viruses            2766
dtype: int64

In [13]:
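# NOTE: stale cell kept for reference - the frame shown above has no "n_proteins"
# column (protein counts are stored in "reviewed_count").
# uniprot_sprot.sort_values("n_proteins", ascending=False).head()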

In [14]:
# One row per organism: the two count columns are summed, descriptive columns
# take the first value seen. Dict key order determines output column order.
uniprot_sprot_summarized_per_organism = uniprot_sprot.groupby("organism").agg(
    {
        "reviewed_count": "sum",
        **dict.fromkeys(
            ["organism", "organism_id", "lineage", "type", "type_merge_microbes"],
            "first",
        ),
        "pdb_structures_count": "sum",
    }
)
print(uniprot_sprot_summarized_per_organism.shape)
uniprot_sprot_summarized_per_organism.sort_values(
    by="reviewed_count", ascending=False
)

(14755, 7)


Unnamed: 0_level_0,reviewed_count,organism,organism_id,lineage,type,type_merge_microbes,pdb_structures_count
organism,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Homo sapiens,20417,Homo sapiens,9606,Catarrhini; Chordata; Craniata; Euarchontoglir...,Animal,Animal,8511
Mus musculus,17228,Mus musculus,10090,Chordata; Craniata; Euarchontoglires; Eukaryot...,Animal,Animal,2458
Arabidopsis thaliana,16396,Arabidopsis thaliana,3702,Arabidopsis; Brassicaceae; Brassicales; Cameli...,Plant,Plant,1057
Rattus norvegicus,8209,Rattus norvegicus,10116,Chordata; Craniata; Euarchontoglires; Eukaryot...,Animal,Animal,724
Saccharomyces cerevisiae (strain ATCC 204508 / S288c),6733,Saccharomyces cerevisiae (strain ATCC 204508 /...,559292,Ascomycota; Dikarya; Eukaryota; Fungi; Sacchar...,Fungi,Fungi,2114
...,...,...,...,...,...,...,...
Plasmodium falciparum (isolate FCH-5),1,Plasmodium falciparum (isolate FCH-5),132416,Aconoidasida; Alveolata; Apicomplexa; Eukaryot...,other Eukaryota,Other Eukaryota,0
Lilium michiganense,1,Lilium michiganense,84049,Embryophyta; Eukaryota; Liliaceae; Liliales; L...,Plant,Plant,0
Lilium regale,1,Lilium regale,82328,Embryophyta; Eukaryota; Liliaceae; Liliales; L...,Plant,Plant,0
Foot-and-mouth disease virus (isolate Bovine/Brazil/A24Cruzeiro/1955 serotype A),1,Foot-and-mouth disease virus (isolate Bovine/B...,12115,Aphthovirus; Caphthovirinae; Foot-and-mouth di...,Viruses,Microbial,1


In [16]:
# Reviewed-entry totals per organism, ordered from most to fewest.
uniprot_sprot_n_proteins_per_organism = (
    uniprot_sprot.groupby("organism")["reviewed_count"]
    .sum()
    .sort_values(ascending=False)
)
uniprot_sprot_n_proteins_per_organism.iloc[:10]

organism
Homo sapiens                                             20417
Mus musculus                                             17228
Arabidopsis thaliana                                     16396
Rattus norvegicus                                         8209
Saccharomyces cerevisiae (strain ATCC 204508 / S288c)     6733
Bos taurus                                                6048
Schizosaccharomyces pombe (strain 972 / ATCC 24843)       5121
Escherichia coli (strain K12)                             4531
Caenorhabditis elegans                                    4489
Bacillus subtilis (strain 168)                            4191
Name: reviewed_count, dtype: uint32

In [17]:
# How many organism rows fall into each merged type label.
uniprot_sprot["type_merge_microbes"].value_counts()

type_merge_microbes
Microbial          6355
Animal             4967
Plant              2194
Fungi               871
Other Eukaryota     368
Name: count, dtype: int64

In [19]:
# Reviewed-entry totals per merged type, stored as a Series named
# "n_proteins_sprot".
uniprot_sprot_n_proteins = (
    uniprot_sprot.groupby("type_merge_microbes")["reviewed_count"]
    .sum()
    .rename("n_proteins_sprot")
)
uniprot_sprot_n_proteins

type_merge_microbes
Animal             110164
Fungi               37406
Microbial          373984
Other Eukaryota      9540
Plant               41876
Name: n_proteins_sprot, dtype: uint32

### Sanity check: the total equals the expected number of UniProt/SwissProt proteins for release 2025_01

In [22]:
# Total number of proteins this notebook expects across all merged types.
expected_n_proteins = 572970
observed_n_proteins = int(uniprot_sprot_n_proteins.sum())
# Report both numbers on failure so a mismatch is diagnosable from the traceback.
assert observed_n_proteins == expected_n_proteins, (
    f"Expected {expected_n_proteins} proteins, found {observed_n_proteins}"
)