In [1]:
import gzip

import polars as pl

## Convert tsv to parquet

We created a TSV of UniProt entries using the [uniprot](https://github.com/heuermh/dishevelled-bio/tree/master/protein/src/main/java/org/dishevelled/bio/protein/uniprot) tool from [heuermh/dishevelled-bio](https://github.com/heuermh/dishevelled-bio). This created a 40 GB TSV file from the 219 GB gzipped XML file `uniprot_trembl.xml.gz` downloaded directly from [UniProt](https://www.uniprot.org/help/downloads).

```
uniprot-1.0-SNAPSHOT/bin/uniprot -i uniprot_trembl.xml.gz > uniprot_trembl_entries.tsv
```

First, let's convert this to a Parquet file so it is more compact (small enough for this GitHub repo!) and better suited to streaming.

In [2]:
%%time

# Lazily scan the tab-separated dump and write it out as Parquet.
# The TSV has no header row, so the seven column names are supplied here.
df = pl.scan_csv(
    source="../data/uniprot_trembl_entries.tsv",
    has_header=False,
    separator="\t",
    new_columns=[
        "organism", "organism_id", "lineage", "type",
        "reviewed", "unreviewed", "has_structure",
    ],
)
df.sink_parquet(path="../data/uniprot_trembl_entries.parquet")


More information on the new streaming engine: https://github.com/pola-rs/polars/issues/20947


CPU times: user 58.1 s, sys: 17.6 s, total: 1min 15s
Wall time: 1min 22s


In [3]:
# Peek at the first five rows of the lazily scanned TSV.
df.head(5).collect()

organism,organism_id,lineage,type,reviewed,unreviewed,has_structure
str,i64,str,str,bool,bool,bool
"""Pseudomonas rhodesiae""",76760,"""Bacteria; Pseudomonadota; Gamm…","""Bacteria""",False,True,False
"""Gossypium darwinii""",34276,"""Eukaryota; Viridiplantae; Stre…","""Plant""",False,True,False
"""Marmota monax""",9995,"""Eukaryota; Metazoa; Chordata; …","""Animal""",False,True,False
"""Neomarinimicrobiota bacterium""",2026760,"""Bacteria; Candidatus Neomarini…","""Bacteria""",False,True,False
"""Clostridium perfringens""",1502,"""Bacteria; Bacillati; Bacillota…","""Bacteria""",False,True,False


# Read in parquet

In [4]:
# Lazy handle on the Parquet copy written earlier; nothing is collected here.
uniprot_trembl = pl.scan_parquet(source="../data/uniprot_trembl_entries.parquet")
uniprot_trembl

## Assign kingdoms, domains, summarize per organism


In [5]:
%%time


def merge_microbes(species_type):
    """Collapse Bacteria, Archaea and Viruses into one "Microbial" label.

    Any other type is returned title-cased.
    """
    is_microbe = species_type in {"Bacteria", "Archaea", "Viruses"}
    return "Microbial" if is_microbe else species_type.title()


def assign_superdomain(species_type):
    """Label "Viruses" as "Non-cellular Life" and everything else as "Cellular Life"."""
    return "Non-cellular Life" if species_type == "Viruses" else "Cellular Life"


def assign_domain(species_type):
    """Return the domain label for a type.

    "Bacteria", "Archaea" and "Viruses" are returned unchanged; every other
    value is labelled "Eukaryota".
    """
    if species_type not in {"Bacteria", "Archaea", "Viruses"}:
        return "Eukaryota"
    return species_type


def assign_kingdom(species_type):
    """Group "Bacteria" and "Archaea" under "Monera"; pass other values through."""
    return "Monera" if species_type in {"Bacteria", "Archaea"} else species_type


# One row per (organism, organism_id, lineage, type): sum the boolean flag
# columns into counts, then derive coarser taxonomy labels from `type` using
# merge_microbes / assign_superdomain / assign_domain / assign_kingdom.
uniprot_trembl_summarized_per_organism = (
    uniprot_trembl.group_by("organism", "organism_id", "lineage", "type")
    .agg(
        reviewed_count=pl.col("reviewed").sum(),
        unreviewed_count=pl.col("unreviewed").sum(),
        # Count of entries flagged `has_structure`, given an explicit name
        pdb_structures_count=pl.col("has_structure").sum(),
    )
    .with_columns(
        type_merge_microbes=pl.col("type").map_elements(
            merge_microbes, return_dtype=pl.String
        ),
        superdomain=pl.col("type").map_elements(
            assign_superdomain, return_dtype=pl.String
        ),
        domain=pl.col("type").map_elements(
            assign_domain, return_dtype=pl.String
        ),
        kingdom=pl.col("type").map_elements(
            assign_kingdom, return_dtype=pl.String
        ),
    )
    .collect()
)
print(uniprot_trembl_summarized_per_organism.shape)
uniprot_trembl_summarized_per_organism.sort(by="reviewed_count", descending=True)

(1318326, 11)
CPU times: user 35.2 s, sys: 37.9 s, total: 1min 13s
Wall time: 16.5 s


organism,organism_id,lineage,type,reviewed_count,unreviewed_count,pdb_structures_count,type_merge_microbes,superdomain,domain,kingdom
str,i64,str,str,u32,u32,u32,str,str,str,str
"""Muscina sp. BIOUG31116-G08""",2362392,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""",0,1,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Billbergia distachya""",1093647,"""Eukaryota; Viridiplantae; Stre…","""Plant""",0,4,0,"""Plant""","""Cellular Life""","""Eukaryota""","""Plant"""
"""Meteoridium tenuissimum""",195109,"""Eukaryota; Viridiplantae; Stre…","""Plant""",0,3,0,"""Plant""","""Cellular Life""","""Eukaryota""","""Plant"""
"""Macropanesthia mackerrasae""",112945,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""",0,15,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Phassus sp. BOLD:AAI2895""",1216432,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""",0,1,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
…,…,…,…,…,…,…,…,…,…,…
"""Hymenoptera sp. BOLD:AAU9795""",1253168,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""",0,1,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Orcus sp. ANIC COC85""",2018437,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""",0,5,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Discartemon hypocrites""",2728048,"""Eukaryota; Metazoa; Spiralia; …","""Animal""",0,1,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Parasitidae sp. BIOUG14509-D03""",2245085,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""",0,1,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""


### Curious: How many unreviewed have PDB structures?

In [6]:
# Total of the per-organism `pdb_structures_count` column.
uniprot_trembl_summarized_per_organism.get_column("pdb_structures_count").sum()

30103

### Sanity check: this should be ~250M sequences for UniProtKB/TrEMBL release 2025_01

In [7]:
# Total of the per-organism `unreviewed_count` column.
uniprot_trembl_summarized_per_organism.get_column("unreviewed_count").sum()

252633201

In [8]:
# Pin the summed unreviewed count to the value observed for this input file.
assert (
    uniprot_trembl_summarized_per_organism.get_column("unreviewed_count").sum()
    == 252_633_201
)

### Write to file!

In [9]:
# Persist the per-organism summary table as Parquet.
uniprot_trembl_summarized_per_organism.write_parquet(
    file="../data/uniprot_trembl_taxonomy_summary.parquet"
)