In [1]:
import gzip

import polars as pl

## Convert tsv to parquet

We created a TSV of UniProt entries using the [uniprot](https://github.com/heuermh/dishevelled-bio/tree/master/protein/src/main/java/org/dishevelled/bio/protein/uniprot) tool from [heuermh/dishevelled-bio](https://github.com/heuermh/dishevelled-bio). This created a 40 GB TSV file from the 219 GB gzipped XML file `uniprot_trembl.xml.gz` downloaded directly from [UniProt](https://www.uniprot.org/help/downloads).

```
uniprot-1.0-SNAPSHOT/bin/uniprot -i uniprot_trembl.xml.gz > uniprot_trembl_entries.tsv
```

First, let's convert this to a Parquet file so that it is more compact (small enough for this GitHub repo!) and better suited to streaming.

In [4]:
%%time

# Lazily scan the headerless, tab-separated export, naming its seven columns,
# then write the result out as parquet.
tsv_columns = [
    "organism",
    "organism_id",
    "lineage",
    "type",
    "reviewed",
    "unreviewed",
    "has_structure",
]
df = pl.scan_csv(
    "../data/uniprot_trembl_entries.tsv",
    has_header=False,
    separator="\t",
    new_columns=tsv_columns,
)
df.sink_parquet("../data/uniprot_trembl_entries.parquet")


More information on the new streaming engine: https://github.com/pola-rs/polars/issues/20947


CPU times: user 59.5 s, sys: 19.6 s, total: 1min 19s
Wall time: 1min 23s


In [3]:
# Peek at the first five rows of the scanned TSV. `df` is defined by the
# conversion cell, so that cell has to run before this one.
df.head(5).collect()

NameError: name 'df' is not defined

## Read in the Parquet file

In [5]:
# Open the converted parquet file as a lazy frame and display it.
entries_parquet_path = "../data/uniprot_trembl_entries.parquet"
uniprot_trembl = pl.scan_parquet(entries_parquet_path)
uniprot_trembl

## Assign kingdoms, domains, summarize per organism


In [6]:
def merge_microbes(species_type):
    """Collapse bacteria, archaea and viruses into a single "Microbial" label.

    Any other value is returned in title case.
    """
    microbial_types = {"Bacteria", "Archaea", "Viruses"}
    return "Microbial" if species_type in microbial_types else species_type.title()


def assign_superdomain(species_type):
    """Label viruses as "Non-cellular Life" and everything else as "Cellular Life"."""
    is_virus = species_type == "Viruses"
    return "Non-cellular Life" if is_virus else "Cellular Life"


def assign_domain(species_type):
    """Return the type itself for bacteria, archaea and viruses, else "Eukaryota"."""
    non_eukaryote_types = {"Bacteria", "Archaea", "Viruses"}
    if species_type not in non_eukaryote_types:
        return "Eukaryota"
    return species_type


def assign_kingdom(species_type):
    """Group bacteria and archaea under "Monera"; pass any other type through."""
    prokaryote_types = {"Bacteria", "Archaea"}
    return "Monera" if species_type in prokaryote_types else species_type


# Each derived column is the `type` column passed through one helper;
# the mapping keeps output column name and helper side by side.
derived_type_columns = {
    "type_merge_microbes": merge_microbes,
    "superdomain": assign_superdomain,
    "domain": assign_domain,
    "kingdom": assign_kingdom,
}
uniprot_trembl = uniprot_trembl.with_columns(
    [
        pl.col("type").map_elements(helper, return_dtype=pl.String).alias(column)
        for column, helper in derived_type_columns.items()
    ]
)
uniprot_trembl.head().collect()

organism,organism_id,lineage,type,reviewed,unreviewed,has_structure,type_merge_microbes,superdomain,domain,kingdom
str,i64,str,str,bool,bool,bool,str,str,str,str
"""Pseudomonas rhodesiae""",76760,"""Bacteria; Pseudomonadota; Gamm…","""Bacteria""",False,True,False,"""Microbial""","""Cellular Life""","""Bacteria""","""Monera"""
"""Gossypium darwinii""",34276,"""Eukaryota; Viridiplantae; Stre…","""Plant""",False,True,False,"""Plant""","""Cellular Life""","""Eukaryota""","""Plant"""
"""Marmota monax""",9995,"""Eukaryota; Metazoa; Chordata; …","""Animal""",False,True,False,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Neomarinimicrobiota bacterium""",2026760,"""Bacteria; Candidatus Neomarini…","""Bacteria""",False,True,False,"""Microbial""","""Cellular Life""","""Bacteria""","""Monera"""
"""Clostridium perfringens""",1502,"""Bacteria; Bacillati; Bacillota…","""Bacteria""",False,True,False,"""Microbial""","""Cellular Life""","""Bacteria""","""Monera"""


In [7]:
%%time


# NOTE(review): this repeats the merge_microbes definition from the previous
# cell; consider keeping a single copy.
def merge_microbes(species_type):
    """Collapse bacteria, archaea and viruses into a single "Microbial" label.

    Any other value is returned in title case.
    """
    microbial_types = {"Bacteria", "Archaea", "Viruses"}
    return "Microbial" if species_type in microbial_types else species_type.title()


# NOTE(review): this repeats the assign_superdomain definition from the
# previous cell; consider keeping a single copy.
def assign_superdomain(species_type):
    """Label viruses as "Non-cellular Life" and everything else as "Cellular Life"."""
    is_virus = species_type == "Viruses"
    return "Non-cellular Life" if is_virus else "Cellular Life"


# NOTE(review): this repeats the assign_domain definition from the previous
# cell; consider keeping a single copy.
def assign_domain(species_type):
    """Return the type itself for bacteria, archaea and viruses, else "Eukaryota"."""
    non_eukaryote_types = {"Bacteria", "Archaea", "Viruses"}
    if species_type not in non_eukaryote_types:
        return "Eukaryota"
    return species_type


# NOTE(review): this repeats the assign_kingdom definition from the previous
# cell; consider keeping a single copy.
def assign_kingdom(species_type):
    """Group bacteria and archaea under "Monera"; pass any other type through."""
    prokaryote_types = {"Bacteria", "Archaea"}
    return "Monera" if species_type in prokaryote_types else species_type


# Collapse the entry table to one row per (organism, organism_id, lineage,
# type), summing the boolean flags into counts, then derive the coarse
# taxonomy labels from `type` on the much smaller grouped frame.
taxonomy_labelers = {
    "type_merge_microbes": merge_microbes,
    "superdomain": assign_superdomain,
    "domain": assign_domain,
    "kingdom": assign_kingdom,
}

uniprot_trembl_summarized_per_organism = (
    uniprot_trembl.group_by("organism", "organism_id", "lineage", "type")
    .agg(
        pl.col("reviewed").sum().alias("reviewed_count"),
        pl.col("unreviewed").sum().alias("unreviewed_count"),
        # Sum of the `has_structure` flag, stored under a more explicit name
        pl.col("has_structure").sum().alias("pdb_structures_count"),
    )
    .with_columns(
        [
            pl.col("type").map_elements(labeler, return_dtype=pl.String).alias(column)
            for column, labeler in taxonomy_labelers.items()
        ]
    )
    .collect()
)
print(uniprot_trembl_summarized_per_organism.shape)
# NOTE(review): reviewed_count is 0 on every row shown in this cell's saved
# output, so this ordering may not be informative - consider sorting on
# unreviewed_count instead.
uniprot_trembl_summarized_per_organism.sort("reviewed_count", descending=True)

(1318326, 11)
CPU times: user 35.8 s, sys: 2min 16s, total: 2min 52s
Wall time: 30.3 s


organism,organism_id,lineage,type,reviewed_count,unreviewed_count,pdb_structures_count,type_merge_microbes,superdomain,domain,kingdom
str,i64,str,str,u32,u32,u32,str,str,str,str
"""Melanophthalma sp. 1 NPL-2007""",485279,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""",0,3,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Crambidae sp. LA2010_208""",2031700,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""",0,1,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Lactarius sp. HL-2018m""",2364847,"""Eukaryota; Fungi; Dikarya; Bas…","""Fungi""",0,2,0,"""Fungi""","""Cellular Life""","""Eukaryota""","""Fungi"""
"""Influenza A virus""",1503424,"""Viruses; Riboviria; Orthornavi…","""Viruses""",0,4,0,"""Microbial""","""Non-cellular Life""","""Viruses""","""Viruses"""
"""Cecidomyiidae sp. BOLD:ADV5844""",2722153,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""",0,1,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
…,…,…,…,…,…,…,…,…,…,…
"""Mylabris suturalis""",2506990,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""",0,5,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Influenza A virus""",747955,"""Viruses; Riboviria; Orthornavi…","""Viruses""",0,10,0,"""Microbial""","""Non-cellular Life""","""Viruses""","""Viruses"""
"""Sapromyza bentejui""",189933,"""Eukaryota; Metazoa; Ecdysozoa;…","""Animal""",0,1,0,"""Animal""","""Cellular Life""","""Eukaryota""","""Animal"""
"""Influenza A virus""",1598898,"""Viruses; Riboviria; Orthornavi…","""Viruses""",0,1,0,"""Microbial""","""Non-cellular Life""","""Viruses""","""Viruses"""


### Curious: How many unreviewed entries have PDB structures?

In [8]:
# Total of the per-organism structure counts
uniprot_trembl_summarized_per_organism.get_column("pdb_structures_count").sum()

30103

### Sanity check: this should be ~250M sequences

In [9]:
# Total of the per-organism unreviewed counts
uniprot_trembl_summarized_per_organism.get_column("unreviewed_count").sum()

252633201

In [10]:
# Pin the total to the value observed for this input so that a change in the
# underlying data or in the aggregation is caught on re-run.
total_unreviewed = uniprot_trembl_summarized_per_organism.get_column(
    "unreviewed_count"
).sum()
assert total_unreviewed == 252633201

### Write to file!

In [11]:
# Persist the per-organism summary table as parquet.
summary_parquet_path = "../data/uniprot_trembl_taxonomy_summary.parquet"
uniprot_trembl_summarized_per_organism.write_parquet(summary_parquet_path)