In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import polars as pl

## Import Mora et al data

In [2]:
mora_et_al_2011 = pd.read_csv(
    StringIO(
        """Species,Earth_Catalogued,Earth_Predicted,Earth_SE,Ocean_Catalogued,Ocean_Predicted,Ocean_SE
Animalia,953434,7770000,958000,171082,2150000,145000
Chromista,13033,27500,30500,4859,7400,9640
Fungi,43271,611000,297000,1097,5320,11100
Plantae,215644,298000,8200,8600,16600,9130
Protozoa,8118,36400,6690,8118,36400,6690
Eukaryotes_Total,1233500,8740000,1300000,193756,2210000,182000
Archaea,502,455,160,1,1,0
Bacteria,10358,9680,3470,652,1320,436
Prokaryotes_Total,10860,10100,3630,653,1320,436
Grand_Total,1244360,8750000,1300000,194409,2210000,182000"""
    ),
    # index_col=0,
)
mora_et_al_2011

Unnamed: 0,Species,Earth_Catalogued,Earth_Predicted,Earth_SE,Ocean_Catalogued,Ocean_Predicted,Ocean_SE
0,Animalia,953434,7770000,958000,171082,2150000,145000
1,Chromista,13033,27500,30500,4859,7400,9640
2,Fungi,43271,611000,297000,1097,5320,11100
3,Plantae,215644,298000,8200,8600,16600,9130
4,Protozoa,8118,36400,6690,8118,36400,6690
5,Eukaryotes_Total,1233500,8740000,1300000,193756,2210000,182000
6,Archaea,502,455,160,1,1,0
7,Bacteria,10358,9680,3470,652,1320,436
8,Prokaryotes_Total,10860,10100,3630,653,1320,436
9,Grand_Total,1244360,8750000,1300000,194409,2210000,182000


In [3]:
def merge_mora_eukaryotes(x):
    if x in {"Animalia", "Fungi", "Plantae"}:
        if x == "Animalia":
            return "Animal"
        elif x == "Plantae":
            return "Plant"
        else:
            return x
    elif x in {"Chromista", "Protozoa"}:
        return "Other Eukaryota"


mora_et_al_2011["eukaryote_species"] = mora_et_al_2011["Species"].map(
    merge_mora_eukaryotes
)
mora_et_al_2011

Unnamed: 0,Species,Earth_Catalogued,Earth_Predicted,Earth_SE,Ocean_Catalogued,Ocean_Predicted,Ocean_SE,eukaryote_species
0,Animalia,953434,7770000,958000,171082,2150000,145000,Animal
1,Chromista,13033,27500,30500,4859,7400,9640,Other Eukaryota
2,Fungi,43271,611000,297000,1097,5320,11100,Fungi
3,Plantae,215644,298000,8200,8600,16600,9130,Plant
4,Protozoa,8118,36400,6690,8118,36400,6690,Other Eukaryota
5,Eukaryotes_Total,1233500,8740000,1300000,193756,2210000,182000,
6,Archaea,502,455,160,1,1,0,
7,Bacteria,10358,9680,3470,652,1320,436,
8,Prokaryotes_Total,10860,10100,3630,653,1320,436,
9,Grand_Total,1244360,8750000,1300000,194409,2210000,182000,


In [4]:
earth_predicted_eukaryotes = mora_et_al_2011.groupby("eukaryote_species")[
    "Earth_Predicted"
].sum()
earth_predicted_eukaryotes

eukaryote_species
Animal             7770000
Fungi               611000
Other Eukaryota      63900
Plant               298000
Name: Earth_Predicted, dtype: int64

In [5]:
ocean_predicted_eukaryotes = mora_et_al_2011.groupby("eukaryote_species")[
    "Ocean_Predicted"
].sum()
ocean_predicted_eukaryotes

eukaryote_species
Animal             2150000
Fungi                 5320
Other Eukaryota      43800
Plant                16600
Name: Ocean_Predicted, dtype: int64

In [6]:
earth_catalogued_eukaryotes = mora_et_al_2011.groupby("eukaryote_species")[
    "Earth_Predicted"
].sum()
earth_catalogued_eukaryotes

eukaryote_species
Animal             7770000
Fungi               611000
Other Eukaryota      63900
Plant               298000
Name: Earth_Predicted, dtype: int64

In [7]:
ocean_catalogued_eukaryotes = mora_et_al_2011.groupby("eukaryote_species")[
    "Ocean_Catalogued"
].sum()
ocean_catalogued_eukaryotes

eukaryote_species
Animal             171082
Fungi                1097
Other Eukaryota     12977
Plant                8600
Name: Ocean_Catalogued, dtype: int64

In [8]:
# Prediction of number of proteins per gene
n_proteins_multiplier = {
    "Microbial": 5000,
    # "Typical" bacterial genome is 5000 genes: https://pmc.ncbi.nlm.nih.gov/articles/PMC4361730/
    "Bacteria": 5000,
    # Using same number from bacteria for simplicity
    "Archea": 5000,
    # Animal predicted ~20,000 from personal intiution: Humans have 20k, Mouse have ~15k, and bivalves like oysters have 30k
    # Also, this 2013 paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC3737309/
    "Animal": 20000,
    # Fungal predicted ~ 11,000 genes/genome from https://pmc.ncbi.nlm.nih.gov/articles/PMC6078396/
    "Fungi": 11000,
    "Other Eukaryota": 10000,
    # Plant average of 32,000 genes/genome from https://academic.oup.com/bfg/article/13/4/308/2845968?login=false
    "Plant": 32000,
}

## Add earth plus ocean for predicted

In [9]:
planet_predicted_eukaryotes = earth_predicted_eukaryotes + ocean_predicted_eukaryotes
planet_predicted_eukaryotes.name = "n_species"
# Add 1 billion microbes
planet_predicted_eukaryotes["Microbial"] = 1e12
planet_predicted_eukaryotes

eukaryote_species
Animal             9.920000e+06
Fungi              6.163200e+05
Other Eukaryota    1.077000e+05
Plant              3.146000e+05
Microbial          1.000000e+12
Name: n_species, dtype: float64

In [10]:
df = pl.DataFrame({"key": [1, 2, 3], "value": [4, 5, 6]})
df

key,value
i64,i64
1,4
2,5
3,6


In [11]:
df.select(pl.all().repeat_by(2).flatten())

key,value
i64,i64
1,4
1,4
2,5
2,5
3,6
3,6


## Map domains to kingdoms

In [12]:
domain_to_kingdom = pd.DataFrame(
    {
        "domain": [
            "Archea",
            "Bacteria",
            "Microbial",
            "Eukaryota",
            "Eukaryota",
            "Eukaryota",
            "Eukaryota",
        ],
        "kingdom": [
            "Archea",
            "Bacteria",
            "Microbial",
            "Animal",
            "Fungi",
            "Plant",
            "Other Eukaryota",
        ],
    }
)
domain_to_kingdom

Unnamed: 0,domain,kingdom
0,Archea,Archea
1,Bacteria,Bacteria
2,Microbial,Microbial
3,Eukaryota,Animal
4,Eukaryota,Fungi
5,Eukaryota,Plant
6,Eukaryota,Other Eukaryota


In [13]:
domain_to_kingdom["n_proteins"] = domain_to_kingdom.kingdom.map(n_proteins_multiplier)
domain_to_kingdom["n_species"] = domain_to_kingdom.kingdom.map(
    planet_predicted_eukaryotes
)

domain_to_kingdom

Unnamed: 0,domain,kingdom,n_proteins,n_species
0,Archea,Archea,5000,
1,Bacteria,Bacteria,5000,
2,Microbial,Microbial,5000,1000000000000.0
3,Eukaryota,Animal,20000,9920000.0
4,Eukaryota,Fungi,11000,616320.0
5,Eukaryota,Plant,32000,314600.0
6,Eukaryota,Other Eukaryota,10000,107700.0


In [14]:
planet_predicted_eukaryotes

eukaryote_species
Animal             9.920000e+06
Fungi              6.163200e+05
Other Eukaryota    1.077000e+05
Plant              3.146000e+05
Microbial          1.000000e+12
Name: n_species, dtype: float64

In [15]:
for (domain, kingdom), df in domain_to_kingdom.groupby(["domain", "kingdom"]):
    if domain == "Eukaryota":
        break

In [16]:
df

Unnamed: 0,domain,kingdom,n_proteins,n_species
3,Eukaryota,Animal,20000,9920000.0


In [17]:
this_domain_kingdom = pl.DataFrame(df)
this_domain_kingdom

domain,kingdom,n_proteins,n_species
str,str,i64,f64
"""Eukaryota""","""Animal""",20000,9920000.0


In [18]:
kingdom = df.kingdom.values[0]
kingdom

'Animal'

In [19]:
this_domain_kingdom.select(
    pl.all().repeat_by(9.92e6).flatten()  # .cast(pl.UInt64)
).with_row_index().with_columns(pl.col("n_species").cast(pl.UInt64)).with_columns(
    pl.col("index")
    .cast(pl.String)
    .str.zfill(12)
    .str.replace("^", f"{kingdom}_")
    .alias("organism_number")
    # pl.concat_str([kingdom, pl.col("index")]).alias("organism_number")
)

index,domain,kingdom,n_proteins,n_species,organism_number
u32,str,str,i64,u64,str
0,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_000000000000"""
1,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_000000000001"""
2,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_000000000002"""
3,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_000000000003"""
4,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_000000000004"""
…,…,…,…,…,…
9919995,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_000009919995"""
9919996,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_000009919996"""
9919997,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_000009919997"""
9919998,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_000009919998"""


In [20]:
domain_to_kingdom_merge_archea_prokaryota = domain_to_kingdom.query(
    'domain != "Archea" and domain != "Bacteria"'
)
domain_to_kingdom_merge_archea_prokaryota

Unnamed: 0,domain,kingdom,n_proteins,n_species
2,Microbial,Microbial,5000,1000000000000.0
3,Eukaryota,Animal,20000,9920000.0
4,Eukaryota,Fungi,11000,616320.0
5,Eukaryota,Plant,32000,314600.0
6,Eukaryota,Other Eukaryota,10000,107700.0


In [21]:
dfs = []

for (domain, kingdom), df in domain_to_kingdom_merge_archea_prokaryota.groupby(
    ["domain", "kingdom"]
):
    this_domain_kingdom = pl.DataFrame(df)
    kingdom = df.kingdom.values[0]

    n_species = df.n_species.values[0]

    print(f"Domain: {domain}, Kingdom: {kingdom}, Number of species: {n_species:,}")
    df_predicted_n_proteins = (
        this_domain_kingdom.select(
            pl.all().repeat_by(int(n_species)).flatten()  # .cast(pl.UInt64)
        )
        .with_row_index()
        .with_columns(pl.col("n_species").cast(pl.UInt64))
        .with_columns(
            pl.col("index")
            .cast(pl.String)
            .str.zfill(13)
            .str.replace("^", f"{kingdom}_")
            .alias("organism_number")
            # pl.concat_str([kingdom, pl.col("index")]).alias("organism_number")
        )
    )
    dfs.append(df_predicted_n_proteins)
predicted_n_proteins = pl.concat(dfs)

Domain: Eukaryota, Kingdom: Animal, Number of species: 9,920,000.0
Domain: Eukaryota, Kingdom: Fungi, Number of species: 616,320.0
Domain: Eukaryota, Kingdom: Other Eukaryota, Number of species: 107,700.0
Domain: Eukaryota, Kingdom: Plant, Number of species: 314,600.0
Domain: Microbial, Kingdom: Microbial, Number of species: 1,000,000,000,000.0


In [22]:
this_domain_kingdom

domain,kingdom,n_proteins,n_species
str,str,i64,f64
"""Microbial""","""Microbial""",5000,1000000000000.0


In [23]:
1000000000000 / 4294967295

232.83064370807975

In [24]:
n_species

np.float64(1000000000000.0)

In [25]:
n_species

np.float64(1000000000000.0)

In [26]:
this_domain_kingdom.select(pl.all().repeat_by(100).flatten())  # .cast(pl.UInt64)

domain,kingdom,n_proteins,n_species
str,str,i64,f64
"""Microbial""","""Microbial""",5000,1.0000e12
"""Microbial""","""Microbial""",5000,1.0000e12
"""Microbial""","""Microbial""",5000,1.0000e12
"""Microbial""","""Microbial""",5000,1.0000e12
"""Microbial""","""Microbial""",5000,1.0000e12
…,…,…,…
"""Microbial""","""Microbial""",5000,1.0000e12
"""Microbial""","""Microbial""",5000,1.0000e12
"""Microbial""","""Microbial""",5000,1.0000e12
"""Microbial""","""Microbial""",5000,1.0000e12


In [27]:
predicted_n_proteins

index,domain,kingdom,n_proteins,n_species,organism_number
u32,str,str,i64,u64,str
0,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_0000000000000"""
1,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_0000000000001"""
2,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_0000000000002"""
3,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_0000000000003"""
4,"""Eukaryota""","""Animal""",20000,9920000,"""Animal_0000000000004"""
…,…,…,…,…,…
314596,"""Eukaryota""","""Plant""",32000,314600,"""Plant_0000000314596"""
314597,"""Eukaryota""","""Plant""",32000,314600,"""Plant_0000000314597"""
314598,"""Eukaryota""","""Plant""",32000,314600,"""Plant_0000000314598"""
314599,"""Eukaryota""","""Plant""",32000,314600,"""Plant_0000000314599"""


In [28]:
predicted_n_proteins.write_parquet("../data/predicted_n_proteins.parquet")

# -> This is what is used in the blog post: Use species numbers from Larsen 2017

## Projected number of species from Larsen 2017

Using data from Table 1, Scenario 1 (Parasites with intermediate parasite richness) from:

    Larsen, B. B., Miller, E. C., Rhodes, M. K. & Wiens, J. J. Inordinate Fondness Multiplied and Redistributed: the Number of Species on Earth and the New Pie of Life. Q. Rev. Biol. 92, 229–265 (2017).


- Animals includes cryptic species for arthropods and 1 apicomplexans (e.g. Malaria parasite) per arthropod


In [29]:
250e6 / 500e3

500.0

In [30]:
(500 * 48) / 60 / 60

6.666666666666667

##### 

In [31]:
larsen2017 = pd.Series(
    {
        "Animals": 163.2e6,
        "Plants": 0.340e6,
        "Fungi": 165.6e6,
        "Protists": 163.2e6,
        "Bacteria": 1.746e9,
    },
    name="n_species",
)

larsen2017.sum()

np.float64(2238340000.0)

## Let's say that 1% of "Bacteria" are Archea because that's the current distribution in NCBI genomes

https://www.ncbi.nlm.nih.gov/datasets/genome/

- 2.54M Bacterial genomes
- 29.05k Archeal genomes

In [32]:
29.05e3 / (2.54e6 + 29.05e3)

0.011307681827913042

In [33]:
larsen2017_archea_bacteria = larsen2017.copy()
larsen2017_archea_bacteria["Archaea"] = 0.01 * larsen2017["Bacteria"]
larsen2017_archea_bacteria["Bacteria"] = 0.99 * larsen2017["Bacteria"]
larsen2017_archea_bacteria

Animals     1.632000e+08
Plants      3.400000e+05
Fungi       1.656000e+08
Protists    1.632000e+08
Bacteria    1.728540e+09
Archaea     1.746000e+07
Name: n_species, dtype: float64

In [34]:
larsen2017_domain_to_kingdom = pd.DataFrame(
    {
        "domain": [
            "Eukaryota",
            "Eukaryota",
            "Eukaryota",
            "Eukaryota",
            "Bacteria",
            "Archaea",
        ],
        "kingdom": ["Animals", "Plants", "Fungi", "Protists", "Bacteria", "Archaea"],
    }
)
larsen2017_domain_to_kingdom = larsen2017_domain_to_kingdom.join(
    larsen2017_archea_bacteria, on="kingdom"
)
larsen2017_domain_to_kingdom

Unnamed: 0,domain,kingdom,n_species
0,Eukaryota,Animals,163200000.0
1,Eukaryota,Plants,340000.0
2,Eukaryota,Fungi,165600000.0
3,Eukaryota,Protists,163200000.0
4,Bacteria,Bacteria,1728540000.0
5,Archaea,Archaea,17460000.0


## Add number of genes per group

In [35]:
# Prediction of number of proteins per gene
n_proteins_multiplier_larsen2017 = pd.Series(
    {
        # "Typical" bacterial genome is 5000 genes: https://pmc.ncbi.nlm.nih.gov/articles/PMC4361730/
        "Bacteria": 5000,
        # "Typical" bacterial genome is 5000 genes: https://pmc.ncbi.nlm.nih.gov/articles/PMC4361730/
        "Archaea": 5000,
        # Animal predicted ~20,000 from personal intiution: Humans have 20k, Mouse have ~15k, and bivalves like oysters have 30k
        # Also, this 2013 paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC3737309/
        # Decreased from 20k -> 15k since Larsen 2017 predicts majoriy of animal species to be arthropods, which have 10-20k genes generally
        "Animals": 15000,
        # Fungal predicted ~ 11,000 genes/genome from https://pmc.ncbi.nlm.nih.gov/articles/PMC6078396/
        "Fungi": 11000,
        # Plant average of 32,000 genes/genome from https://academic.oup.com/bfg/article/13/4/308/2845968?login=false
        "Plants": 32000,
        # https://ngdc.cncb.ac.cn/p10k/browse/genome
        # Protist 10,000 Genomes Project. The Innovation, 2020, 1(3). (PMID: 34557722)
        # The P10K Database: A Data Portal for the Protist 10,000 Genomes Project (In Preparation)
        # Looked at the high-quality anotations from here and made a guess
        "Protists": 7500,
    },
    name="n_genes",
)
n_proteins_multiplier_larsen2017

Bacteria     5000
Archaea      5000
Animals     15000
Fungi       11000
Plants      32000
Protists     7500
Name: n_genes, dtype: int64

In [36]:
larsen2017_domain_to_kingdom_with_genes = larsen2017_domain_to_kingdom.join(
    n_proteins_multiplier_larsen2017, on="kingdom"
)
larsen2017_domain_to_kingdom_with_genes

Unnamed: 0,domain,kingdom,n_species,n_genes
0,Eukaryota,Animals,163200000.0,15000
1,Eukaryota,Plants,340000.0,32000
2,Eukaryota,Fungi,165600000.0,11000
3,Eukaryota,Protists,163200000.0,7500
4,Bacteria,Bacteria,1728540000.0,5000
5,Archaea,Archaea,17460000.0,5000


## Lets do per 100k of species to reduce the compute complexity

Divide by 100k, then take the ceiling (round up), and convert to integer so we can make the circle visualization

In [37]:
larsen2017_domain_to_kingdom_with_genes["n_ten_thousand_species"] = np.ceil(
    larsen2017_domain_to_kingdom_with_genes.n_species / 1e4
).astype(int)
larsen2017_domain_to_kingdom_with_genes["n_genes_per_ten_thousand"] = (
    larsen2017_domain_to_kingdom_with_genes.n_genes * 1e4
).astype(int)


larsen2017_domain_to_kingdom_with_genes

Unnamed: 0,domain,kingdom,n_species,n_genes,n_ten_thousand_species,n_genes_per_ten_thousand
0,Eukaryota,Animals,163200000.0,15000,16320,150000000
1,Eukaryota,Plants,340000.0,32000,34,320000000
2,Eukaryota,Fungi,165600000.0,11000,16560,110000000
3,Eukaryota,Protists,163200000.0,7500,16320,75000000
4,Bacteria,Bacteria,1728540000.0,5000,172854,50000000
5,Archaea,Archaea,17460000.0,5000,1746,50000000


## Now lets create the table where each row is one of 1 million species

In [38]:
dfs = []


def create_species_per_row_table(df, n_species_col="n_ten_thousand_species"):
    this_domain_kingdom = pl.DataFrame(df)
    kingdom = df.kingdom.values[0]

    n_species = df[n_species_col].values[0]

    print(f"Domain: {domain}, Kingdom: {kingdom}, {n_species_col}: {n_species:,}")
    df_predicted_n_proteins = (
        this_domain_kingdom.select(
            pl.all().repeat_by(int(n_species)).flatten()  # .cast(pl.UInt64)
        )
        .with_row_index()
        .with_columns(pl.col(n_species_col).cast(pl.UInt64))
        .with_columns(
            pl.col("index")
            .cast(pl.String)
            .str.zfill(10)
            .str.replace("^", f"10k_{kingdom}_")
            .alias("organism_number")
            # pl.concat_str([kingdom, pl.col("index")]).alias("organism_number")
        )
    )
    return df_predicted_n_proteins


for (domain, kingdom), df in larsen2017_domain_to_kingdom_with_genes.groupby(
    ["domain", "kingdom"]
):
    this_domain_kingdom = pl.DataFrame(df)
    kingdom = df.kingdom.values[0]

    n_species = df.n_ten_thousand_species.values[0]

    print(f"Domain: {domain}, Kingdom: {kingdom}, Thousands of species: {n_species:,}")
    df_predicted_n_proteins = (
        this_domain_kingdom.select(
            pl.all().repeat_by(int(n_species)).flatten()  # .cast(pl.UInt64)
        )
        .with_row_index()
        .with_columns(pl.col("n_ten_thousand_species").cast(pl.UInt64))
        .with_columns(
            pl.col("index")
            .cast(pl.String)
            .str.zfill(10)
            .str.replace("^", f"10k_{kingdom}_")
            .alias("organism_number")
            # pl.concat_str([kingdom, pl.col("index")]).alias("organism_number")
        )
    )
    dfs.append(df_predicted_n_proteins)
larsen2017_predicted_n_proteins = pl.concat(dfs)
larsen2017_predicted_n_proteins

Domain: Archaea, Kingdom: Archaea, Thousands of species: 1,746
Domain: Bacteria, Kingdom: Bacteria, Thousands of species: 172,854
Domain: Eukaryota, Kingdom: Animals, Thousands of species: 16,320
Domain: Eukaryota, Kingdom: Fungi, Thousands of species: 16,560
Domain: Eukaryota, Kingdom: Plants, Thousands of species: 34
Domain: Eukaryota, Kingdom: Protists, Thousands of species: 16,320


index,domain,kingdom,n_species,n_genes,n_ten_thousand_species,n_genes_per_ten_thousand,organism_number
u32,str,str,f64,i64,u64,i64,str
0,"""Archaea""","""Archaea""",1.746e7,5000,1746,50000000,"""10k_Archaea_0000000000"""
1,"""Archaea""","""Archaea""",1.746e7,5000,1746,50000000,"""10k_Archaea_0000000001"""
2,"""Archaea""","""Archaea""",1.746e7,5000,1746,50000000,"""10k_Archaea_0000000002"""
3,"""Archaea""","""Archaea""",1.746e7,5000,1746,50000000,"""10k_Archaea_0000000003"""
4,"""Archaea""","""Archaea""",1.746e7,5000,1746,50000000,"""10k_Archaea_0000000004"""
…,…,…,…,…,…,…,…
16315,"""Eukaryota""","""Protists""",1.632e8,7500,16320,75000000,"""10k_Protists_0000016315"""
16316,"""Eukaryota""","""Protists""",1.632e8,7500,16320,75000000,"""10k_Protists_0000016316"""
16317,"""Eukaryota""","""Protists""",1.632e8,7500,16320,75000000,"""10k_Protists_0000016317"""
16318,"""Eukaryota""","""Protists""",1.632e8,7500,16320,75000000,"""10k_Protists_0000016318"""


In [39]:
larsen2017_predicted_n_proteins.columns

['index',
 'domain',
 'kingdom',
 'n_species',
 'n_genes',
 'n_ten_thousand_species',
 'n_genes_per_ten_thousand',
 'organism_number']

In [40]:
f"{larsen2017_predicted_n_proteins['n_genes_per_ten_thousand'].sum():,}"

'14,234,480,000,000'

In [41]:
larsen2017_predicted_n_proteins
larsen2017_predicted_n_proteins.write_parquet(
    "../data/predicted_n_proteins_larsen2017.parquet"
)