In [1]:
import os

import pandas as pd

In [2]:
from magneton.io.internal import parse_from_dir, parse_from_pkl

The goal here is to construct the final SwissProt dataset and to understand why we lose various proteins along the way.

## Summary
- Start from 572,619 SwissProt proteins exported from UniProtKB on 2025/01/22
- Keep 564,095 / 572,619 proteins that could be in AlphaFoldDB
  - AlphaFoldDB used the 2021_04 release of SwissProt, so remove proteins that were added to SwissProt after that release
  - Furthermore, some proteins have had their sequence updated since the 2021_04 release, so remove those as well (structure doesn't match current sequence)
- Keep 548,484 / 564,095 proteins that are present in InterPro (not entirely clear why some are excluded from InterPro)
- Of those, 530,601 / 548,484 are also in AlphaFoldDB; the remaining 17,883 were likely excluded by AlphaFoldDB's other exclusion criteria
  - Overview of missingness reasons is given under "Which proteins are included?" [here](https://alphafold.ebi.ac.uk/faq)
- This is the final set of proteins we use.

## Context
When trying to add secondary structure annotations to the `Protein` objects parsed out of InterPro, I ran into cases where there was no AlphaFold structure for a given protein. This matters because the secondary structure annotations are parsed from the AlphaFoldDB mmCIF files.

## Methods
Downloaded the old SwissProt release (2021_04) that was used for AlphaFoldDB (located at `/home/rcalef/storage/om_storage/data/uniprot/swissprot_2021_04`) and checked how many of the missing UniProt IDs were also missing from it (i.e. added to SwissProt after that cutoff). Then manually inspected the UniProt pages for a few of the remainder, which seemed to match the other AlphaFoldDB exclusion criteria (e.g. containing non-standard `X` amino acids).

In [3]:
# Root directory for the data files used in this notebook; other cells build
# file paths by joining sub-paths onto it with `os.path.join`.
data_dir = "/weka/scratch/weka/kellislab/rcalef/data/"

In [4]:
# Summary table of reviewed UniProtKB entries (export dated 2025-01-22, per
# the file name).
swissprot_tsv_path = os.path.join(
    data_dir,
    "uniprot",
    "uniprotkb_AND_reviewed_true_2025_01_22.tsv.gz",
)
# Shorter snake_case names for the two columns that the filtering relies on.
renamed_columns = {
    "Entry": "uniprot_id",
    "Date of last sequence modification": "last_seq_update",
}
swissprot_summary = (
    pd.read_table(swissprot_tsv_path)
    .rename(columns=renamed_columns)
    # Parse the dates so they can be compared against a cutoff timestamp.
    .assign(last_seq_update=lambda df: pd.to_datetime(df["last_seq_update"]))
)
print(len(swissprot_summary))
swissprot_summary.head()

572619


Unnamed: 0,uniprot_id,Entry Name,Gene Names,Organism,Length,Annotation,last_seq_update
0,A0A009IHW8,ABTIR_ACIB9,J512_3302,Acinetobacter baumannii (strain 1295743),269,5.0,2014-06-11
1,A0A023I7E1,ENG1_RHIMI,ENG1 LAM81A,Rhizomucor miehei,796,5.0,2014-07-09
2,A0A024B7W1,POLG_ZIKVF,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423,5.0,2014-07-09
3,A0A024SC78,CUTI1_HYPJR,M419DRAFT_76732,Hypocrea jecorina (strain ATCC 56765 / BCRC 32...,248,5.0,2014-07-09
4,A0A024SH76,GUX2_HYPJR,cbh2 M419DRAFT_122470,Hypocrea jecorina (strain ATCC 56765 / BCRC 32...,471,5.0,2014-07-09


In [5]:
# Index file of the SwissProt 2021_04 FASTA. The path is built from `data_dir`
# instead of repeating the absolute data root, so the notebook has a single
# place to edit if the data is relocated (the resulting path is unchanged).
old_swissprot_fai_path = os.path.join(
    data_dir,
    "uniprot",
    "swissprot_2021_04",
    "uniprot_sprot.fasta.gz.fai",
)
old_swissprot = (
    pd.read_table(
        old_swissprot_fai_path,
        # NOTE(review): the `.fai` extension suggests a samtools-faidx-style
        # index, in which case x1..x4 would be sequence length, byte offset,
        # bases per line and bytes per line -- confirm before relying on them.
        names=["uniprot_id", "x1", "x2", "x3", "x4"],
    )
    .assign(
        # Record names are `|`-delimited; keep the second field as the
        # UniProt ID.
        uniprot_id=lambda df: df["uniprot_id"].str.split("|").str[1],
    )
)
old_swissprot.head()

Unnamed: 0,uniprot_id,x1,x2,x3,x4
0,Q6GZX4,256,122,60,61
1,Q6GZX3,320,499,60,61
2,Q197F8,458,943,60,61
3,Q197F7,156,1527,60,61
4,Q6GZX2,438,1800,60,61


In [6]:
# In the version of SwissProt used by AlphaFoldDB
in_old_release = swissprot_summary["uniprot_id"].isin(old_swissprot["uniprot_id"])
# Sequence hasn't changed since AlphaFoldDB was created.
# NOTE(review): the cutoff is 2021-05-01. UniProt release numbers are not
# month numbers, so confirm this date against the actual publication date of
# release 2021_04.
seq_unchanged = swissprot_summary["last_seq_update"] < pd.to_datetime("2021-05")

# Keep only the rows that satisfy both conditions.
want_swissprot = swissprot_summary.loc[in_old_release & seq_unchanged]
print(len(want_swissprot))
want_swissprot.head()

564095


Unnamed: 0,uniprot_id,Entry Name,Gene Names,Organism,Length,Annotation,last_seq_update
0,A0A009IHW8,ABTIR_ACIB9,J512_3302,Acinetobacter baumannii (strain 1295743),269,5.0,2014-06-11
2,A0A024B7W1,POLG_ZIKVF,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423,5.0,2014-07-09
4,A0A024SH76,GUX2_HYPJR,cbh2 M419DRAFT_122470,Hypocrea jecorina (strain ATCC 56765 / BCRC 32...,471,5.0,2014-07-09
5,A0A026W182,ORCO_OOCBI,Orco X777_12371,Ooceraea biroi (Clonal raider ant) (Cerapachys...,478,5.0,2014-07-09
6,A0A044RE18,BLI_ONCVO,Bli,Onchocerca volvulus,693,5.0,2014-07-09


In [7]:
# Write the retained UniProt IDs to disk: a single column, with neither a
# header row nor the DataFrame index.
subset_ids_path = os.path.join(data_dir, "uniprot", "swissprot_subset.tsv")
want_swissprot[["uniprot_id"]].to_csv(subset_ids_path, index=False, header=False)

In [8]:
# bz2-compressed pickle of the proteins parsed from InterPro (version 103.0,
# per the path). Built from `data_dir` instead of repeating the absolute data
# root; the resulting path is unchanged.
swissprot_pkl_path = os.path.join(
    data_dir,
    "interpro",
    "103.0",
    "swissprot",
    "swissprot_prots.pkl.bz2",
)

# Only the UniProt ID of each parsed protein is needed here, so the parsed
# objects themselves are not kept around.
interpro_swissprot_ids = pd.Series(
    [prot.uniprot_id for prot in parse_from_pkl(swissprot_pkl_path, compression="bz2")]
)
len(interpro_swissprot_ids)

548484

In [9]:
# Proteins that passed the SwissProt filters but whose ID does not appear in
# the InterPro-derived set.
not_in_interpro = ~want_swissprot["uniprot_id"].isin(interpro_swissprot_ids)
missing = want_swissprot.loc[not_in_interpro]
len(missing)

15611

In [10]:
# Preview of the proteins absent from the InterPro set. These were checked
# manually, and all of them are missing from InterPro.
missing.head(n=5)

Unnamed: 0,uniprot_id,Entry Name,Gene Names,Organism,Length,Annotation,last_seq_update
61,A0A0A1I6E7,NDB4S_ANDCR,,Androctonus crassicauda (Arabian fat-tailed sc...,74,5.0,2015-02-04
62,A0A0A1I6N9,NDB4T_ANDCR,,Androctonus crassicauda (Arabian fat-tailed sc...,74,5.0,2015-02-04
204,A0A0N7CSQ4,TX41A_SCOMU,,Scolopendra mutilans (Chinese red-headed centi...,68,5.0,2019-04-10
241,A0A139GI49,KAWA_MICA8,kgpE OA58_19045,Microcystis aeruginosa (strain NIES-88 / KW-MA...,87,5.0,2016-06-08
251,A0A144LUY5,CTCN1_ECHES,,Echinus esculentus (Sea urchin),119,5.0,2016-06-08


In [11]:
# Path template for the per-protein AlphaFoldDB mmCIF files; `%s` is filled in
# with the UniProt ID. Built from `data_dir` instead of repeating the absolute
# data root; the resulting template string is unchanged.
cif_tmpl = os.path.join(data_dir, "cif_alphafolddb", "AF-%s-F1-model_v4.cif.gz")

# IDs from the InterPro set for which no structure file exists on disk.
missing_from_afdb = [
    prot_id
    for prot_id in interpro_swissprot_ids
    if not os.path.exists(cif_tmpl % prot_id)
]
# Derive the counts from the collections rather than hand-maintained counters.
num = len(interpro_swissprot_ids)
num_found = num - len(missing_from_afdb)
print(f"found {num_found} / {num} ({len(missing_from_afdb)} missing)")

found 530601 / 548484 (17883 missing)


In [12]:
# First few UniProt IDs that have no AlphaFoldDB structure file on disk.
missing_from_afdb[0:5]

['A0A023GS29', 'A0A024B7W1', 'A0A024F910', 'A0A061FLA2', 'A0A075TJ05']

In [14]:
# Directory holding the sharded final dataset. Built from `data_dir` instead of
# repeating the absolute data root; the resulting path is unchanged.
with_ss_dir = os.path.join(
    data_dir,
    "interpro",
    "103.0",
    "swissprot",
    "sharded_swissprot",
    "with_ss",
)
# Collect the UniProt ID of every protein in the shards.
final_set = [
    prot.uniprot_id
    for prot in parse_from_dir(with_ss_dir, prefix="swissprot.with_ss")
]
len(final_set)

530601