# UniProtKB/Swiss-Prot

Data contains proteins sequences and textual annotations regarding function, domain structure, and post translational modifications.

In [20]:
# FASTA format (Swiss-Prot only)
!wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz

# UniProt flat file
!wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz


--2025-04-12 17:45:00--  ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
           => ‘uniprot_sprot.fasta.gz’
Resolving ftp.uniprot.org (ftp.uniprot.org)... 128.175.240.195
Connecting to ftp.uniprot.org (ftp.uniprot.org)|128.175.240.195|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/databases/uniprot/current_release/knowledgebase/complete ... done.
==> SIZE uniprot_sprot.fasta.gz ... 92924866
==> PASV ... done.    ==> RETR uniprot_sprot.fasta.gz ... done.
Length: 92924866 (89M) (unauthoritative)


2025-04-12 17:45:05 (19.1 MB/s) - ‘uniprot_sprot.fasta.gz’ saved [92924866]

--2025-04-12 17:45:05--  ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
           => ‘uniprot_sprot.dat.gz’
Resolving ftp.uniprot.org (ftp.uniprot.org)... 128.175.240.195
Connecting to ftp.uniprot.org (ftp.uniprot

In [None]:
!mkdir data
!mv uniprot_sprot.fasta.gz ./data/uniprot_sprot.fasta.gz
!mv uniprot_sprot.dat.gz uniprot_sprot.dat.gz

In [3]:
# !gunzip uniprot_sprot.fasta.gz
# !gunzip uniprot_sprot.dat.gz

gzip: uniprot_sprot.fasta.gz: No such file or directory


In [4]:
!pip install biopython



In [8]:
from Bio import SeqIO

fasta_file = "uniprot_sprot.fasta"
for i, record in enumerate(SeqIO.parse(fasta_file, "fasta")):
    if i == 5:
        break
    print(f"ID: {record.id}")
    print(f"Description: {record.description}")
    print(f"Sequence: {record.seq[:60]}...")  # print first 60 amino acids
    print("="*50)

ID: sp|Q6GZX4|001R_FRG3G
Description: sp|Q6GZX4|001R_FRG3G Putative transcription factor 001R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-001R PE=4 SV=1
Sequence: MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS...
ID: sp|Q6GZX3|002L_FRG3G
Description: sp|Q6GZX3|002L_FRG3G Uncharacterized protein 002L OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-002L PE=4 SV=1
Sequence: MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCAR...
ID: sp|Q197F8|002R_IIV3
Description: sp|Q197F8|002R_IIV3 Uncharacterized protein 002R OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-002R PE=4 SV=1
Sequence: MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWKMNREQALAERYPEL...
ID: sp|Q197F7|003L_IIV3
Description: sp|Q197F7|003L_IIV3 Uncharacterized protein 003L OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-003L PE=4 SV=1
Sequence: MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGAWFDTSLNARSLTTT...
ID: sp|Q6GZX2|003R_FRG3G
Description: sp|Q6GZX2|003R_FRG3G Uncharacterized protein 3

In [18]:
from Bio import SwissProt

dat_file = "uniprot_sprot.dat"  # or after gunzipping the .gz file

# Open and parse
with open(dat_file) as handle:
    count = 0
    for record in SwissProt.parse(handle):
        print(f"ID: {record.entry_name}")
        print(f"Accession(s): {record.accessions}")
        print(f"Protein Name: {record.description}")
        print(f"Organism: {record.organism}")
        print(f"Gene Name(s): {record.gene_name}")
        print(f"Function:\n{record.comments}")  # Full comment section includes FUNCTION
        print("=" * 60)
        print(record)

        count += 1
        if count == 5:  # Only show first 5
            break


ID: 001R_FRG3G
Accession(s): ['Q6GZX4']
Protein Name: RecName: Full=Putative transcription factor 001R;
Organism: Frog virus 3 (isolate Goorha) (FV-3).
Gene Name(s): [{'ORFNames': ['FV3-001R']}]
Function:
['FUNCTION: Transcription activation. {ECO:0000305}.']
<Bio.SwissProt.Record object at 0x15551f3f3770>
ID: 002L_FRG3G
Accession(s): ['Q6GZX3']
Protein Name: RecName: Full=Uncharacterized protein 002L;
Organism: Frog virus 3 (isolate Goorha) (FV-3).
Gene Name(s): [{'ORFNames': ['FV3-002L']}]
Function:
['SUBCELLULAR LOCATION: Host membrane {ECO:0000305}; Single-pass membrane protein {ECO:0000305}.']
<Bio.SwissProt.Record object at 0x15547f9979b0>
ID: 002R_IIV3
Accession(s): ['Q197F8']
Protein Name: RecName: Full=Uncharacterized protein 002R;
Organism: Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).
Gene Name(s): [{'ORFNames': ['IIV3-002R']}]
Function:
[]
<Bio.SwissProt.Record object at 0x15551f3f3770>
ID: 003L_IIV3
Accession(s): ['Q197F7']
Protein Name: RecName: Ful

In [22]:
!pip install diskcache

Collecting diskcache
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
Installing collected packages: diskcache
Successfully installed diskcache-5.6.3


In [5]:
from proteinclip.swissprot import SwissProtDataReader

spdr = SwissProtDataReader("./data/uniprot/uniprot_sprot.dat.gz")

Reading ./data/uniprot/uniprot_sprot.dat.gz: 0it [00:00, ?it/s]

In [7]:
spdr.unique_organisms()

['Paracoccus versutus (Thiobacillus versutus)',
 'Rhodopseudomonas sp. (strain No.7)',
 'Megangulus venulosus (Japanese bivalve) (Tellina venulosa)',
 'Cerebratulus lacteus (Milky ribbon worm) (Micrura lactea)',
 'Pseudomonas aeruginosa (strain BWHPSA011 / Pa011)',
 'Scelidodon sp. (strain SLP-2019) (South American ground sloth)',
 'Erythrina variegata (Indian coral tree) (Erythrina indica)',
 'Rosa foetida (Austrian briar) (Yellow Austrian rose)',
 'Sorex shinto sadonis (Sado shrew) (Sorex sadonis)',
 'Avian infectious bronchitis virus (strain Gray) (IBV)',
 'Hookeria lucens (Moss) (Hypnum lucens)',
 'Tritrichomonas foetus (Trichomonas foetus) (Tritrichomonas suis)',
 'Trachypithecus geei (Golden langur) (Golden leaf monkey)',
 'Aeromonas salmonicida (strain A449)',
 'Streptomyces alboniger',
 "Marshallia caespitosa (Barbara's buttons)",
 'HTCC2601) (Pelagibaca bermudensis)',
 'Maudiozyma exigua (Yeast) (Kazachstania exigua)',
 'Kingdom/A12Valle119/1932 serotype A) (FMDV)',
 'Pichia k

In [3]:
!pip install Levenshtein

Collecting Levenshtein
  Downloading levenshtein-0.27.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein)
  Downloading rapidfuzz-3.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading levenshtein-0.27.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
Downloading rapidfuzz-3.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.27.1 rapidfuzz-3.13.0


In [14]:
from proteinclip import model_utils
import numpy as np

m = model_utils.load_proteinclip("esm", 12)  # For ESM2, 33-layer model

# Create a synthetic example
# Size corresponds to embedding dimension of "parent" protein language model
model_input = np.random.randn(480)
# ProteinCLIP expects input to be unit-normalized
model_input /= np.linalg.norm(model_input)
x = m.predict(model_input)
print(x.shape)  # (128,)
print(np.linalg.norm(x))  # 1.0; ProteinCLIP produces unit-norm vectors

(128,)
1.0


[1;31m2025-04-13 00:15:15.032197525 [E:onnxruntime:Default, env.cc:234 ThreadMain] pthread_setaffinity_np failed for thread: 1911567, index: 0, mask: {1, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.[m
[1;31m2025-04-13 00:15:15.032215015 [E:onnxruntime:Default, env.cc:234 ThreadMain] pthread_setaffinity_np failed for thread: 1911568, index: 1, mask: {2, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.[m
[1;31m2025-04-13 00:15:15.033193812 [E:onnxruntime:Default, env.cc:234 ThreadMain] pthread_setaffinity_np failed for thread: 1911610, index: 42, mask: {43, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.[m
[1;31m2025-04-13 00:15:15.033654224 [E:onnxruntime:Default, env.cc:234 ThreadMain] pthread_setaffinity_np failed for thread: 1911581, index: 14, mask: {15, }, error code: 22 e

[1;31m2025-04-13 00:15:15.236427232 [E:onnxruntime:Default, env.cc:234 ThreadMain] pthread_setaffinity_np failed for thread: 1911607, index: 40, mask: {41, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.[m
[1;31m2025-04-13 00:15:15.240423844 [E:onnxruntime:Default, env.cc:234 ThreadMain] pthread_setaffinity_np failed for thread: 1911608, index: 41, mask: {42, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.[m
[1;31m2025-04-13 00:15:15.244458018 [E:onnxruntime:Default, env.cc:234 ThreadMain] pthread_setaffinity_np failed for thread: 1911569, index: 2, mask: {3, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.[m
[1;31m2025-04-13 00:15:15.244429524 [E:onnxruntime:Default, env.cc:234 ThreadMain] pthread_setaffinity_np failed for thread: 1911628, index: 60, mask: {61, }, error code: 22

In [11]:
!pip install onnxruntime

Collecting onnxruntime
  Downloading onnxruntime-1.21.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.7 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting flatbuffers (from onnxruntime)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.21.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m152.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
Downloading flatbuffers-25.2.10-py2.py3-none-any.whl (30 kB)
Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
Installing collected packages: flatbuffers, humanfriendly, coloredlogs, onnxruntime
Successfu