# UniProtKB/Swiss-Prot

The data contains protein sequences and textual annotations covering function, domain structure, and post-translational modifications.

In [None]:
# FASTA format (Swiss-Prot only)
!wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz

# UniProt flat file
!wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz


In [None]:
!mkdir data
!mv uniprot_sprot.fasta.gz ./data/uniprot_sprot.fasta.gz
!mv uniprot_sprot.dat.gz uniprot_sprot.dat.gz

In [None]:
# Optional decompression step, left disabled.
# NOTE(review): the FASTA archive is moved out of the working directory by the previous cell,
# so these relative paths would need adjusting before the commands are re-enabled.
# !gunzip uniprot_sprot.fasta.gz
# !gunzip uniprot_sprot.dat.gz

In [None]:
!pip install biopython

In [None]:
import gzip

from Bio import SeqIO

# The gunzip step in this notebook is disabled, so read the compressed archive directly.
# This is the same location the proteinclip FASTA loader reads from further down.
fasta_file = "./data/uniprot/uniprot_sprot.fasta.gz"
N_PREVIEW = 5  # number of records to print

# SeqIO.parse expects a text-mode handle for FASTA, hence "rt".
with gzip.open(fasta_file, "rt") as handle:
    for i, record in enumerate(SeqIO.parse(handle, "fasta")):
        if i == N_PREVIEW:
            break
        print(f"ID: {record.id}")
        print(f"Description: {record.description}")
        print(f"Sequence: {record.seq[:60]}...")  # print first 60 amino acids
        print("="*50)

In [None]:
import gzip

from Bio import SwissProt

# The gunzip step in this notebook is disabled, so read the compressed flat file directly.
# This is the same location SwissProtDataReader is pointed at further down.
dat_file = "./data/uniprot/uniprot_sprot.dat.gz"
N_PREVIEW = 5  # number of records to print

# SwissProt.parse needs a text-mode handle, hence "rt".
with gzip.open(dat_file, "rt") as handle:
    for count, record in enumerate(SwissProt.parse(handle), start=1):
        print(f"ID: {record.entry_name}")
        print(f"Accession(s): {record.accessions}")
        print(f"Protein Name: {record.description}")
        print(f"Organism: {record.organism}")
        print(f"Gene Name(s): {record.gene_name}")
        print(f"Function:\n{record.comments}")  # Full comment section includes FUNCTION
        print("=" * 60)
        print(record)

        if count == N_PREVIEW:  # stop after the preview window
            break


In [None]:
!pip install diskcache

# Train Test Split

In [1]:
from proteinclip.swissprot import SwissProtDataReader

# Reader over the compressed Swiss-Prot flat file.
SWISSPROT_DAT_PATH = "./data/uniprot/uniprot_sprot.dat.gz"
spdr = SwissProtDataReader(SWISSPROT_DAT_PATH)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Reading ./data/uniprot/uniprot_sprot.dat.gz: 0it [00:00, ?it/s]

In [12]:
from proteinclip.fasta_utils import read_fasta

fasta_by_header = read_fasta("./data/uniprot/uniprot_sprot.fasta.gz")

# Re-key each sequence by the second "|"-separated field of the header's first
# space-delimited token.
fsdr = {}
for header, sequence in fasta_by_header.items():
    first_token = header.split(" ")[0]
    fsdr[first_token.split("|")[1]] = sequence

INFO:root:Read 572970 sequences from ./data/uniprot/uniprot_sprot.fasta.gz


In [13]:
from itertools import islice

# Show a handful of entries only; displaying the whole mapping would dump every
# sequence into the notebook output.
dict(islice(fsdr.items(), 3))

{'Q6GZX4': 'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL',
 'Q6GZX3': 'MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCARIKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSLAERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADCKCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNMLDDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRKVMFFVAGAVLVAILISTVRW',
 'Q197F8': 'MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWKMNREQALAERYPELQTSEPSEDYSGPVESLELLPLEIKLDIMQYLSWEQISWCKHPWLWTRWYKDNVVRVSAITFEDFQREYAFPEKIQEIHFTDTRAEEIKAILETTPNVTRLVIRRIDDMNYNTHGDLGLDDLEFLTHLMVEDACGFTDFWAPSLTHLTIKNLDMHPRWFGPVMDGIKSMQSTLKYLYIFETYGVNKPFVQWCTDNIETFYCTNSYRYENVPRPIYVWVLFQEDEWHGYRVEDNKFHRRYMYSTILHKRDTDWVENNPLKTPAQVEMYKFLLRISQLNRDGTGYESDSDPENEHFDDESFSSGEEDSSDEDDPTWAPDSDDSDW

In [5]:
# Size of the collection returned by the reader's unique_organisms().
organisms = spdr.unique_organisms()
len(organisms)

14593

In [3]:
%load_ext autoreload 
%autoreload 2

In [4]:
from sklearn.model_selection import train_test_split
from proteinclip.swissprot import embed_function_descriptions

# 80/20 split with a fixed seed: X holds the reader's keys, y the matching records.
entry_ids = list(spdr.keys())
entry_records = list(spdr.values())
X_train, X_test, y_train, y_test = train_test_split(
    entry_ids,
    entry_records,
    test_size=0.2,
    random_state=42,
)


In [8]:
# Peek at the first record without materialising every value into a list.
next(iter(spdr.values()))

{'organism': 'Frog virus 3 (isolate Goorha) (FV-3)',
 'organism_id': '654924',
 'name': '',
 'evidence': 4,
 'function': 'Transcription activation.'}

In [21]:
# Sequence length and annotation record for a single example accession.
accession = "Q9Y662"
length = len(fsdr[accession])
print(length)
spdr.get(accession)

390


{'organism': 'Homo sapiens (Human)',
 'organism_id': '9606',
 'name': 'HS3ST3B1',
 'evidence': 1,
 'function': "Sulfotransferase that utilizes 3'-phospho-5'-adenylyl sulfate (PAPS) to catalyze the transfer of a sulfo group to an N-unsubstituted glucosamine linked to a 2-O-sulfo iduronic acid unit on heparan sulfate. Catalyzes the O-sulfation of glucosamine in IdoUA2S-GlcNS and also in IdoUA2S-GlcNH2. The substrate-specific O-sulfation generates an enzyme-modified heparan sulfate which acts as a binding receptor to Herpes simplex virus-1 (HSV-1) and permits its entry. Unlike HS3ST1/3-OST-1, does not convert non- anticoagulant heparan sulfate to anticoagulant heparan sulfate."}

In [25]:
# NOTE(review): spdr_keys is assigned in the following cell, so on a fresh kernel
# this cell only works if that cell has been run first.
len(spdr_keys)

15084

In [26]:
import numpy as np

ORGANISM = "Homo sapiens (Human)"  # keep entries whose organism matches exactly
MAX_SEQ_LEN = 1024                 # drop sequences longer than this many residues
N_SAMPLES = 10_000                 # size of the sampled dataset
SEED = 42                          # same seed value as the train/test split above

# Keep accessions that match the organism, have a non-empty function text, and
# whose sequence fits the length limit. Each record is looked up once.
spdr_keys = []
for k in spdr.keys():
    entry = spdr.get(k)
    if (
        entry["organism"] == ORGANISM
        and len(entry["function"]) > 0
        and len(fsdr[k]) <= MAX_SEQ_LEN
    ):
        spdr_keys.append(k)

# Fraction of all reader entries that survive the filter.
print(f"{len(spdr_keys) / len(spdr)}")

# Seeded, local generator so the sampled subset is reproducible across kernel restarts
# (shuffles the list in place and keeps the elements as plain Python strings).
np.random.default_rng(SEED).shuffle(spdr_keys)
dataset_ids = spdr_keys[:N_SAMPLES]
len(dataset_ids)

0.026325985653699147


10000

In [None]:
import pandas as pd
from proteinclip.gpt import get_hf_embedding
from tqdm.auto import tqdm

# Build one row per sampled accession: the reader's record plus its id and the
# embedding of its function text. A fresh dict is created per row so the object
# returned by spdr.get() is never modified, and the builtin `id` is not shadowed.
records = []
for accession in tqdm(dataset_ids, desc="Creating dataset"):
    entry = spdr.get(accession)
    records.append(
        {
            **entry,
            "id": accession,
            "embedding": get_hf_embedding(entry["function"]),
        }
    )

dataset = pd.DataFrame(records)
dataset.to_parquet("protclip_dataset.parquet")

In [28]:
# First five rows of the assembled dataset.
dataset.head(n=5)

Unnamed: 0,organism,organism_id,name,evidence,function,id,embedding
0,Homo sapiens (Human),9606,CEP57L1,1,Centrosomal protein which may be required for ...,Q8IYX8,"[0.009208069, -0.030338794, 0.00073691126, 0.0..."
1,Homo sapiens (Human),9606,SRD5A1,1,Converts testosterone into 5-alpha-dihydrotest...,P18405,"[0.0073278393, 0.0021296088, 0.016226701, 0.01..."
2,Homo sapiens (Human),9606,GSTT1,1,Conjugation of reduced glutathione to a wide n...,P30711,"[-0.0251164, -0.011378617, 0.0071880626, 0.008..."
3,Homo sapiens (Human),9606,EDAR,1,"Receptor for EDA isoform A1, but not for EDA i...",Q9UNE0,"[-0.03433434, -0.0055075698, -0.01397321, 0.02..."
4,Homo sapiens (Human),9606,HAUS1,1,"Contributes to mitotic spindle assembly, maint...",Q96CS2,"[0.01762243, -0.017682446, 0.0021139714, -0.00..."
