In [45]:
from typing import Optional, Dict, Any, List
import requests
import time
from pydantic import BaseModel

class UniProtResult(BaseModel):
    """Subset of a UniProtKB entry returned by the lookup helpers in this file.

    ``primaryAccession`` and ``uniProtkbId`` reuse the key names that the
    lookup helpers read from the UniProt JSON payload, which is why they are
    not snake_case.
    """

    primaryAccession: str               # e.g. "Q2M2I8"
    uniProtkbId: str                    # e.g. "AAK1_HUMAN"
    sequence: str                       # e.g. "MALWMRLLPLLALLALWGPDPAAA..."
    # Explicit ``None`` defaults keep these two fields genuinely optional on
    # every pydantic version (pydantic v2 treats a bare ``Optional[...]``
    # annotation without a default as a required field).
    geneName: Optional[str] = None      # e.g. "AAK1"
    RefSeq_id: Optional[str] = None     # e.g. "NM_014911"
    # EMBL_id: Optional[str]          # e.g. "AB028971"
    # potentially also CCDS_id

def search_uniprot_by_accession(uniprot_id: str, timeout: float = 10.0) -> Optional[UniProtResult]:
    """
    Search UniProt by accession code.

    Fetches ``https://rest.uniprot.org/uniprotkb/<accession>.json`` and keeps
    the accession, entry name, sequence, first gene name and the first RefSeq
    nucleotide id (version suffix stripped) found in the cross-references.

    Args:
        uniprot_id: UniProt accession code to search for
        timeout: Seconds to wait for the HTTP request before giving up

    Returns:
        UniProtResult object with protein information, or None if the
        accession is empty, the server does not answer with HTTP 200, or any
        error occurs (errors are printed, not raised)
    """
    if not uniprot_id:
        return None

    # TODO: re-enable the result cache (uniprot_cache / save_cache) once it exists.

    # UniProt API URL
    # e.g. https://rest.uniprot.org/uniprotkb/Q2M2I8.json
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"

    try:
        # A timeout keeps a stalled connection from blocking the caller forever.
        response = requests.get(url, timeout=timeout)

        # Respect rate limits
        time.sleep(0.2)

        if response.status_code != 200:
            return None

        data = response.json()

        # Take the first RefSeq cross-reference that carries a nucleotide id.
        refseq_id = None
        for cross_reference in data.get('uniProtKBCrossReferences', []):
            if cross_reference.get('database') != 'RefSeq':
                continue
            for prop in cross_reference.get('properties', []):
                value = prop.get('value')
                if prop.get('key') == 'NucleotideSequenceId' and value:
                    # Drop the version suffix: "NM_014911.3" -> "NM_014911"
                    refseq_id = value.split('.')[0]
                    break
            if refseq_id is not None:
                break

        # ``genes`` may be missing *or* an empty list; both mean "no gene name".
        genes = data.get('genes') or [{}]

        return UniProtResult(
            primaryAccession=data.get('primaryAccession', ''),
            uniProtkbId=data.get('uniProtkbId', ''),
            sequence=data.get('sequence', {}).get('value', ''),
            geneName=genes[0].get('geneName', {}).get('value'),
            RefSeq_id=refseq_id,
        )

    except Exception as e:
        # Best-effort lookup: report the problem and signal "not found".
        print(f"Error searching UniProt for {uniprot_id}: {e}")
        return None

# Example lookup by accession; the notebook output recorded below is the returned UniProtResult.
search_uniprot_by_accession("Q9HVM7")

UniProtResult(primaryAccession='Q9HVM7', uniProtkbId='ISPH_PSEAE', sequence='MQIKLANPRGFCAGVDRAIEIVNRALDVFGPPIYVRHEVVHNKFVVDNLRQRGAIFVEELDQVPNNVIVIFSAHGVSQAVRKEAEGRGLKVFDATCPLVTKVHMEVVRYSRDGHECVLIGHEGHPEVEGTMGQYDASNGGAIYLVEDEADVAALEVRKPEALHYVTQTTLSMDDTSKVIDALRAKFPQIQGPRKNDICYATQNRQDAVKELADQCDMVLVVGSPNSSNSNRLRELAERMGTPAYLIDGAEDMQRGWFDGVRRIGITAGASAPEVLVRGVIAQLREWGASEEQELEGREENITFSMPKELRVKAL', geneName='ispH', RefSeq_id='NC_002516')

In [34]:
def search_uniprot_by_gene(gene_id: str, timeout: float = 10.0) -> Optional[UniProtResult]:
    """
    Search UniProt by gene symbol.

    Runs the query ``gene:<gene_id> AND reviewed:true`` against the UniProtKB
    search endpoint and builds the result from the first hit only.

    Args:
        gene_id: UniProt gene symbol to search for
        timeout: Seconds to wait for the HTTP request before giving up

    Returns:
        UniProtResult object with protein information, or None if the symbol
        is empty, the server does not answer with HTTP 200, the search has no
        hits, or any error occurs (errors are printed, not raised)
    """
    if not gene_id:
        return None

    # TODO: re-enable the result cache (uniprot_cache / save_cache) once it exists.

    # UniProt API URL for gene search
    # e.g. https://rest.uniprot.org/uniprotkb/search?query=gene:AAK1+AND+reviewed:true
    url = "https://rest.uniprot.org/uniprotkb/search"
    # Let requests encode the query so unusual characters in gene_id cannot
    # break or alter the URL.
    params = {"query": f"gene:{gene_id} AND reviewed:true"}

    try:
        # A timeout keeps a stalled connection from blocking the caller forever.
        response = requests.get(url, params=params, timeout=timeout)

        if response.status_code != 200:
            return None

        hits = response.json().get('results', [])
        if not hits:
            return None

        data = hits[0]

        # Take the first RefSeq cross-reference that carries a nucleotide id.
        refseq_id = None
        for cross_reference in data.get('uniProtKBCrossReferences', []):
            if cross_reference.get('database') != 'RefSeq':
                continue
            for prop in cross_reference.get('properties', []):
                value = prop.get('value')
                if prop.get('key') == 'NucleotideSequenceId' and value:
                    # Drop the version suffix: "NM_014911.3" -> "NM_014911"
                    refseq_id = value.split('.')[0]
                    break
            if refseq_id is not None:
                break

        # ``genes`` may be missing *or* an empty list; both mean "no gene name".
        genes = data.get('genes') or [{}]

        # UniProtResult has no EMBL_id field, so none is passed.
        return UniProtResult(
            primaryAccession=data.get('primaryAccession', ''),
            uniProtkbId=data.get('uniProtkbId', ''),
            sequence=data.get('sequence', {}).get('value', ''),
            geneName=genes[0].get('geneName', {}).get('value'),
            RefSeq_id=refseq_id,
        )

    except Exception as e:
        # Best-effort lookup: report the problem and signal "not found".
        print(f"Error searching UniProt for gene {gene_id}: {e}")
        return None

# NOTE(review): "Q16769" looks like a UniProt accession rather than a gene symbol, and no
# output was recorded for this cell — presumably the gene: query matched nothing; confirm intent.
search_uniprot_by_gene("Q16769")

In [46]:
from Bio import Entrez, SeqIO

# Contact address set on Bio.Entrez before get_cds_from_refseq issues any request.
Entrez.email = "robbe.claeys@ugent.be"

def get_cds_from_refseq(refseq_id: str) -> Optional[Dict[str, Any]]:
    """
    Retrieve the coding sequence (CDS) and related information from a RefSeq ID.

    Args:
        refseq_id: The RefSeq ID (e.g., "NM_014911")

    Returns:
        Dictionary with two lists, or None if ``refseq_id`` is empty or the
        record could not be fetched/parsed (errors are printed, not raised):

        - ``"cds_sequences"``: nucleotide sequence of every CDS feature
        - ``"cds_translations"``: the ``translation`` qualifier of every CDS
          feature that has one

        NOTE: a CDS without a ``translation`` qualifier contributes only to
        ``"cds_sequences"``, so the two lists can differ in length.
    """
    # Upstream ids are Optional; skip the network round-trip when there is none.
    if not refseq_id:
        return None

    try:
        # Fetch the record from NCBI
        handle = Entrez.efetch(db="nucleotide", id=refseq_id, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Close the handle even when parsing fails.
            handle.close()

        result = {
            "cds_sequences": [],
            "cds_translations": []
        }

        # Extract CDS features
        for feature in record.features:
            if feature.type != "CDS":
                continue

            result["cds_sequences"].append(str(feature.extract(record.seq)))

            # Get protein translation if available
            if "translation" in feature.qualifiers:
                result["cds_translations"].append(feature.qualifiers["translation"][0])

        return result

    except Exception as e:
        # Best-effort fetch: report the problem and signal failure with None.
        print(f"Error fetching RefSeq record {refseq_id}: {e}")
        return None

# NOTE(review): the output recorded below is two empty lists, i.e. no CDS feature was found in
# the parsed record — presumably the "gb" fetch of this accession carries no features; confirm.
get_cds_from_refseq("NC_002516")

{'cds_sequences': [], 'cds_translations': []}

In [42]:
# Scratch check: three times the length of an amino-acid sequence ...
len("MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPAILNSSALRQIAEGTSISEMWQNDLQPLLIERYPGSPGSYAARQHIMQRIQRLQADWVLEIDTFLSQTPYGYRSFSNIISTLNPTAKRHLVLACHYDSKYFSHWNNRVFVGATDSAVPCAMMLELARALDKKLLSLKTVSDSKPDLSLQLIFFDGEEAFLHWSPQDSLYGSRHLAAKMASTPHPPGARGTSQLHGMDLLVLLDLIGAPNPTFPNFFPNSARWFERLQAIEHELHELGLLKDHSLEGRYFQNYSYGGVIQDDHIPFLRRGVPVLHLIPSPFPEVWHTMDDNEENLDESTIDNLNKILQVFVLEYLHL") * 3

# ... next to the length of a nucleotide sequence (only this last expression is echoed by the notebook).
len("GCAGGCGGAAGACACCGGCGCGTCGTGGGCACCCTCCACCTGCTGCTGCTGGTGGCCGCCCTGCCCTGGGCATCCAGGGGGGTCAGTCCGAGTGCCTCAGCCTGGCCAGAGGAGAAGAATTACCACCAGCCAGCCATTTTGAATTCATCGGCTCTTCGGCAAATTGCAGAAGGCACCAGTATCTCTGAAATGTGGCAAAATGACTTACAGCCATTGCTGATAGAGCGATACCCGGGATCCCCTGGAAGCTATGCTGCTCGTCAGCACATCATGCAGCGAATTCAGAGGCTTCAGGCTGACTGGGTCTTGGAAATAGACACCTTCTTGAGTCAGACACCCTATGGGTACCGGTCTTTCTCAAATATCATCAGCACCCTCAATCCCACTGCTAAACGACATTTGGTCCTCGCCTGCCACTATGACTCCAAGTATTTTTCCCACTGGAACAACAGAGTGTTTGTAGGAGCCACTGATTCAGCCGTGCCATGTGCAATGATGTTGGAACTTGCTCGTGCCTTAGACAAGAAACTCCTTTCCTTAAAGACTGTTTCAGACTCCAAGCCAGATTTGTCACTCCAGCTGATCTTCTTTGATGGTGAAGAGGCTTTTCTTCACTGGTCTCCTCAAGATTCTCTCTATGGGTCTCGACACTTAGCTGCAAAGATGGCATCGACCCCGCACCCACCTGGAGCGAGAGGCACCAGCCAACTGCATGGCATGGATTTATTGGTCTTATTGGATTTGATTGGAGCTCCAAACCCAACGTTTCCCAATTTTTTTCCAAACTCAGCCAGGTGGTTCGAAAGACTTCAAGCAATTGAACATGAACTTCATGAATTGGGTTTGCTCAAGGATCACTCTTTGGAGGGGCGGTATTTCCAGAATTACAGTTATGGAGGTGTGATTCAGGATGACCATATTCCATTTTTAAGAAGAGGTGTTCCAGTTCTGCATCTGATACCGTCTCCTTTCCCTGAAGTCTGGCACACCATGGATGACAATGAAGAAAATTTGGATGAATCAACCATTGACAATCTAAACAAAATCCTACAAGTCTTTGTGTTGGAATATCTTCATTTGTAA")

1083

In [22]:
import hashlib
from Bio.Blast import NCBIWWW, NCBIXML

# NOTE(review): not referenced by search_by_blast below, which only reads the first
# alignment — presumably meant to cap the number of BLAST hits considered; confirm or remove.
TOP_N_MATCHES = 3

def search_by_blast(aa_sequence: str) -> Optional[str]:
    """
    Search for a protein using BLAST.

    Submits the sequence to NCBI ``blastp`` against the ``swissprot`` database
    and prints the first alignment before returning its accession.

    Args:
        aa_sequence: Amino acid sequence to search for

    Returns:
        Accession of the first alignment, or None if the sequence is empty,
        there are no alignments, or any error occurs (errors are printed,
        not raised)
    """
    # Nothing to search for; avoid a pointless remote BLAST job.
    if not aa_sequence:
        return None

    # TODO: when the result cache (blast_cache / save_cache) is enabled, key it
    # on a hash of the sequence to avoid very long keys.

    try:
        # Run BLAST search
        result_handle = NCBIWWW.qblast("blastp", "swissprot", aa_sequence)
        try:
            # Parse the results
            blast_record = NCBIXML.read(result_handle)
        finally:
            # Close the handle even when parsing fails.
            result_handle.close()

        # No hits: we couldn't find a match
        if not blast_record.alignments:
            return None

        best_hit = blast_record.alignments[0]
        accession = best_hit.accession
        print(best_hit)
        return accession

    except Exception as e:
        # Best-effort search: report the problem and signal "not found".
        print(f"Error searching by BLAST: {e}")
        return None

# Example query; the output recorded below shows the printed first alignment and the returned accession 'Q2M2I8'.
search_by_blast("MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQVTVDEVLAEGGFAIVFLVRTSNGMKCALKRMFVNNEHDLQVCKREIQIMRDLSGHKNIVGYIDSSINNVSSGDVWEVLILMDFCRGGQVVNLMNQRLQTGFTENEVLQIFCDTCEAVARLHQCKTPIIHRDLKVENILLHDRGHYVLCDFGSATNKFQNPQTEGVNAVEDEIKKYTTLSYRAPEMVNLYSGKIITTKADIWALGCLLYKLCYFTLPFGESQVAICDGNFTIPDNSRYSQDMHCLIRYMLEPDPDKRPDIYQVSYFSFKLLKKECPIPNVQNSPIPAKLPEPVKASEAAAKKTQPKARLTDPIPTTETSIAPRQRPKAGQTQPNPGILPIQPALTPRKRATVQPPPQAAGSSNQPGLLASVPQPKPQAPPSQPLPQTQAKQPQAPPTPQQTPSTQAQGLPAQAQATPQHQQQLFLKQQQQQQQPPPAQQQPAGTFYQQQQAQTQQFQAVHPATQKPAIAQFPVVSQGGSQQQLMQNFYQQQQQQQQQQQQQQLATALHQQQLMTQQAALQQKPTMAAGQQPQPQPAAAPQPAPAQEPAIQAPVRQQPKVQTTPPPAVQGQKVGSLTPPSSPKTQRAGHRRILSDVTHSAVFGVPASKSTQLLQAAAAEASLNKSKSATTTPSGSPRTSQQNVYNPSEGSTWNPFDDDNFSKLTAEELLNKDFAKLGEGKHPEKLGGSAESLIPGFQSTQGDAFATTSFSAGTAEKRKGGQTVDSGLPLLSVSDPFIPLQVPDAPEKLIEGLKSPDTSLLLPDLLPMTDPFGSTSDAVIEKADVAVESLIPGLEPPVPQRLPSQTESVTSNRTDSLTGEDSLLDCSLLSNPTTDLLEEFAPTAISAPVHKAAEDSNLISGFDVPEGSDKVAEDEFDPIPVLITKNPQGGHSRNSSGSSESSLPNLARSLLLVDQLIDL")

sp|Q2M2I8.3| RecName: Full=AP2-associated protein kinase 1; AltName: Full=Adaptor-associated kinase 1 [Homo sapiens]
           Length = 961



'Q2M2I8'