In [1]:
import requests

def fetch_protein_data(query, timeout=30):
    """Search UniProtKB and summarise the best-annotated hit for ``query``.

    The query is sent to the UniProtKB search endpoint (JSON format, at most
    100 hits). Among the returned hits the one with the highest
    ``annotationScore`` is kept; on a tie the earliest hit in the response
    wins.

    Args:
        query: Search string forwarded as the ``query`` request parameter.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled connection cannot block the caller forever.

    Returns:
        dict: One of three shapes.

        * Success: keys ``"UniProt ID"``, ``"Protein Name"``, ``"Gene Name"``,
          ``"Protein Length"``, ``"Annotation Score"`` and
          ``"Protein Sequence"``; a field missing from the entry is ``"N/A"``.
        * No hits: ``{"error": "No protein data found for the given query."}``.
        * Non-200 response: ``{"error": ..., "response_text": ...}``.

    Exceptions raised by ``requests.get`` itself (connection errors,
    timeouts) are not caught here and propagate to the caller.
    """
    base_url = "https://rest.uniprot.org/uniprotkb/search"
    params = {"query": query, "format": "json", "size": 100}

    response = requests.get(base_url, params=params, timeout=timeout)

    if response.status_code != 200:
        return {
            "error": f"Failed to fetch data. HTTP Status Code: {response.status_code}",
            "response_text": response.text
        }

    results = response.json().get("results", [])
    if not results:
        return {"error": "No protein data found for the given query."}

    # max() returns the first maximal element, matching the original
    # "strictly greater than" scan: ties go to the earliest hit.
    best = max(results, key=lambda entry: entry.get("annotationScore", 0))

    # An entry may carry "genes": [] — fall back to one empty gene record so
    # the lookup below yields "N/A" instead of raising IndexError.
    genes = best.get("genes") or [{}]
    sequence = best.get("sequence", {})
    recommended_name = best.get("proteinDescription", {}).get("recommendedName", {})

    return {
        "UniProt ID": best.get("primaryAccession", "N/A"),
        "Protein Name": recommended_name.get("fullName", {}).get("value", "N/A"),
        "Gene Name": genes[0].get("geneName", {}).get("value", "N/A"),
        "Protein Length": sequence.get("length", "N/A"),
        "Annotation Score": best.get("annotationScore", "N/A"),
        "Protein Sequence": sequence.get("value", "N/A")
    }

# Two example locus identifiers; the second assignment overrides the first,
# so only "AT5G59870.1" is actually looked up.
query = "AT4G40030.2"
query = "AT5G59870.1"
result = fetch_protein_data(query)

# Notebook rendering of the returned dictionary.
display(result)

# Plain-text report: field listing on success, error details otherwise.
if "error" not in result:
    print("Results:")
    for key, value in result.items():
        print(f"  {key}: {value}")
    print()
else:
    print(result["error"])
    if "response_text" in result:
        print(f"API Response: {result['response_text']}")
    

{'UniProt ID': 'Q9FJE8',
 'Protein Name': 'Probable histone H2A.7',
 'Gene Name': 'N/A',
 'Protein Length': 150,
 'Annotation Score': 5.0,
 'Protein Sequence': 'MESTGKVKKAFGGRKPPGAPKTKSVSKSMKAGLQFPVGRITRFLKKGRYAQRLGGGAPVYMAAVLEYLAAEVLELAGNAARDNKKSRIIPRHLLLAIRNDEELGKLLSGVTIAHGGVLPNINSVLLPKKSATKPAEEKATKSPVKSPKKA'}

Results:
  UniProt ID: Q9FJE8
  Protein Name: Probable histone H2A.7
  Gene Name: N/A
  Protein Length: 150
  Annotation Score: 5.0
  Protein Sequence: MESTGKVKKAFGGRKPPGAPKTKSVSKSMKAGLQFPVGRITRFLKKGRYAQRLGGGAPVYMAAVLEYLAAEVLELAGNAARDNKKSRIIPRHLLLAIRNDEELGKLLSGVTIAHGGVLPNINSVLLPKKSATKPAEEKATKSPVKSPKKA



In [2]:
import pandas as pd
from tqdm import tqdm  

# Annotate every sheet of the MS workbook with UniProt data and write the
# annotated sheets to a new workbook.
file_path = '..\\data\\MS_data_efs+WT.xlsx'
output_path = '..\\result\\MS_data_annotation_2.xlsx'

# Final column layout: the three columns read from each sheet followed by the
# four annotation columns appended below.
row_names = ['Locus', 'description', 'MS_detect_times',
            'Uniprot_ID', 'Protein_Name', 'seq_length', 'Protein_sequence']

new_row_names = ['Uniprot_ID', 'Protein_Name', 'seq_length', 'Protein_sequence']

# Output column name -> key in the dict returned by fetch_protein_data.
uniprot_col_name = {
    'Uniprot_ID': 'UniProt ID',
    'Protein_Name': 'Protein Name',
    'seq_length': 'Protein Length',
    'Protein_sequence': 'Protein Sequence'
}

# Minimum MS_detect_times a row needs before it is looked up; one entry per
# sheet, matched by sheet position.
Ms_lower_bound = [2, 10, 10, 2, 10, 10]

result_sheets = {}
# (sheet name, row index, locus) for every row whose lookup failed twice.
failed_loci = []

# The context manager closes the workbook handle once all sheets are read.
with pd.ExcelFile(file_path) as MS_source_data:
    sheet_names = list(MS_source_data.sheet_names)

    # Fail before any network traffic rather than with an IndexError after
    # the earlier sheets have already been fetched.
    if len(sheet_names) > len(Ms_lower_bound):
        raise ValueError(
            f"workbook has {len(sheet_names)} sheets but Ms_lower_bound "
            f"only defines {len(Ms_lower_bound)} thresholds"
        )

    for slot, sheet_name in enumerate(sheet_names):

        print("pulling protein information from https://rest.uniprot.org/uniprotkb/search...")
        print(f"now dealing with sheet {sheet_name}")
        df = pd.read_excel(MS_source_data, sheet_name=sheet_name, header=None)

        for name in new_row_names:
            df[name] = None

        df.columns = row_names

        for index, row in tqdm(df.iterrows(), total=len(df)):
            if not (row['MS_detect_times'] >= Ms_lower_bound[slot]):
                continue

            # Reset per row: if both attempts fail, the row must not inherit
            # the previous row's protein data.
            fetch_result = {}

            for ordinal in ("1st", "2nd"):
                try:
                    fetch_result = fetch_protein_data(row['Locus'])
                except Exception as fetch_error:
                    print(f"An error occurred when query {ordinal}: {fetch_error}")
                    print(f"current index={index}")
                    if ordinal == "1st":
                        print("try second time")
                    else:
                        print("query failed")
                        display(row)
                        failed_loci.append((sheet_name, index, row['Locus']))
                else:
                    break

            # Error dicts and failed lookups lack these keys, so the cells
            # are filled with the string 'None'.
            for item_name in new_row_names:
                df.loc[index, item_name] = fetch_result.get(uniprot_col_name[item_name], 'None')

        result_sheets[sheet_name] = df

with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    for sheet_name, df in result_sheets.items():
        df.to_excel(writer, sheet_name=sheet_name, index=False)

print("all sheets have been exported successfully")
# Hard-coded message; it is not computed from the current run.
print("the first error in last run is AT1G26630.1")
if failed_loci:
    print(f"loci whose lookup failed twice in this run: {failed_loci}")

pulling protein information from https://rest.uniprot.org/uniprotkb/search...
now dealing with sheet efs all


2349it [43:47,  1.12s/it]


pulling protein information from https://rest.uniprot.org/uniprotkb/search...
now dealing with sheet efs H3K36me3抗体


1512it [00:00, 34201.73it/s]


pulling protein information from https://rest.uniprot.org/uniprotkb/search...
now dealing with sheet efs 肽段


1422it [00:00, 31809.43it/s]


pulling protein information from https://rest.uniprot.org/uniprotkb/search...
now dealing with sheet WT all


385it [11:48,  1.79s/it]

An error occurred when query 1st: HTTPSConnectionPool(host='rest.uniprot.org', port=443): Max retries exceeded with url: /uniprotkb/search?query=AT2G33450.1&format=json&size=100 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)')))
current index=385
try second time


495it [15:12,  1.61s/it]

An error occurred when query 1st: HTTPSConnectionPool(host='rest.uniprot.org', port=443): Max retries exceeded with url: /uniprotkb/search?query=AT1G20960.2&format=json&size=100 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)')))
current index=495
try second time


1849it [31:56,  1.04s/it]


pulling protein information from https://rest.uniprot.org/uniprotkb/search...
now dealing with sheet WT H3K36me3抗体


1158it [00:00, 31435.29it/s]


pulling protein information from https://rest.uniprot.org/uniprotkb/search...
now dealing with sheet WT 肽段


1185it [00:00, 30940.88it/s]


all sheets have been exported successfully
the first error in last run is AT1G26630.1
