In [1]:
from __future__ import annotations

%cd ../

/storage/ice1/0/9/qzheng75/NL2Prot_dev/NL2UniProt


In [2]:
import requests

base_url = "https://rest.uniprot.org/uniprotkb/search?query=reviewed:true+AND+organism_id:9606&format=tsv"
cols = [
    "accession",  # Entry
    "id",  # Entry Name
    "protein_name",  # Protein names
    "gene_names",  # Gene Names
    "length",  # Length
    "cc_pharmaceutical",  # Pharmaceutical use
    "annotation_score",  # Annotation
    "go",  # Gene Ontology (GO)
    "go_f",  # GO molecular function
    "go_p",  # GO biological process
    "mass",  # Mass
    "sequence",  # Sequence
    "cc_catalytic_activity",  # Catalytic activity
    "cc_interaction",  # Interacts with
    "cc_developmental_stage",  # Developmental stage
    "cc_disease",  # Involvement in disease
    "ft_mutagen",  # Mutagenesis
    "cc_biotechnology",  # Biotechnological use
    "cc_allergen",  # Allergenic Properties
    "cc_subcellular_location",  # Subcellular location
    "keyword",  # Keywords
]
with_return = base_url + "&size=200&fields=" + ",".join(cols)

In [3]:
import re
import time

page_number = 1
results_written = 0
max_page = 1e5
next_url = with_return

with open("uniprot_results.tsv", "w", encoding="utf-8") as f:
    while next_url and page_number < max_page:
        try:
            # Make the request with headers to get Link header
            response = requests.get(next_url)
            response.raise_for_status()

            # Get the response content
            content = response.text

            # Write headers only for the first page
            if page_number == 1:
                f.write(content)
            else:
                # For subsequent pages, skip the header
                lines = content.split("\n")
                if len(lines) > 1:  # Make sure there's data
                    f.write("\n".join(lines[1:]))

            # Update results count
            results_written += len(content.split("\n")) - 2  # Subtract 1 for header

            # Get the Link header
            link_header = response.headers.get("Link", "")

            # Extract next URL if it exists
            next_url = None
            if 'rel="next"' in link_header:
                # Use regex to extract the URL between < and >
                match = re.search(r'<([^>]+)>;\s*rel="next"', link_header)
                if match:
                    next_url = match.group(1)

            if results_written % 1000 == 0:
                print(f"Processed page {page_number}, Total results: {results_written}")

            # Add a small delay to avoid hitting rate limits
            time.sleep(1)
            page_number += 1

        except requests.exceptions.RequestException as e:
            print(f"Error occurred on page {page_number}: {e}")
            # Implement exponential backoff
            retry_delay = min(300, 2 ** (page_number - 1))  # Cap at 5 minutes
            print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
            continue

        except Exception as e:
            print(f"Unexpected error occurred: {e}")
            break

Processed page 5, Total results: 1000
Processed page 10, Total results: 2000
Processed page 15, Total results: 3000
Processed page 20, Total results: 4000
Processed page 25, Total results: 5000
Processed page 30, Total results: 6000
Processed page 35, Total results: 7000
Processed page 40, Total results: 8000
Processed page 45, Total results: 9000
Processed page 50, Total results: 10000
Processed page 55, Total results: 11000
Processed page 60, Total results: 12000
Processed page 65, Total results: 13000
Processed page 70, Total results: 14000
Processed page 75, Total results: 15000
Processed page 80, Total results: 16000
Processed page 85, Total results: 17000
Processed page 90, Total results: 18000
Processed page 95, Total results: 19000
Processed page 100, Total results: 20000


In [4]:
import pandas as pd

df = pd.read_csv("uniprot_results.tsv", sep="\t")
df.head(2)

Unnamed: 0,Entry,Entry Name,Protein names,Gene Names,Length,Pharmaceutical use,Annotation,Gene Ontology (GO),Gene Ontology (molecular function),Gene Ontology (biological process),...,Sequence,Catalytic activity,Interacts with,Developmental stage,Involvement in disease,Mutagenesis,Biotechnological use,Allergenic Properties,Subcellular location [CC],Keywords
0,A0A0C5B5G6,MOTSC_HUMAN,Mitochondrial-derived peptide MOTS-c (Mitochon...,MT-RNR1,16,,5.0,extracellular space [GO:0005615]; mitochondrio...,DNA binding [GO:0003677]; DNA-binding transcri...,activation of protein kinase activity [GO:0032...,...,MRWQEMGYIFYPRKLR,,,DEVELOPMENTAL STAGE: Circulating plasma levels...,,"MUTAGEN 5; /note=""E->A: Lack of enhanced glyco...",,,SUBCELLULAR LOCATION: Secreted {ECO:0000269|Pu...,DNA-binding;Mitochondrion;Nucleus;Osteogenesis...
1,A0A1B0GTW7,CIROP_HUMAN,Ciliated left-right organizer metallopeptidase...,CIROP LMLN2,788,,5.0,cytoplasm [GO:0005737]; membrane [GO:0016020];...,metal ion binding [GO:0046872]; metalloendopep...,cell adhesion [GO:0007155]; establishment of l...,...,MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKS...,,,,"DISEASE: Heterotaxy, visceral, 12, autosomal (...",,,,SUBCELLULAR LOCATION: Membrane {ECO:0000255}; ...,Alternative splicing;Disease variant;Glycoprot...


In [5]:
len(df)

20421

# Process catalytic activity feature

In [6]:
import re


def extract_catalytic_activity_features(activity_string):
    # Compile all patterns
    if activity_string is None or str(activity_string) == "nan":
        return {}
    patterns = {
        "reaction": re.compile(r"Reaction=(.+?);"),
        "ec": re.compile(r"EC=(\d+\.\d+\.\d+\.\d+);"),
    }

    # Extract all features
    features = {}
    for key, pattern in patterns.items():
        match = pattern.search(activity_string)
        if match:
            features[key] = match.group(1)

    return features

In [7]:
def clean_biotechnology_text(text):
    if str(text) == "nan":
        return "Not Available"
    # Remove "BIOTECHNOLOGY: " prefix and trailing "{ECO:...}"
    pattern = r"^BIOTECHNOLOGY:\s*(.*?)\s*\{ECO:.*\}\.?$"
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return text

In [8]:
def clean_subcellular_location(text):
    if str(text) == "nan":
        return "Not Available"
    text = re.sub(r"^SUBCELLULAR LOCATION:\s*", "", text)
    text = re.sub(r"\{[^}]*\}", "", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\.\s*\.", ".", text)
    text = text.replace(".", ";").replace(" ;", ";").strip().rstrip(".")
    return text

In [9]:
def clean_go_terms(text):
    if str(text) == "nan":
        return "Not Available"
    cleaned_text = re.sub(r"\s*\[GO:\d+\]", "", text)
    terms = [term.strip() for term in cleaned_text.split(";")]
    return "; ".join(terms)

In [10]:
all_entry_names = set(df["Entry"])


def clean_interaction_text(text):
    if str(text) == "nan":
        return "Not Available"
    process_s = "; ".join([s for s in text.split("; ") if s in all_entry_names])
    return process_s.rstrip("; ")

In [11]:
def clean_development_text(text):
    if str(text) == "nan":
        return "Not Available"
    pattern = r"DEVELOPMENTAL STAGE:\s*(.*?)\s*\{ECO:.*\}\."
    match = re.search(pattern, text)
    if match:
        return match.group(1).strip()
    return text

In [12]:
def parse_mutagen_entries(text):
    if text is None or str(text) == "nan":
        return []
    entries = re.split(r"MUTAGEN\s+", text)
    entries = [entry.strip() for entry in entries if entry.strip()]

    parsed_entries = []
    for entry in entries:
        position_match = re.match(r"(\d+\.\.\d+);", entry)
        note_match = re.search(r'/note="([^"]+)"', entry)

        if position_match and note_match:
            position = position_match.group(1)
            note = note_match.group(1)
            parsed_entries.append({"mutagen": position, "note": note})

    return parsed_entries

In [13]:
df["Catalytic activity"] = df["Catalytic activity"].apply(
    extract_catalytic_activity_features
)
df["Gene Ontology (biological process)"] = df[
    "Gene Ontology (biological process)"
].replace(float("nan"), "Not Available")
df["Biotechnological use"] = df["Biotechnological use"].apply(clean_biotechnology_text)
df["Involvement in disease"] = df["Involvement in disease"].replace(
    float("nan"), "Not Available"
)
df["Allergenic Properties"] = df["Allergenic Properties"].replace(
    float("nan"), "Not Available"
)
df["Pharmaceutical use"] = df["Pharmaceutical use"].replace(
    float("nan"), "Not Available"
)
df["Subcellular location [CC]"] = df["Subcellular location [CC]"].apply(
    clean_subcellular_location
)
df["Gene Ontology (GO)"] = df["Gene Ontology (GO)"].apply(clean_go_terms)
df["Gene Ontology (molecular function)"] = df[
    "Gene Ontology (molecular function)"
].apply(clean_go_terms)
df["Interacts with"] = df["Interacts with"].apply(clean_interaction_text)
df["Developmental stage"] = df["Developmental stage"].apply(clean_development_text)
df["Mutagenesis"] = df["Mutagenesis"].apply(parse_mutagen_entries)
df = df[df["Annotation"] >= 4].drop(columns=["Annotation"])
df = df[df["Sequence"].str.len() <= 1022]

In [17]:
df.to_csv("raw_data/new_uniprot_processed.tsv", sep="\t", index=False)

In [14]:
len(df)

14378

In [2]:
%cd ../

import pandas as pd

df = pd.read_csv("raw_data/new_uniprot_processed.tsv", sep="\t")
df.head(2)

/storage/ice1/0/9/qzheng75/NL2Prot_dev/NL2UniProt


Unnamed: 0,Entry,Entry Name,Protein names,Gene Names,Length,Pharmaceutical use,Gene Ontology (GO),Gene Ontology (molecular function),Gene Ontology (biological process),Mass,Sequence,Catalytic activity,Interacts with,Developmental stage,Involvement in disease,Mutagenesis,Biotechnological use,Allergenic Properties,Subcellular location [CC],Keywords
0,A0A0C5B5G6,MOTSC_HUMAN,Mitochondrial-derived peptide MOTS-c (Mitochon...,MT-RNR1,16,Not Available,extracellular space; mitochondrion; nucleus; D...,DNA binding; DNA-binding transcription factor ...,activation of protein kinase activity [GO:0032...,2175,MRWQEMGYIFYPRKLR,{},Not Available,Circulating plasma levels decrease with age wh...,Not Available,"[{'mutagen': '11..14', 'note': 'YPRK->AAAA: Ab...",Not Available,Not Available,Secreted; Mitochondrion; Nucleus; Note=Translo...,DNA-binding;Mitochondrion;Nucleus;Osteogenesis...
1,A0A1B0GTW7,CIROP_HUMAN,Ciliated left-right organizer metallopeptidase...,CIROP LMLN2,788,Not Available,cytoplasm; membrane; metal ion binding; metall...,metal ion binding; metalloendopeptidase activi...,cell adhesion [GO:0007155]; establishment of l...,85397,MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKS...,{},Not Available,Not Available,"DISEASE: Heterotaxy, visceral, 12, autosomal (...",[],Not Available,Not Available,Membrane; Single-pass type I membrane protein;,Alternative splicing;Disease variant;Glycoprot...


In [5]:
names = df["Entry"].tolist()
sequences = df["Sequence"].tolist()

In [6]:
with open("raw_data/fasta_files/latest_human_seqs.fasta", "w") as fasta_file:
    for name, sequence in zip(names, sequences):
        fasta_file.write(f">{name}\n{sequence}\n")