In [6]:
import requests
import time

INPUT_FILE = "/Users/saanviaima/Documents/GitHub/Hyrbid-Alignment/HMM-iter1-scores-table.txt"
OUTPUT_FASTA = "all_sequences.fasta"
NOT_FOUND_LOG = "not_found.txt"
ORGANISM_ID = 9606  #human
PAUSE_BETWEEN_REQUESTS = 0.3  

def extract_gene_names(file_path):
    def extract_middle_from_name(name):
        parts = name.split('_')
        if len(parts) >= 3:
            return parts[1]  
        elif len(parts) == 2:
            return parts[1] 
        return None

    gene_names = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.strip():
                name = line.split()[0]
                gene = extract_middle_from_name(name)
                if gene:
                    gene_names.append(gene)
    return list(set(gene_names)) 


def get_uniprot_accession(gene_name):
    url = (
        f"https://rest.uniprot.org/uniprotkb/search?"
        f"query=gene:{gene_name}+AND+organism_id:{ORGANISM_ID}+AND+reviewed:true"
        f"&fields=accession&format=tsv"
    )
    r = requests.get(url)
    if r.status_code == 200:
        lines = r.text.strip().split('\n')
        if len(lines) > 1:
            return lines[1].split('\t')[0]  
    return None

def download_fasta(accession):
    url = f"https://rest.uniprot.org/uniprotkb/{accession}.fasta"
    r = requests.get(url)
    if r.status_code == 200:
        return r.text
    return None


gene_names = extract_gene_names(INPUT_FILE)
print(f"Found {len(gene_names)} unique gene names.")

with open(OUTPUT_FASTA, 'w') as fasta_out, open(NOT_FOUND_LOG, 'w') as log_out:
    for gene in gene_names:
        print(f"Fetching {gene}...", end='')
        accession = get_uniprot_accession(gene)
        if accession:
            sequence = download_fasta(accession)
            if sequence:
                fasta_out.write(sequence)
                print(f" ✓ ({accession})")
            else:
                print(" ✗ (FASTA not found)")
                log_out.write(f"{gene}\t{accession}\tFASTA not found\n")
        else:
            print(" ✗ (No accession)")
            log_out.write(f"{gene}\tNOT FOUND\n")
        time.sleep(PAUSE_BETWEEN_REQUESTS)


Found 484 unique gene names.
Fetching MARK2... ✓ (Q7KZI7)
Fetching GRK2... ✓ (P25098)
Fetching STK32B... ✓ (Q9NY57)
Fetching DCLK3... ✓ (Q9C098)
Fetching AKT1... ✓ (P31749)
Fetching SIK1B... ✗ (No accession)
Fetching MAPK11... ✓ (Q15759)
Fetching LIMK1... ✓ (P53667)
Fetching CSNK1G2... ✓ (P78368)
Fetching ARAF... ✓ (P10398)
Fetching LRRK1... ✓ (Q38SD2)
Fetching HUNK... ✓ (P57058)
Fetching PIM3... ✓ (Q86V86)
Fetching SPEG... ✓ (Q15772)
Fetching EPHB6... ✓ (O15197)
Fetching NTRK3... ✓ (Q16288)
Fetching SRPK1... ✓ (Q96SB4)
Fetching INSRR... ✓ (P14616)
Fetching ILK... ✓ (Q13418)
Fetching ALK... ✓ (Q9UM73)
Fetching EPHB3... ✓ (P54753)
Fetching STKLD1... ✓ (Q8NE28)
Fetching TTBK2... ✓ (Q6IQ55)
Fetching STK38... ✓ (Q15208)
Fetching CDK20... ✓ (Q8IZL9)
Fetching MAP3K19... ✓ (Q56UN5)
Fetching TNIK... ✓ (Q9UKE5)
Fetching CDK9... ✓ (P50750)
Fetching PKN3... ✓ (Q6P5Z2)
Fetching TNK2... ✓ (Q07912)
Fetching CSNK2A2... ✓ (P19784)
Fetching PBK... ✓ (Q96KB5)
Fetching IRAK1... ✓ (P51617)
Fetching ROS1..