Author: Sophie-Christine Porak / ChatGPT

This script counts the number of tiles per protein segment, which serves as an estimate of overall diversity across different protein regions.
The absolute tile counts are shown as a plot, with the protein position (amino acid position) on the x-axis and the absolute number of tiles
at that position on the y-axis.

Because the input is a nucleotide sequence, positions need to be converted to amino acid coordinates, either by translating the sequence or simply
by dividing the nucleotide position by 3. Note that each tile represents 46 amino acids.

In [None]:
from Bio import SeqIO
from Bio import Entrez
from collections import defaultdict
import re

Entrez.email = "sophie.porak@ucsf.edu"  # required for NCBI access: Entrez requests must carry a contact e-mail

# Path of the FASTA file whose headers are parsed below.
# NOTE(review): the value looks like a placeholder -- replace it with a real path before running.
fasta_file = "input fasta file"

# Nested counter of the form {(protein, organism): {tile_position: count}}.
# Both levels are defaultdicts, so a missing key starts at 0 and can be incremented directly.
tile_counts = defaultdict(lambda: defaultdict(int))

# Lookup cache consulted by fetch_info: accession -> (protein_name, organism).
# Its purpose is to avoid redundant API calls for an accession that was already resolved.
accession_lookup = {}

#function to fetch protein name and organism from NCBI
def fetch_info(accession):
    """Return ``(protein_name, organism)`` for an NCBI accession.

    Successful lookups are stored in the module-level ``accession_lookup``
    dict, so a given accession is only requested from NCBI once.

    Args:
        accession: Accession string, passed to ``Entrez.efetch`` as ``id``.

    Returns:
        A ``(protein_name, organism)`` tuple. ``protein_name`` is the
        ``product`` qualifier of the first CDS feature of the record, or
        ``"unknown"`` when the record has no CDS feature or the CDS has no
        ``product`` qualifier. If fetching or parsing fails for any reason,
        a message is printed and ``("unknown", "unknown")`` is returned;
        failed lookups are not cached, so they are retried on the next call.
    """
    if accession in accession_lookup:
        return accession_lookup[accession]

    try:
        # NOTE(review): the example header in this file (WFG38034.1) looks like
        # a protein accession, while the query goes to the "nucleotide"
        # database -- confirm that this is the intended database.
        handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Release the connection even when parsing the record fails.
            handle.close()

        organism = record.annotations["organism"]
        protein_name = "unknown"
        for feature in record.features:
            if feature.type == "CDS":
                # Only the first CDS feature is considered.
                protein_name = feature.qualifiers.get("product", ["unknown"])[0]
                break
    except Exception as e:
        # Best effort: one bad accession must not abort the whole run.
        print(f"Failed to fetch {accession}: {e}")
        return "unknown", "unknown"

    # Assignment (not comparison): remember the result for later calls. Kept
    # outside the try block so a mistake here cannot be mistaken for a fetch
    # failure.
    accession_lookup[accession] = (protein_name, organism)
    return protein_name, organism
    
# Parse the FASTA headers and tally one tile per header line.
# Expected header shape: ">WFG38034.1_1_1", i.e. <accession>_<tile position>_<number>;
# only the accession and the first number are used, the second number is ignored.
# NOTE(review): the accession part of the pattern does not allow "_", so an
# accession that itself contains an underscore (e.g. "NP_000001.1_1_1") does
# not match and is reported as skipped -- confirm that this is intended.
header_pattern = re.compile(r"([A-Z0-9.]+)_(\d+)_(\d+)")

with open(fasta_file) as f:
    for line in f:
        if not line.startswith(">"):
            continue  # sequence lines carry no tile information

        # Keep only the first whitespace-separated field after the ">".
        # A bare ">" line has no fields at all; treat it as an empty header so
        # it is reported as skipped instead of raising IndexError.
        fields = line.strip()[1:].split()
        header = fields[0] if fields else ""
        match = header_pattern.match(header)
        if not match:
            print(f"Skipped header: {header}")
            continue

        accession, tile_pos, _ = match.groups()
        tile_pos = int(tile_pos)

        protein_name, organism = fetch_info(accession)
        tile_counts[(protein_name, organism)][tile_pos] += 1

# Report the tallies: one section per (protein, organism) pair, with the tile
# positions of that pair listed in ascending order.
for key, per_position in tile_counts.items():
    protein, organism = key
    print(f"\nProtein: {protein} | Organism: {organism}")
    for pos in sorted(per_position):
        count = per_position[pos]
        print(f" Tile Position {pos}: {count} tile(s)")