# In [4]:
import pandas as pd
from itertools import product

# IUPAC nucleotide code table: each one-letter symbol maps to the list of
# concrete bases (A, C, G, T) it may stand for. The base strings below are
# split into per-character lists; entry order and base order are significant
# because later code iterates this dict in insertion order.
iupac_code = {
    symbol: list(bases)
    for symbol, bases in (
        ("A", "A"),
        ("C", "C"),
        ("G", "G"),
        ("T", "T"),
        ("R", "AG"),
        ("Y", "CT"),
        ("K", "GT"),
        ("M", "AC"),
        ("S", "GC"),
        ("W", "AT"),
        ("B", "CGT"),
        ("D", "AGT"),
        ("H", "ACT"),
        ("V", "ACG"),
        ("N", "ACGT"),
    )
}

# Codon -> residue lookup (one-letter amino-acid codes, "/" marks a stop).
# Each row pairs a two-base prefix with the four residues obtained when the
# third base is T, C, A, G (in that order). Rows are listed in the order the
# resulting dict entries must appear.
codon_to_residue = {
    prefix + third_base: residue
    for prefix, row in (
        ("TT", "FFLL"), ("CT", "LLLL"), ("AT", "IIIM"), ("GT", "VVVV"),
        ("TC", "SSSS"), ("CC", "PPPP"), ("AC", "TTTT"), ("GC", "AAAA"),
        ("TA", "YY//"), ("CA", "HHQQ"), ("AA", "NNKK"), ("GA", "DDEE"),
        ("TG", "CC/W"), ("CG", "RRRR"), ("AG", "SSRR"), ("GG", "GGGG"),
    )
    for third_base, residue in zip("TCAG", row)
}

# Define residue classification
# Every one-letter residue code used in this file ("/" denotes a stop codon)
# appears in exactly one class. Only the first four classes are reported by
# classify_residues_with_descriptions; "special" is never tested there.
residue_classification = {
    "hydrophobic": ["A", "V", "I", "L", "M", "F", "W"],
    "polar": ["S", "T", "Y", "N", "Q"],
    "positives": ["K", "R", "H"],
    "negatives": ["D", "E"],
    "special": ["/", "P", "G", "C"],
}

# Function to classify residues with descriptions
def classify_residues_with_descriptions(residues):
    """Summarise which residue classes a collection of residues covers.

    The classes "hydrophobic", "polar", "positives" and "negatives" from
    ``residue_classification`` are checked in that order:

    * If every residue belongs to one class, the result is exactly
      ``"All <class>"`` and nothing else is reported.
    * Otherwise the result is a ", "-joined list containing "Stop codon"
      (when "/" is present) followed by ``"No <class>"`` for each class that
      has no member among the residues.

    Args:
        residues: Iterable of one-letter residue codes ("/" for stop).

    Returns:
        The description string; "" when ``residues`` is empty or when no
        statement applies.
    """
    # Materialise once so generators can be scanned several times.
    residues = list(residues)

    # all() is vacuously true on an empty input, which used to label an
    # empty residue set as "All hydrophobic"; there is nothing to describe.
    if not residues:
        return ""

    classifications = []
    if "/" in residues:
        classifications.append("Stop codon")

    for class_name in ("hydrophobic", "polar", "positives", "negatives"):
        members = residue_classification[class_name]
        membership = [residue in members for residue in residues]
        if all(membership):
            # A uniform set is fully described by its single class.
            return "All " + class_name
        if not any(membership):
            classifications.append("No " + class_name)

    return ", ".join(classifications)


# Enumerate every three-letter combination of IUPAC symbols
iupac_keys = list(iupac_code)
all_iupac_codons = ["".join(symbols) for symbols in product(iupac_keys, repeat=3)]

# For each IUPAC codon, collect the residues it can encode and describe them
results = []
for iupac_codon in all_iupac_codons:
    # Substitute each symbol by its concrete bases to get the standard codons
    expanded_codons = [
        "".join(bases)
        for bases in product(*(iupac_code[symbol] for symbol in iupac_codon))
    ]
    # Translate the known codons and keep each residue once, sorted
    residues = sorted(
        {
            codon_to_residue[codon]
            for codon in expanded_codons
            if codon in codon_to_residue
        }
    )

    classification = classify_residues_with_descriptions(residues)
    results.append(
        {
            "IUPAC Codon": iupac_codon,
            "Possible Amino Acids": residues,
            "Description": classification,
        }
    )

# Tabulate the per-codon records
df_results = pd.DataFrame(results)

# Show each residue list as a comma-separated string so it can act as a group key
df_results["Possible Amino Acids"] = df_results["Possible Amino Acids"].apply(",".join)

# One row per distinct residue set: all IUPAC codons that produce it are
# merged into a single cell, and the first description in the group is kept.
# The merged column is then renamed to the plural "IUPAC Codons".
grouped_results = (
    df_results.groupby("Possible Amino Acids")
    .agg({"IUPAC Codon": ", ".join, "Description": "first"})
    .reset_index()
    .rename(columns={"IUPAC Codon": "IUPAC Codons"})
)

# Write the grouped table to disk without the index column
grouped_results.to_csv("tailored_codons.csv", index=False)

# Export the IUPAC symbol table as CSV, with each base list flattened to a
# comma-separated string.
# NOTE(review): the "IUPAC Codon" header actually labels single-letter
# symbols; it is kept as-is so the output file format does not change.
df_iupac_code = pd.DataFrame(
    {
        "IUPAC Codon": list(iupac_code),
        "Nucleotides": [",".join(bases) for bases in iupac_code.values()],
    }
)
df_iupac_code.to_csv("iupac_code.csv", index=False)

# Export the codon -> residue table as CSV, one codon per row
df_codon_to_residue = pd.DataFrame(
    {
        "Codon": list(codon_to_residue),
        "Residue": list(codon_to_residue.values()),
    }
)
df_codon_to_residue.to_csv("genetic_code.csv", index=False)