# Genetic Code Exploration Helper

## Initialization

In [None]:
# Imports and reference table definitions

import ipywidgets as widgets
from IPython.display import display, clear_output
from tabulate import tabulate
from matplotlib import pyplot as plt
from matplotlib.patches import Patch
import numpy as np

# Silence RuntimeWarnings (e.g. NumPy divide-by-zero) so they do not clutter the notebook output.
# Note: the filter below is global and hides every RuntimeWarning raised after this cell runs.
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# H. sapiens Codon Usage Table
# Maps each one-letter amino acid code to its synonymous codons (RNA alphabet, U instead of T)
# and to the relative usage of each codon. 'codons' and 'frequencies' are parallel lists, and
# the frequencies listed for one amino acid sum to 1 (up to rounding).
h_sapiens = {
    'A': {'codons': ['GCU', 'GCC', 'GCA', 'GCG'], 'frequencies': [0.27, 0.4, 0.23, 0.1]},
    'R': {'codons': ['CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'], 'frequencies': [0.08, 0.19, 0.11, 0.21, 0.21, 0.2]},
    'N': {'codons': ['AAU', 'AAC'], 'frequencies': [0.47, 0.53]},
    'D': {'codons': ['GAU', 'GAC'], 'frequencies': [0.46, 0.54]},
    'C': {'codons': ['UGU', 'UGC'], 'frequencies': [0.46, 0.54]},
    'E': {'codons': ['GAA', 'GAG'], 'frequencies': [0.42, 0.58]},
    'Q': {'codons': ['CAA', 'CAG'], 'frequencies': [0.27, 0.73]},
    'G': {'codons': ['GGU', 'GGC', 'GGA', 'GGG'], 'frequencies': [0.16, 0.34, 0.25, 0.25]},
    'H': {'codons': ['CAU', 'CAC'], 'frequencies': [0.42, 0.58]},
    'I': {'codons': ['AUU', 'AUC', 'AUA'], 'frequencies': [0.36, 0.47, 0.17]},
    'L': {'codons': ['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG'], 'frequencies': [0.07, 0.13, 0.13, 0.2, 0.07, 0.4]},
    'K': {'codons': ['AAA', 'AAG'], 'frequencies': [0.43, 0.57]},
    'M': {'codons': ['AUG'], 'frequencies': [1.0]},
    'F': {'codons': ['UUU', 'UUC'], 'frequencies': [0.46, 0.54]},
    'P': {'codons': ['CCU', 'CCC', 'CCA', 'CCG'], 'frequencies': [0.29, 0.32, 0.28, 0.11]},
    'S': {'codons': ['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC'], 'frequencies': [0.18, 0.22, 0.15, 0.06, 0.15, 0.24]},
    'T': {'codons': ['ACU', 'ACC', 'ACA', 'ACG'], 'frequencies': [0.25, 0.36, 0.28, 0.11]},
    'W': {'codons': ['UGG'], 'frequencies': [1]},
    'Y': {'codons': ['UAU', 'UAC'], 'frequencies': [0.44, 0.56]},
    'V': {'codons': ['GUU', 'GUC', 'GUA', 'GUG'], 'frequencies': [0.18, 0.24, 0.12, 0.46]},
    # Stop codons are not part of this table; the commented entry below shows the
    # expected shape if they are added.
    # '*': {'codons': ['UAA', 'UAG', 'UGA'], 'frequencies': [0.3, 0.3, 0.4]}
}

# E. coli Codon Usage Table
# Maps each one-letter amino acid code to its synonymous codons (RNA alphabet, U instead of T)
# and to the relative usage of each codon. 'codons' and 'frequencies' are parallel lists, and
# the frequencies listed for one amino acid sum to 1 (up to rounding).
e_coli = {
    'A': {'codons': ['GCU', 'GCC', 'GCA', 'GCG'], 'frequencies': [0.18, 0.26, 0.23, 0.33]},
    'R': {'codons': ['CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'], 'frequencies': [0.36, 0.36, 0.07, 0.11, 0.07, 0.03]},
    'N': {'codons': ['AAU', 'AAC'], 'frequencies': [0.49, 0.51]},
    'D': {'codons': ['GAU', 'GAC'], 'frequencies': [0.63, 0.37]},
    'C': {'codons': ['UGU', 'UGC'], 'frequencies': [0.46, 0.54]},
    'E': {'codons': ['GAA', 'GAG'], 'frequencies': [0.68, 0.32]},
    'Q': {'codons': ['CAA', 'CAG'], 'frequencies': [0.34, 0.66]},
    'G': {'codons': ['GGU', 'GGC', 'GGA', 'GGG'], 'frequencies': [0.35, 0.37, 0.13, 0.15]},
    'H': {'codons': ['CAU', 'CAC'], 'frequencies': [0.57, 0.43]},
    'I': {'codons': ['AUU', 'AUC', 'AUA'], 'frequencies': [0.5, 0.39, 0.11]},
    'L': {'codons': ['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG'], 'frequencies': [0.14, 0.13, 0.12, 0.1, 0.04, 0.47]},
    'K': {'codons': ['AAA', 'AAG'], 'frequencies': [0.74, 0.26]},
    'M': {'codons': ['AUG'], 'frequencies': [1]},
    'F': {'codons': ['UUU', 'UUC'], 'frequencies': [0.58, 0.42]},
    'P': {'codons': ['CCU', 'CCC', 'CCA', 'CCG'], 'frequencies': [0.18, 0.13, 0.2, 0.49]},
    'S': {'codons': ['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC'], 'frequencies': [0.17, 0.15, 0.14, 0.14, 0.16, 0.24]},
    'T': {'codons': ['ACU', 'ACC', 'ACA', 'ACG'], 'frequencies': [0.19, 0.4, 0.17, 0.24]},
    'W': {'codons': ['UGG'], 'frequencies': [1]},
    'Y': {'codons': ['UAU', 'UAC'], 'frequencies': [0.59, 0.41]},
    'V': {'codons': ['GUU', 'GUC', 'GUA', 'GUG'], 'frequencies': [0.28, 0.2, 0.17, 0.35]},
    # Stop codons are not part of this table; the commented entry below shows the
    # expected shape if they are added.
    # '*': {'codons': ['UAA', 'UAG', 'UGA'], 'frequencies': [0.6, 0.1, 0.3]}
}

# Map for dropdown selection: display label -> codon usage table.
# To offer another organism, define its table above and add one entry here.
reference_tables = {
    "H. sapiens": h_sapiens,
    "E. coli": e_coli
}

# Standard genetic code as one flat string in RNA format: "<codon> <AA> <codon> <AA> ...",
# where <AA> is the one-letter amino acid code and '*' marks a stop codon.
homo_ref = "UUU F UCU S UAU Y UGU C UUC F UCC S UAC Y UGC C UUA L UCA S UAA * UGA * UUG L UCG S UAG * UGG W CUU L CCU P CAU H CGU R CUC L CCC P CAC H CGC R CUA L CCA P CAA Q CGA R CUG L CCG P CAG Q CGG R AUU I ACU T AAU N AGU S AUC I ACC T AAC N AGC S AUA I ACA T AAA K AGA R AUG M ACG T AAG K AGG R GUU V GCU A GAU D GGU G GUC V GCC A GAC D GGC G GUA V GCA A GAA E GGA G GUG V GCG A GAG E GGG G"

# Every entry fills a fixed 6-character slot ("NNN X "): characters 0-2 are the codon and
# character 4 is the amino acid. Collect them as [codon, amino_acid] pairs in string order.
ref_cod = [
    [homo_ref[start:start + 3], homo_ref[start + 4]]
    for start in range(0, len(homo_ref), 6)
    if start + 4 < len(homo_ref)  # ignore a truncated trailing slot
]

# Codons that terminate translation
stop_codons = {"UAA", "UAG", "UGA"}

## Analysis Block

In [None]:
# Main Analysis Function

# Close every open matplotlib figure before (re)defining the analysis
plt.close('all')

# Placeholder passed as ref_table (and used as the dropdown value) when no comparison is required
NO_COMP_VALUE = "__NO_COMPARISON__"

def _draw_codon_stack(x_pos, width, codons, frequencies, slots, palette,
                      max_font_size, text_color, hatch=None):
    """
    Draws one stacked bar of codon frequencies and labels each segment.

    Args:
        x_pos (float): X coordinate of the bar centre.
        width (float): Width of the bar.
        codons (list): Codon labels, bottom segment first.
        frequencies (list): Relative frequency of each codon (segment heights).
        slots (list): Palette slot of each codon; taken modulo the palette length.
        palette (list): Colours the segment fills are picked from.
        max_font_size (int): Label size of the most frequent codon(s); all other labels
                             use 80 % of it. No labels are drawn unless this is positive.
        text_color (str): Colour of the codon labels.
        hatch (str or None): Optional hatch pattern applied to every segment.
    """
    sub_size = round(0.8 * max_font_size)
    top_freq = max(frequencies, default=0)  # hoisted: constant for the whole bar
    bottom = 0
    for codon, freq, slot in zip(codons, frequencies, slots):
        plt.bar(x_pos, freq, width, bottom=bottom, color=palette[slot % len(palette)],
                hatch=hatch, edgecolor='grey', linewidth=0.5)
        if max_font_size > 0:
            # The dominant codon(s) are emphasised: larger, bold and rotated
            is_max_freq = (freq == top_freq)
            plt.text(x_pos, bottom + freq / 2, codon, ha='center', va='center',
                     fontsize=max_font_size if is_max_freq else sub_size,
                     color=text_color,
                     fontweight='bold' if is_max_freq else 'normal',
                     rotation=90 if is_max_freq else 0)
        bottom += freq


def analyze_sequence(project, input_seq, max_font_size, ref_table, ref_table_name):
    """
    Analyzes the codon usage of the input sequence, optionally comparing against a reference table.

    The sequence is cleaned (whitespace removed, upper-cased, T -> U), trimmed to whole codons
    and truncated at the first in-frame stop codon. The relative frequency of every observed
    codon among the codons of its amino acid is printed as a table and shown as one stacked
    bar per amino acid; with a reference table, a hatched reference bar is drawn next to it.
    Problems with the input are reported via print() and end the analysis early.

    Args:
        project (str): Title of the plot.
        input_seq (str): The raw nucleic acid sequence input by the user.
        max_font_size (int): Font size of the label of the most frequent codon per bar;
                             other codon labels use 80 % of it. Values <= 0 disable labels.
        ref_table (dict or str): The chosen reference codon usage dictionary (e.g., h_sapiens)
                                 or the placeholder string NO_COMP_VALUE if no comparison is selected.
                                 None is treated like NO_COMP_VALUE.
        ref_table_name (str): The name of the reference table (e.g., "H. sapiens")
                              or "No Comparison".

    Returns:
        None
    """
    # --- 1. Input Processing and Validation ---
    # Modify this part if visual inclusion of stop codons is desired
    print("--- Processing input sequence ---")
    if not input_seq.strip():
        print("ERROR: Input sequence is empty.")
        return

    # split()/join drops every kind of whitespace (spaces, tabs, line breaks), so pasted
    # multi-line or tab-separated sequences are not rejected as illegal characters
    query = "".join(input_seq.split()).upper().replace("T", "U")

    invalid_found = set(query) - set("ACGU")
    if invalid_found:
        print("ERROR: Illegal characters detected. Input must be a nucleotide sequence (A, C, G, T/U).")
        print(f"   Invalid characters found: {', '.join(sorted(invalid_found))}")
        return

    lengthmod = len(query) % 3
    if lengthmod != 0:
        query = query[:-lengthmod]
        print(f"WARNING: Trimmed incomplete 3' codon ({lengthmod} bases removed).")

    seq_cods = []  # In-frame codons up to (not including) the first stop codon
    stop_flag = False
    for start in range(0, len(query), 3):
        codon = query[start:start + 3]
        if codon in stop_codons:
            print(f"WARNING: Stop codon '{codon}' found at base {start+1}/{len(query)}. Sequence will be truncated.")
            query = query[:start]
            stop_flag = True
            break
        seq_cods.append(codon)

    num_codons_analyzed = len(seq_cods)
    print(f"Sequence length (after trimming): {len(query)} bases")
    print(f"Stop codons found: {stop_flag}")
    print(f"Codons entering analysis: {num_codons_analyzed}")
    print("-" * 30 + "\n")

    # --- 2. Codon/AA Counting ---
    observed = {}  # codon -> absolute count in the input
    for codon in seq_cods:
        observed[codon] = observed.get(codon, 0) + 1

    # amino acid -> all of its codons in reference order (stop codons excluded)
    synonymous = {}
    for codon, aa in ref_cod:
        if aa != '*':
            synonymous.setdefault(aa, []).append(codon)

    # --- 3. Frequency Calculation and Formatting ---
    anal_raw = []       # Final analysis result for plotting: [aa, [codons], [frequencies]]
    anal_pretty = []    # Final analysis result for tabular output
    for aa in sorted(synonymous):
        codons_list = [codon for codon in synonymous[aa] if observed.get(codon)]
        if not codons_list:
            continue  # amino acid does not occur in the input
        counts_list = [observed[codon] for codon in codons_list]
        cumulative = sum(counts_list)
        frequencies = [count / cumulative for count in counts_list]  # Relative codon freqs
        anal_pretty.append([
            aa,
            ", ".join(codons_list),
            ", ".join([f"{r:.2f}" for r in frequencies])
        ])
        anal_raw.append([aa, codons_list, frequencies])

    if not anal_raw:
        print("ERROR: No codons matching the standard genetic code were counted in the input.")
        return

    # --- 4. Output Table ---
    print("--- Codon Usage Frequency in Input Sequence ---")
    col_names = ["AA", "Codons Found", "Observed Frequencies"]
    print(tabulate(anal_pretty, headers=col_names, tablefmt="grid"))
    print("-" * 30 + "\n")

    # --- 5. Output Plot ---
    print("--- Generating codon usage plot ---")

    amino_acids = [item[0] for item in anal_raw]  # Amino acids for the x-axis ticks

    # Change plot colors here, if desired (colors_list_ref holds the lighter tint of each colour)
    colors_list = ['#8986e5', '#f6786c', '#36b600', '#00bfc3', '#9690fe', '#e66bf3']
    colors_list_ref = ['#b8b6ef', '#faada7', '#88d366', '#66c5e8', '#bfbcff', '#f1a6f8']

    plt.figure(figsize=(17, 8))
    index = np.arange(len(amino_acids))

    bar_width = 0.35

    # None is accepted as "no comparison" as well, instead of failing on ref_table[aa]
    is_comparing = ref_table is not None and ref_table != NO_COMP_VALUE

    for pos, (aa, codons, frequencies) in zip(index, anal_raw):
        family = synonymous[aa]

        # Bar positions
        if is_comparing:
            input_pos = pos - bar_width / 2
            ref_pos = pos + bar_width / 2
            current_bar_width = bar_width
        else:  # Not comparing: one wider, centred bar
            input_pos = pos
            ref_pos = None
            current_bar_width = bar_width * 1.5

        # Plot: Analyzed Sequence. The colour slot is the codon's position among ALL
        # synonymous codons, so a codon keeps the same hue as in the reference bar even
        # when other synonymous codons are absent from the input.
        _draw_codon_stack(input_pos, current_bar_width, codons, frequencies,
                          [family.index(codon) for codon in codons],
                          colors_list, max_font_size, 'white')

        # Plot: Reference Table Graph (skipped for amino acids the table does not list)
        if is_comparing and aa in ref_table:
            ref_codons = ref_table[aa]['codons']
            ref_frequencies = ref_table[aa]['frequencies']
            ref_slots = [family.index(codon) if codon in family else j
                         for j, codon in enumerate(ref_codons)]
            _draw_codon_stack(ref_pos, current_bar_width, ref_codons, ref_frequencies,
                              ref_slots, colors_list_ref, max_font_size, 'black',
                              hatch='///')

    # --- Legend Handles ---
    legend_handles = [Patch(facecolor='white', edgecolor='black', label='Analyzed Sequence')]
    if is_comparing:  # Handle for reference table
        legend_handles.append(Patch(facecolor='white', edgecolor='black', hatch='///',
                                    label=f'Reference: {ref_table_name}'))

    # Time to plot
    plt.xlabel('Amino Acids')
    plt.ylabel('Relative Codon Frequency')
    plt.title(project)
    plt.xticks(index, amino_acids, rotation=45, ha='center')
    plt.ylim(0, 1.05)
    plt.legend(handles=legend_handles)
    plt.tight_layout()
    plt.show()

# (End of analyze_sequence function)

## GUI Handling

In [None]:
# Create Widgets

# Title field: its text is handed to the analysis as the project name / plot title
project_name = widgets.Text(
    value='Codon Usage Analysis',
    placeholder='Enter project name',
    description='Title:',
    layout=widgets.Layout(width='95%'),
)

# Multi-line field for the nucleic acid sequence to analyze
seq_input = widgets.Textarea(
    placeholder='Enter nucleic acid sequence here (A, C, G, T or U)',
    description='Sequence:',
    layout=widgets.Layout(height='150px', width='95%'),
)

# --- Dropdown for Reference Choice ---
# "No Comparison" is represented by a placeholder string rather than None;
# every other option carries its codon usage table as the value.
NO_COMP_VALUE = "__NO_COMPARISON__"
dropdown_options = [("No Comparison", NO_COMP_VALUE), *reference_tables.items()]

ref_select = widgets.Dropdown(
    options=dropdown_options,
    value=NO_COMP_VALUE,  # start with "No Comparison" selected
    description='Reference:',
)

# --- Font Size ---
# BoundedIntText is used instead of IntText because only the bounded variant has
# min/max traits; on a plain IntText the `min = 0` argument is not enforced.
fontsize_input = widgets.BoundedIntText(
    value = 12,  # Default font size for max frequency codon
    min = 0,     # 0 switches the codon labels off
    max = 100,   # Upper bound required by the bounded widget
    step = 1,
    description = 'Label Size:',
    layout = widgets.Layout(width='200px')
)

# Green "Run Analysis" button; the click handler is attached separately
run_button = widgets.Button(
    description='Run Analysis',
    button_style='success',
    tooltip='Click to analyze the sequence',
    icon='cogs',
)

# Bordered, scrollable area that receives the printed report and the plot
output_layout = widgets.Layout(border='1px solid black', padding='5px', overflow='auto')
output_area = widgets.Output(layout=output_layout)

# Button Click Handler

def on_run_button_clicked(b):
    """
    Runs the analysis with the current widget values and renders everything in output_area.

    Args:
        b: The button instance handed over by ipywidgets on click (not used).
    """
    with output_area:
        clear_output(wait=True)
        print("Starting Analysis...\n")
        try:
            # ref_select.value is either a codon usage dictionary or the NO_COMP_VALUE
            # placeholder; ref_select.label is the matching display name.
            analyze_sequence(
                project_name.value,    # plot title
                seq_input.value,       # raw sequence text
                fontsize_input.value,  # label size of the dominant codon
                ref_select.value,
                ref_select.label,
            )
            print("\nAnalysis Complete.")
        except Exception as exc:
            # Report the failure inside the output area instead of losing it
            print(f"\nAn unexpected error occurred during analysis: {exc}")
            import traceback
            traceback.print_exc()

# Showtime! Register the handler so that clicking the button runs the analysis
run_button.on_click(on_run_button_clicked)

## Output

In [None]:
# Display Widgets

# Stack the input controls vertically, in the order they should appear
control_panel = [project_name, seq_input, ref_select, fontsize_input, run_button]
input_widgets = widgets.VBox(control_panel)

# Show the controls first and the analysis output below them
display(input_widgets, output_area)