In [15]:
# Cell 1: Essential Imports, Global Constants, Initialisation of Data Stores, and Matplotlib Backend Setup

# I'm placing all my required imports here at the very beginning of the script.
# This ensures all modules are available before I start defining classes and functions.
import numpy as np             # My general-purpose numerical computing library, essential for array operations and calculations.
import random                  # A useful module for generating random numbers, crucial for my simulations of genetic processes.
import itertools               # For creating efficient iterators, such as the colour cycling for plotting different generations.
import matplotlib.pyplot as plt # This is the core plotting library I'm using to create all my visualisations of hybrid index and heterozygosity.
from matplotlib.lines import Line2D # Specifically importing Line2D from matplotlib.lines, as I need it to create custom legend entries for my plots.
from typing import List, Tuple, Dict # I use type hints like List and Tuple to make my code more readable and robust.
import os                      # Essential for interacting with the operating system, particularly for managing file paths.
import matplotlib              # The main Matplotlib module itself, needed here to explicitly set the backend for interactive plots.
import re                      # The 're' module handles regular expressions, which I'll use for parsing generation names.
import mplcursors              # This is a fantastic third-party library that adds interactive data cursors to my Matplotlib plots.
import csv
import time

# --- IMPORTANT: I'm setting the Matplotlib backend for interactivity here ---
# This block is crucial for ensuring my plots are interactive within environments like Jupyter.
# I'm attempting to use 'QtAgg' first as it's often preferred, then falling back to 'TkAgg'.
try:
    matplotlib.use('QtAgg')
    print("I'm using Matplotlib backend: QtAgg")
except ImportError:
    matplotlib.use('TkAgg')
    print("I'm falling back to Matplotlib backend: TkAgg")
except Exception as e:
    print(f"I couldn't set an interactive backend, an error occurred: {e}. Falling back to Matplotlib's default.")
    pass # I'll let Matplotlib choose its default if both preferred backends fail.


# I'm defining constants for the alleles used in my simulation. This makes the code
# more readable and easier to modify if I ever change the allele representation.
MAGENTA = 'M'  # allele label
YELLOW = 'Y'   # allele label

# Mapping allele labels to numeric values:
allele_to_num = {MAGENTA: 1, YELLOW: 0}

def genotype_to_numeric(genotype: list[str]) -> list[int]:
    """Convert list of allele letters to numeric values."""
    return [allele_to_num.get(allele, -1) for allele in genotype]

# I'm setting up global lists to store data generated throughout the simulation.
# This allows me to collect information from various parts of the process.
genetic_data_records = []         # This list will store detailed genotype information for all individuals.
chromatid_recombination_records = [] # This list will log details of recombination events for each chromatid.

# I need a global counter to assign a unique ID to each individual created in the simulation.
# It ensures every simulated creature has a distinct identifier.
individual_id_counter = 1

I'm using Matplotlib backend: QtAgg


In [16]:
# Cell 2: Genetic Data Structures - Chromosomes and Individuals

# This cell defines the fundamental data structures I'm using to represent genetic material
# and individuals within my simulation.

class Chromosome:
    def __init__(self, alleles: List[str]):
        """
        I use this class to represent a single chromosome strand. It's essentially a linear sequence of alleles.

        Args:
            alleles (List[str]): A list of strings, where each string represents an allele at a locus
                                 (e.g., ['M', 'M', 'Y', ...]).
        """
        self.alleles = alleles # Stores the sequence of alleles for this chromosome.

    def __repr__(self) -> str:
        """
        A string representation for the Chromosome object, useful for debugging.
        It shows a snippet of the alleles for brevity.
        """
        # I take the first 10 alleles to provide a quick look without printing the whole chromosome.
        snippet = ''.join(self.alleles[:10]) if self.alleles else ''
        return f"Chr({snippet}...)" # Format: Chr(MMY...)


class DiploidChromosomePair:
    def __init__(self, chromatid1: Chromosome, chromatid2: Chromosome):
        """
        This class represents a pair of homologous chromosomes, as found in a diploid organism.
        Each chromosome in the pair is an instance of my 'Chromosome' class.

        Args:
            chromatid1 (Chromosome): The first chromatid (homolog) in the pair.
            chromatid2 (Chromosome): The second chromatid (homolog) in the pair.
        """
        self.chromatid1 = chromatid1 # Stores the first chromosome of the pair.
        self.chromatid2 = chromatid2 # Stores the second chromosome of the pair.

    def __repr__(self) -> str:
        """
        A string representation for the DiploidChromosomePair, showing both chromatids.
        """
        # I format it to show each chromatid on a new line for better readability.
        return f"Pair(\n  {self.chromatid1}\n  {self.chromatid2}\n)"


class Individual:
    def __init__(self, num_chromosomes: int, num_loci_per_chromosome: int):
        global individual_id_counter
        self.id = individual_id_counter
        individual_id_counter += 1

        self.num_chromosomes = num_chromosomes
        self.num_loci_per_chromosome = num_loci_per_chromosome
        self.diploid_chromosome_pairs: List[DiploidChromosomePair] = []

    def get_all_numeric_genotypes(self) -> List[int]:
        all_numeric = []
        for pair in self.diploid_chromosome_pairs:
            alleles_chromatid1 = pair.chromatid1.alleles
            alleles_chromatid2 = pair.chromatid2.alleles
            for i in range(self.num_loci_per_chromosome):
                pair_sorted = sorted([alleles_chromatid1[i], alleles_chromatid2[i]])
                if pair_sorted == [MAGENTA, MAGENTA]:
                    all_numeric.append(2)
                elif pair_sorted == [YELLOW, YELLOW]:
                    all_numeric.append(0)
                else:
                    all_numeric.append(1)
        return all_numeric

    def calculate_hybrid_index(self) -> float:
        all_numeric_genotypes = self.get_all_numeric_genotypes()
        total_loci = len(all_numeric_genotypes)
        if total_loci == 0:
            return 0.0
        total_possible_m_alleles = 2 * total_loci
        sum_of_m_alleles = sum(all_numeric_genotypes)
        return sum_of_m_alleles / total_possible_m_alleles

    def calculate_heterozygosity(self) -> float:
        all_numeric_genotypes = self.get_all_numeric_genotypes()
        total_loci = len(all_numeric_genotypes)
        if total_loci == 0:
            return 0.0
        heterozygous_count = all_numeric_genotypes.count(1)
        return heterozygous_count / total_loci

    def get_chromatid_block_data(self):
        all_chromatid_data = []
        chromatid_labels = ['A', 'B']

        for chr_idx, diploid_pair in enumerate(self.diploid_chromosome_pairs):
            chromatids_in_pair = [diploid_pair.chromatid1, diploid_pair.chromatid2]

            for i, chromatid in enumerate(chromatids_in_pair):
                chromatid_alleles = chromatid.alleles
                junctions, lengths, alleles = self._analyse_single_chromatid(chromatid_alleles)

                all_chromatid_data.append({
                    'individual_id': self.id,
                    'diploid_chr_id': chr_idx + 1,
                    'chromatid_in_pair': chromatid_labels[i],
                    'total_junctions': junctions,
                    'block_lengths': lengths,
                    'block_alleles': alleles
                })
        return all_chromatid_data

    def _analyse_single_chromatid(self, alleles: List[str]) -> Tuple[int, List[int], List[str]]:
        if not alleles:
            return 0, [], []

        block_lengths = []
        block_alleles = []

        for allele, group in itertools.groupby(alleles):
            group_list = list(group)
            block_lengths.append(len(group_list))
            block_alleles.append(allele)

        junctions = len(block_lengths) - 1 if block_lengths else 0
        return junctions, block_lengths, block_alleles

In [17]:
def meiosis_with_recombination(
    diploid_pair: DiploidChromosomePair,
    recomb_event_probabilities: dict,
    recomb_probabilities: list
) -> Chromosome:
    """
    Simulates meiosis with a variable number of recombination events for one chromosome pair.

    Args:
        diploid_pair (DiploidChromosomePair): The pair of homologous chromatids.
        recomb_event_probabilities (dict): Probability for 0, 1, or 2 recombination events, e.g., {0: 0.1, 1: 0.85, 2: 0.05}.
        recomb_probabilities (list): Position-dependent probabilities for recombination along loci (length = loci per chromosome + 1).

    Returns:
        Chromosome: A recombinant chromosome after meiosis.
    """
    loci_len = len(diploid_pair.chromatid1.alleles)

    # Decide how many recombination events happen (0, 1, or 2)
    n_events = random.choices(
        population=[0, 1, 2],
        weights=[recomb_event_probabilities.get(i, 0) for i in [0, 1, 2]],
        k=1
    )[0]

    possible_positions = list(range(1, loci_len))  # possible breakpoints between loci
    chosen_positions = []

    if n_events > 0:
        # Use recomb_probabilities to weight positions (skip index 0)
        weights = recomb_probabilities[1:loci_len]
        weights_sum = sum(weights)

        if weights_sum == 0:
            # If all weights zero, choose breakpoints randomly without weights
            chosen_positions = sorted(random.sample(possible_positions, n_events))
        else:
            # Weighted random sampling without replacement
            chosen_positions = []
            while len(chosen_positions) < n_events:
                pos = random.choices(possible_positions, weights=weights, k=1)[0]
                if pos not in chosen_positions:
                    chosen_positions.append(pos)
            chosen_positions.sort()

        parent1_alleles = diploid_pair.chromatid1.alleles
        parent2_alleles = diploid_pair.chromatid2.alleles

        recombinant_alleles = []
        last_pos = 0
        source = 0  # 0 means take from parent1, 1 means from parent2
        breakpoints = chosen_positions + [loci_len]

        for pos in breakpoints:
            if source == 0:
                recombinant_alleles.extend(parent1_alleles[last_pos:pos])
            else:
                recombinant_alleles.extend(parent2_alleles[last_pos:pos])

            source = 1 - source  # switch source
            last_pos = pos

        return Chromosome(recombinant_alleles)

    else:
        # No recombination: just copy one chromatid (e.g. chromatid1)
        return Chromosome(diploid_pair.chromatid1.alleles)


In [18]:
# Cell 4: Data Recording Functions
# This cell contains helper functions I use to record the detailed genetic and recombination
# data of individuals into my global lists, as well as utilities for extracting key metrics.

def record_individual_genome(individual: Individual, generation_label: str):
    """
    I use this function to record the full genotype of each locus for every chromosome pair
    within a given individual. This data is then appended to the global `genetic_data_records` list.

    Each entry in `genetic_data_records` is a dictionary providing:
      - 'generation': The specific generation label (e.g., 'F2', 'BC1A').
      - 'individual_id': The unique identifier for the individual.
      - 'diploid_chr_id': The chromosome pair number (1-based for clarity).
      - 'locus_position': The position index of the locus along the chromosome (0-based).
      - 'genotype': A string representing the alleles at this locus, e.g., 'M|Y'.

    Args:
        individual (Individual): The 'Individual' object whose genome I want to record.
        generation_label (str): A string label to associate with the current generation.
    """
    # Iterate through each diploid chromosome pair of the individual.
    for chr_idx, pair in enumerate(individual.diploid_chromosome_pairs):
        # Then, iterate through each locus on the chromosome.
        for locus_idx in range(individual.num_loci_per_chromosome):
            # Extract the alleles from both chromatids at the current locus.
            allele_a = pair.chromatid1.alleles[locus_idx]
            allele_b = pair.chromatid2.alleles[locus_idx]
            # Form a standard genotype string (e.g., "M|Y").
            genotype_str = f"{allele_a}|{allele_b}"
            
            # Append a dictionary containing all the relevant details for this locus to my global records.
            genetic_data_records.append({
                'generation': generation_label,
                'individual_id': individual.id,
                'diploid_chr_id': chr_idx + 1, # Use 1-based indexing for chromosome ID.
                'locus_position': locus_idx,
                'genotype': genotype_str
            })


def record_chromatid_recombination(individual: Individual, generation_label: str):
    """
    This function records the detailed recombination block data for an individual's chromatids.
    It calls the individual's own method to get the block data, then enriches it with the
    generation label before appending it to the global `chromatid_recombination_records` list.

    Args:
        individual (Individual): The 'Individual' object whose recombination data I want to record.
        generation_label (str): A string label to associate with the current generation.
    """
    # I get the recombination block data for all chromatids of the individual.
    chromatid_data = individual.get_chromatid_block_data()
    # For each record (which represents one chromatid's data), I add the generation label.
    for record in chromatid_data:
        record['generation'] = generation_label
        # Then, I append the enriched record to my global recombination records list.
        chromatid_recombination_records.append(record)

In [19]:
# Cell 5: Population Creation Functions and Statistics Utility

# This cell provides helper functions for setting up initial populations
# and for calculating summary statistics of any given population.

def create_pure_individual(num_chromosomes: int, num_loci_per_chr: int, allele_type: str) -> Individual:
    """
    I use this function to create a single individual that is 'pure' for a specific allele type.
    This means all its loci across all its chromosomes will be homozygous for the given allele.
    This is typically used for my initial parental populations (P_A or P_B).

    Args:
        num_chromosomes (int): The total number of diploid chromosome pairs for this individual.
        num_loci_per_chr (int): The number of genetic loci on each chromosome.
        allele_type (str): The allele ('M' or 'Y') that will fill all loci.

    Returns:
        Individual: A newly created 'Individual' object, homozygous for the specified allele at every locus.
    """
    # I create a new Individual instance with the specified chromosome and locus counts.
    individual = Individual(num_chromosomes, num_loci_per_chr)
    # For each chromosome pair the individual has:
    for _ in range(num_chromosomes):
        # I create a list of alleles where every locus is filled with the specified 'allele_type'.
        chromosome_alleles = [allele_type] * num_loci_per_chr
        # Both chromatids in the pair will be identical, ensuring homozygosity.
        chromatid1 = Chromosome(chromosome_alleles[:]) # I use slicing [:] to create a new list copy.
        chromatid2 = Chromosome(chromosome_alleles[:]) # This prevents unexpected modifications if chromosome_alleles is changed later.
        # I add the newly formed homozygous diploid chromosome pair to the individual.
        individual.diploid_chromosome_pairs.append(DiploidChromosomePair(chromatid1, chromatid2))
    return individual # Return the fully constructed pure individual.


def create_pure_population(
    num_individuals: int,
    num_chromosomes: int,
    num_loci_per_chr: int,
    allele_type: str
) -> List[Individual]:
    """
    This function allows me to easily create an entire population of 'pure' individuals.
    All individuals in this population will be homozygous for the same specified allele type.

    Args:
        num_individuals (int): The number of individuals I want in this pure population.
        num_chromosomes (int): The number of chromosomes each individual in the population will have.
        num_loci_per_chr (int): The number of loci on each chromosome for individuals in this population.
        allele_type (str): The allele ('M' or 'Y') that all individuals in this population will be homozygous for.

    Returns:
        List[Individual]: A list containing all the newly created pure individuals.
    """
    # I use a list comprehension to efficiently create multiple pure individuals
    # by repeatedly calling my 'create_pure_individual' function.
    return [create_pure_individual(num_chromosomes, num_loci_per_chr, allele_type) for _ in range(num_individuals)]


def create_F1_population(
    pure_pop_A: List[Individual],
    pure_pop_B: List[Individual],
    recomb_event_probabilities: dict,
    recomb_probabilities: List[float]
) -> List[Individual]:
    """
    I use this function to generate the first filial (F1) hybrid population.
    This is achieved by crossing paired individuals from two pure parental populations.
    Each F1 individual will receive one recombinant chromatid from a parent from 'pure_pop_A'
    and one from a parent from 'pure_pop_B'.

    Args:
        pure_pop_A (List[Individual]): The pure parental population (e.g., all 'M' alleles).
        pure_pop_B (List[Individual]): The other pure parental population (e.g., all 'Y' alleles).
        recomb_event_probabilities (dict): The probability distribution for the number of recombination events per chromosome.
        recomb_probabilities (List[float]): The position-dependent probabilities for recombination along chromosomes.

    Raises:
        ValueError: If the input parental populations are not of the same size, as pairing
                    for crosses would be ambiguous.

    Returns:
        List[Individual]: A list containing all the newly created F1 hybrid individuals.
    """
    # I first check that the parental populations are of equal size, which is necessary for paired crosses.
    if len(pure_pop_A) != len(pure_pop_B):
        raise ValueError("Error: Pure populations must be the same size to create F1 population via paired crosses.")

    f1_population = [] # Initialise an empty list to store the F1 individuals.

    # I iterate through the parental populations, pairing individuals by their index.
    for i in range(len(pure_pop_A)):
        parent_A = pure_pop_A[i] # Get one parent from population A.
        parent_B = pure_pop_B[i] # Get the corresponding parent from population B.

        # I create a new 'Individual' instance for the F1 offspring.
        # It will have the same chromosome and locus structure as its parents.
        child = Individual(parent_A.num_chromosomes, parent_A.num_loci_per_chromosome)
        child.diploid_chromosome_pairs = [] # I explicitly clear this list, though it should be empty on new creation.

        # For each chromosome pair, I simulate gamete formation and combine them for the offspring.
        for chr_idx in range(parent_A.num_chromosomes):
            # I get the specific diploid chromosome pair from each parent.
            chr_A_pair = parent_A.diploid_chromosome_pairs[chr_idx]
            chr_B_pair = parent_B.diploid_chromosome_pairs[chr_idx]

            # I simulate meiosis to get one recombinant haploid chromatid from each parent.
            haploid_A = meiosis_with_recombination(chr_A_pair, recomb_event_probabilities, recomb_probabilities)
            haploid_B = meiosis_with_recombination(chr_B_pair, recomb_event_probabilities, recomb_probabilities)

            # I then combine these two haploid chromatids to form a new diploid pair for the F1 child.
            child.diploid_chromosome_pairs.append(DiploidChromosomePair(haploid_A, haploid_B))

        # After creating the F1 child, I immediately record its genetic and recombination data.
        record_individual_genome(child, 'F1')
        record_chromatid_recombination(child, 'F1')

        f1_population.append(child) # Add the new F1 individual to the list.
    return f1_population # Return the complete F1 population.


def population_stats(pop: List[Individual]) -> dict:
    """
    I use this helper function to calculate key summary statistics for a given population of 'Individual' objects.
    This helps me quickly understand the genetic composition of each generation.

    Args:
        pop (List[Individual]): A list of individuals in the population.

    Returns:
        dict: Summary stats including mean and std deviation of hybrid index (HI),
              mean and std deviation of heterozygosity (HET), and population size.
    """
    his = [ind.calculate_hybrid_index() for ind in pop]   # I calculate the Hybrid Index for each individual.
    hets = [ind.calculate_heterozygosity() for ind in pop] # I calculate the Heterozygosity for each individual.

    # I return a dictionary with the calculated statistics. I use conditional checks (if his/hets else 0)
    # to prevent errors if a population happens to be empty.
    return {
        'mean_HI': np.mean(his) if his else 0,
        'std_HI': np.std(his) if his else 0,
        'mean_HET': np.mean(hets) if hets else 0,
        'std_HET': np.std(hets) if hets else 0,
        'count': len(pop)
    }

In [20]:
# Cell 6: Breeding Plan Functions

# This cell contains functions to systematically build my breeding plans,
# which define how different generations will be crossed.

def build_forward_generations(base_name: str, start_gen: int, end_gen: int) -> List[Tuple[str, str, str]]:
    """
    I use this function to create a breeding plan for sequential forward generations (e.g., F1, F2, F3...).
    The process starts from 'start_gen' and goes up to 'end_gen' (inclusive).
    The very first generation (specified by 'start_gen') is always a cross between two pure parental populations ('P_A' and 'P_B').
    Subsequent generations in this forward sequence are then bred by crossing individuals from the *previous* generation amongst themselves.

    Args:
        base_name (str): The prefix for the generation names (e.g., "F" for Filial generations).
        start_gen (int): The starting generation number (e.g., 1 for F1).
        end_gen (int): The final generation number to include (e.g., 5 for F5).

    Returns:
        List[Tuple[str, str, str]]: A list of tuples, where each tuple represents a planned cross:
                                    (new_generation_label, parent1_label, parent2_label).
    """
    plan = [] # Initialise an empty list to store my breeding plan.
    for i in range(start_gen, end_gen + 1):
        current_gen_label = f"{base_name}{i}" # Construct the label for the current generation, e.g., "F1", "F2".
        if i == start_gen:
            # For the first generation in the sequence, I'm crossing the pure parental populations.
            plan.append((current_gen_label, 'P_A', 'P_B'))
        else:
            # For subsequent generations, I cross individuals from the previous generation with themselves.
            previous_gen_label = f"{base_name}{i-1}"
            plan.append((current_gen_label, previous_gen_label, previous_gen_label))
    return plan # Return the complete breeding plan.


def build_backcross_generations(
    base_name: str,
    filial_bases: List[str],
    pure_pop_label: str
) -> List[Tuple[str, str, str]]:
    """
    This function helps me build a backcross generation plan. Backcrossing involves crossing a hybrid
    individual back to one of its parental types.

    Args:
        base_name (str): The prefix for backcross generation names (e.g., "BC" for Backcross).
        filial_bases (List[str]): A list of filial generation names (e.g., ['F1', 'F2']) that I want to backcross.
        pure_pop_label (str): The label of the pure parental population (e.g., "P_A" or "P_B")
                              that the filial generations will be crossed with.

    Returns:
        List[Tuple[str, str, str]]: A list of backcross generation crosses, where each tuple is:
                                    (backcross_label, filial_parent_label, pure_population_parent_label).
                                    Example: [('BC1A', 'F1', 'P_A'), ('BC2A', 'F2', 'P_A'), ...]
    """
    plan = [] # Initialise an empty list to hold my backcross plans.

    # I iterate through the filial generation names, using 'enumerate' to get an index starting from 1.
    for i, filial_gen_label in enumerate(filial_bases, start=1):
        # I construct the backcross generation label, e.g., "BC1A", "BC2B".
        # The last character of the 'pure_pop_label' ('A' or 'B') is appended for clarity.
        backcross_label = f"{base_name}{i}{pure_pop_label[-1]}"

        # I append the planned cross to the list: (new backcross generation, filial parent, pure parent).
        plan.append((backcross_label, filial_gen_label, pure_pop_label))

    return plan # Return the complete list of backcross generation plans.

In [21]:
# Cell 7: Simulating Genetic Crosses

# This cell contains the primary function I use to simulate a genetic cross between two populations,
# generating a new set of offspring.

def run_genetic_cross(
    parents_pop_A: List[Individual],
    parents_pop_B: List[Individual],
    num_offspring_to_create: int,
    generation_label: str,
    num_chromosomes_for_offspring: int,
    recomb_event_probabilities: dict,
    recomb_probabilities: List[float]
) -> List[Individual]:
    """
    I use this function to simulate a genetic cross, where individuals from two distinct parental
    populations (pop_A and pop_B) mate to produce a specified number of offspring.

    Args:
        parents_pop_A (List[Individual]): The first group of parental individuals available for mating.
        parents_pop_B (List[Individual]): The second group of parental individuals available for mating.
        num_offspring_to_create (int): The total number of new offspring individuals I want to generate.
        generation_label (str): A descriptive label for the new generation being created (e.g., "F2", "BC1A").
        num_chromosomes_for_offspring (int): The number of diploid chromosome pairs each new offspring will have.
        recomb_event_probabilities (dict): A probability distribution that dictates how many
                                           recombination events (crossovers) occur on a chromosome during meiosis.
                                           For example: {0: 0.1, 1: 0.85, 2: 0.05} means 10% chance of 0 crossovers,
                                           85% chance of 1, and 5% chance of 2.
        recomb_probabilities (List[float]): A list or array of probabilities for recombination occurring
                                           at each specific locus position along a chromosome.

    Returns:
        List[Individual]: A list containing all the newly created offspring individuals from this cross.

    My Procedure:
    1. For each offspring I need to create:
       a. I randomly select one parent from `parents_pop_A` and another from `parents_pop_B`.
       b. I then create a new `Individual` object to represent the child, setting its chromosome and locus counts.
       c. For each of the specified number of chromosomes:
          i. I retrieve the corresponding diploid chromosome pair from each of the selected parents.
          ii. I simulate meiosis with recombination for *each* parental chromosome pair,
              generating a single recombinant haploid chromatid from each parent.
          iii. These two recombinant haploid chromatids (one from each parent) are then combined
               to form a new diploid chromosome pair for the offspring.
       d. Finally, I record the detailed genotype and recombination data for this new offspring
          using my helper functions, and add the offspring to my results list.
    """
    offspring = [] # Initialise an empty list to store the new individuals.

    # I loop for the specified number of offspring I want to produce.
    for _ in range(num_offspring_to_create):
        # I randomly select one parent from each of the two parental populations.
        parent_A = random.choice(parents_pop_A)
        parent_B = random.choice(parents_pop_B)

        # I create a new Individual instance for the child. It inherits the number of loci
        # per chromosome from its parents (assuming they are consistent).
        child = Individual(num_chromosomes_for_offspring, parent_A.num_loci_per_chromosome)

        # Now, for each chromosome pair the child will have, I simulate the genetic inheritance.
        for chr_idx in range(num_chromosomes_for_offspring):
            # I get the specific diploid chromosome pair from each parent.
            diploid_pair_A = parent_A.diploid_chromosome_pairs[chr_idx]
            diploid_pair_B = parent_B.diploid_chromosome_pairs[chr_idx]

            # I call my 'meiosis_with_recombination' function to generate a recombinant haploid chromatid
            # from each parent's chromosome pair. This simulates the formation of gametes.
            haploid_from_A = meiosis_with_recombination(diploid_pair_A, recomb_event_probabilities, recomb_probabilities)
            haploid_from_B = meiosis_with_recombination(diploid_pair_B, recomb_event_probabilities, recomb_probabilities)

            # I then combine these two haploid chromatids (one from each parent) to form a new diploid
            # chromosome pair for the child.
            child.diploid_chromosome_pairs.append(DiploidChromosomePair(haploid_from_A, haploid_from_B))

        # Once the child's full set of chromosomes is assembled, I record its genetic data
        # and recombination events using my global recording functions.
        record_individual_genome(child, generation_label)
        record_chromatid_recombination(child, generation_label)

        # Finally, I add the newly created child to my list of offspring for this cross.
        offspring.append(child)

    return offspring # I return the complete list of offspring.

In [22]:
def calculate_hi_het_for_population(population: List[Individual]) -> List[Dict[str, float]]:
    data = []
    for indiv in population:
        hi = indiv.calculate_hybrid_index()
        het = indiv.calculate_heterozygosity()
        data.append({'id': indiv.id, 'HI': hi, 'HET': het})
    return data

def simulate_generations(
    initial_pop_A: list = None,
    initial_pop_B: list = None,
    generation_plan: list = None,
    num_offspring_per_cross: int = 2,
    num_chromosomes: int = 2,
    recomb_event_probabilities: dict = None,
    recomb_probabilities: list = None,
    existing_populations: dict = None,
    verbose: bool = False,
):
    # Initialise populations dict (existing or new)
    populations = existing_populations if existing_populations is not None else {}

    # NEW: Initialise dict to store HI and HET data for each generation
    all_generations_data = {}

    # Add initial pure populations if provided, and record HI/HET for them
    if initial_pop_A is not None and 'P_A' not in populations:
        populations['P_A'] = initial_pop_A
        for ind in initial_pop_A:
            record_individual_genome(ind, 'P_A')
            record_chromatid_recombination(ind, 'P_A')
        all_generations_data['P_A'] = calculate_hi_het_for_population(initial_pop_A)

    if initial_pop_B is not None and 'P_B' not in populations:
        populations['P_B'] = initial_pop_B
        for ind in initial_pop_B:
            record_individual_genome(ind, 'P_B')
            record_chromatid_recombination(ind, 'P_B')
        all_generations_data['P_B'] = calculate_hi_het_for_population(initial_pop_B)

    # Check for generation plan
    if generation_plan is None:
        print("Warning: No generation plan provided. Returning existing populations.")
        return populations, all_generations_data

    # Loop over planned generations to simulate crosses
    for gen_info in generation_plan:
        if len(gen_info) == 1:
            continue  # Skip if only generation label is given (no cross info)

        gen_name = gen_info[0]
        parents_names = gen_info[1:]

        # Check parents exist
        for p_name in parents_names:
            if p_name not in populations:
                raise ValueError(f"Parent population '{p_name}' not found for generation '{gen_name}'.")

        parents_pop_A_for_cross = populations[parents_names[0]]
        parents_pop_B_for_cross = populations[parents_names[1]]

        # Run the cross to get new generation
        new_pop = run_genetic_cross(
            parents_pop_A_for_cross,
            parents_pop_B_for_cross,
            num_offspring_per_cross,
            gen_name,
            num_chromosomes,
            recomb_event_probabilities,
            recomb_probabilities
        )

        # Store new population
        populations[gen_name] = new_pop

        # NEW: calculate and store HI/HET for this generation
        all_generations_data[gen_name] = calculate_hi_het_for_population(new_pop)

        # Verbose output
        if verbose:
            stats = population_stats(new_pop)
            print(f"{gen_name} created from parents {parents_names[0]} and {parents_names[1]} | "
                  f"Count: {len(new_pop)} | Mean HI: {stats['mean_HI']:.3f} (±{stats['std_HI']:.3f}), "
                  f"Mean HET: {stats['mean_HET']:.3f} (±{stats['std_HET']:.3f})")
            print(f"Added '{gen_name}' to populations. Current population keys: {list(populations.keys())}")

    return populations, all_generations_data

In [23]:
# Cell 9: Main Simulation Execution

# This cell defines all the parameters for my simulation and then orchestrates
# the multi-generational genetic crosses using the functions defined previously.

# --- Simulation Parameters ---
num_individuals_per_pure_pop = 8   # I'm setting the number of individuals in each pure parental population (P_A and P_B).
num_offspring_per_cross = 8        # I specify how many offspring each mating event should produce.
num_chromosomes = 10               # I'm defining the number of diploid chromosome pairs each individual will have.
num_loci_per_chr = 100             # I'm specifying the number of genetic loci on each chromosome.

# --- Recombination Probabilities ---
# This dictionary defines the probability of 0, 1, or 2 recombination events (crossovers) per chromosome.
# Here, I've set it to always have exactly one recombination event (probability 1 for 1 event, 0 for others).
recomb_event_probabilities = {0: 0, 1: 1, 2: 0}
# This list defines the probability of recombination at each locus.
# I'm setting it to be uniform across all loci (1) except for the very first position (0),
# meaning recombination can occur anywhere along the chromosome after the first locus.
recomb_probabilities = [0] + [1]*(num_loci_per_chr - 1) # Uniform recombination across loci.

# --- 1. Create Initial Pure Populations ---
# I'm creating the initial pure parental populations (P_A and P_B).
# P_A individuals are homozygous for the MAGENTA allele at all loci.
pure_pop_A = create_pure_population(num_individuals_per_pure_pop, num_chromosomes, num_loci_per_chr, MAGENTA)
# P_B individuals are homozygous for the YELLOW allele at all loci.
pure_pop_B = create_pure_population(num_individuals_per_pure_pop, num_chromosomes, num_loci_per_chr, YELLOW)

# I'm initialising my main 'populations' dictionary with the pure parental populations.
# These will be explicitly recorded by 'simulate_generations' if it's the first time seeing them.
populations = {
    'P_A': pure_pop_A,
    'P_B': pure_pop_B
}

# --- 2. Define Breeding Plans ---
# a. Forward Generations Plan (e.g., F1, F2, ..., F20)
# I'm setting up a plan to simulate 20 filial generations.
forward_plan = build_forward_generations("F", 1, 20)

# b. Backcross Generations Plan (e.g., BC1A, BC1B, etc.)
# I need to specify which filial generations from my forward crosses I'll use for backcrossing.
filial_generations_for_backcross = ['F1', 'F2', 'F3', 'F4']

# I'm building a plan to backcross these filial generations to pure population 'P_A'.
backcross_plan_A = build_backcross_generations("BC", filial_generations_for_backcross, "P_A")

# I'm also building a plan to backcross these same filial generations to pure population 'P_B'.
backcross_plan_B = build_backcross_generations("BC", filial_generations_for_backcross, "P_B")

# I combine all the individual plans into one comprehensive breeding plan.
# The order here is important as 'simulate_generations' processes them sequentially.
full_breeding_plan = forward_plan + backcross_plan_A + backcross_plan_B

print("--- Full Breeding Plan ---")
for cross in full_breeding_plan:
    print(cross)
print("--------------------------")

# --- 3. Run the Simulation ---
# I'm now initiating the main genetic simulation.
# I pass the initial pure populations, the full breeding plan, and all relevant parameters.
# 'verbose=True' will print summary statistics for each generation as it's created.

# Prepare your params dict first (make sure it includes everything needed)
params = {
    'initial_pop_A': pure_pop_A,
    'initial_pop_B': pure_pop_B,
    'generation_plan': full_breeding_plan,
    'num_offspring_per_cross': num_offspring_per_cross,
    'num_chromosomes': num_chromosomes,
    'recomb_event_probabilities': recomb_event_probabilities,
    'recomb_probabilities': recomb_probabilities,
    'existing_populations': {},  # empty at start or pre-existing if you want
    'verbose': True
}

start_time = time.perf_counter()

# Run the simulation, passing the parameters with unpacking
populations, all_generations_data = simulate_generations(**params)

end_time = time.perf_counter()
print(f"Simulation completed in {end_time - start_time:.2f} seconds")

print("\n--- Simulation Complete ---")
print(f"Total generations simulated: {len(populations)}")
print(f"Final populations available: {list(populations.keys())}")


--- Full Breeding Plan ---
('F1', 'P_A', 'P_B')
('F2', 'F1', 'F1')
('F3', 'F2', 'F2')
('F4', 'F3', 'F3')
('F5', 'F4', 'F4')
('F6', 'F5', 'F5')
('F7', 'F6', 'F6')
('F8', 'F7', 'F7')
('F9', 'F8', 'F8')
('F10', 'F9', 'F9')
('F11', 'F10', 'F10')
('F12', 'F11', 'F11')
('F13', 'F12', 'F12')
('F14', 'F13', 'F13')
('F15', 'F14', 'F14')
('F16', 'F15', 'F15')
('F17', 'F16', 'F16')
('F18', 'F17', 'F17')
('F19', 'F18', 'F18')
('F20', 'F19', 'F19')
('BC1A', 'F1', 'P_A')
('BC2A', 'F2', 'P_A')
('BC3A', 'F3', 'P_A')
('BC4A', 'F4', 'P_A')
('BC1B', 'F1', 'P_B')
('BC2B', 'F2', 'P_B')
('BC3B', 'F3', 'P_B')
('BC4B', 'F4', 'P_B')
--------------------------
F1 created from parents P_A and P_B | Count: 8 | Mean HI: 0.500 (±0.000), Mean HET: 1.000 (±0.000)
Added 'F1' to populations. Current population keys: ['P_A', 'P_B', 'F1']
F2 created from parents F1 and F1 | Count: 8 | Mean HI: 0.483 (±0.088), Mean HET: 0.351 (±0.077)
Added 'F2' to populations. Current population keys: ['P_A', 'P_B', 'F1', 'F2']
F3 create

In [34]:
def plot_hi_het_triangle_all_generations(all_generations_data: dict, p_a_stats: List[float] = None,
                                         p_b_stats: List[float] = None, f1_stats: List[float] = None,
                                         save_filename: str = None):
    """
    I use this function to plot Hybrid Index (HI) versus Heterozygosity (HET) for all my
    simulated generations. The plot includes interactive hover functionality to inspect
    individual data points, and it can save the plot with a dynamically generated filename.

    Args:
        all_generations_data (dict): A dictionary where keys are generation names (e.g., 'F2', 'BC1A')
                                     and values are dictionaries containing 'hi' and 'het' lists.
        p_a_stats (List[float], optional): `[hi_value, het_value]` for Parent A.
        p_b_stats (List[float], optional): `[hi_value, het_value]` for Parent B.
        f1_stats (List[float], optional): `[hi_value, het_value]` for the F1 generation.
        save_filename (str, optional): Custom filename for saving the plot.
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(1.2)
    ax.spines['bottom'].set_linewidth(1.2)

    ax.set_xlabel("Hybrid Index (proportion M alleles)", fontsize=12)
    ax.set_ylabel("Heterozygosity (proportion heterozygous loci)", fontsize=12)

    colors = [
        'blue', 'red', 'green', 'purple', 'orange', 'brown', 'pink', 'teal',
        'darkviolet', 'magenta', 'cyan', 'lime', 'gold', 'navy', 'maroon',
        'indigo', 'chocolate', 'darkgreen', 'darkred', 'darkblue',
        'darkgoldenrod', 'darkslategray', 'cornflowerblue', 'olivedrab',
        'peru', 'rosybrown', 'salmon', 'seagreen', 'sienna', 'darkkhaki',
        'mediumorchid', 'lightcoral', 'dodgerblue', 'forestgreen',
        'saddlebrown', 'slategray', 'steelblue', 'tan', 'tomato', 'violet'
    ]
    color_cycler = itertools.cycle(colors)

    generation_styles = {}
    legend_elements = []

    scatter_artists = []
    scatter_data_map = {}

    def add_gen_style_and_legend(name: str, color: str, alpha: float, marker: str = 'o', s: int = 20):
        style = {'color': color, 'alpha': alpha, 'marker': marker, 's': s}
        generation_styles[name] = style
        legend_elements.append(Line2D([0], [0], marker=style['marker'], color='w',
                                      markerfacecolor=style['color'], markersize=8,
                                      alpha=style['alpha'], label=name))

    # Gather all generation names
    all_present_gen_names = set()
    if p_a_stats is not None:
        all_present_gen_names.add('P_A')
    if p_b_stats is not None:
        all_present_gen_names.add('P_B')
    if f1_stats is not None:
        all_present_gen_names.add('F1')
    all_present_gen_names.update(all_generations_data.keys())

    def temp_sort_key_for_assignment(label: str):
        if label == 'P_A': return (0, label)
        if label == 'P_B': return (1, label)
        if label == 'F1': return (2, label)
        match_f = re.match(r'F(\d+)', label)
        if match_f:
            return (3, int(match_f.group(1)))
        elif label.startswith('F'):
            return (3, float('inf'), label)
        match_bc = re.match(r'BC(\d+)([A-Z]?)', label)
        if match_bc:
            num_part = int(match_bc.group(1))
            suffix_part = match_bc.group(2) if match_bc.group(2) else ''
            return (4, num_part, suffix_part)
        elif label.startswith('BC'):
            return (4, float('inf'), label)
        return (5, label)

    sorted_present_gen_names = sorted(list(all_present_gen_names), key=temp_sort_key_for_assignment)

    for gen_name in sorted_present_gen_names:
        if gen_name == 'P_A':
            add_gen_style_and_legend('P_A', 'black', 1.0, marker='o', s=50)
        elif gen_name == 'P_B':
            add_gen_style_and_legend('P_B', 'grey', 1.0, marker='o', s=50)
        elif gen_name == 'F1':
            add_gen_style_and_legend('F1', 'purple', 1.0, marker='o', s=50)
        elif gen_name.startswith('F'):
            color = next(color_cycler)
            add_gen_style_and_legend(gen_name, color, 1.0)
        elif gen_name.startswith('BC'):
            color = next(color_cycler)
            add_gen_style_and_legend(gen_name, color, 0.7)
        else:
            color = next(color_cycler)
            add_gen_style_and_legend(gen_name, color, 0.5)

    # Plot anchor points first
    if p_a_stats is not None:
        style = generation_styles['P_A']
        sc = ax.scatter(p_a_stats[0], p_a_stats[1], color=style['color'], alpha=style['alpha'],
                        marker=style['marker'], s=style['s'], zorder=5)
        scatter_artists.append(sc)
        scatter_data_map[sc] = [{'gen_name': 'P_A', 'hi': p_a_stats[0], 'het': p_a_stats[1]}]

    if p_b_stats is not None:
        style = generation_styles['P_B']
        sc = ax.scatter(p_b_stats[0], p_b_stats[1], color=style['color'], alpha=style['alpha'],
                        marker=style['marker'], s=style['s'], zorder=5)
        scatter_artists.append(sc)
        scatter_data_map[sc] = [{'gen_name': 'P_B', 'hi': p_b_stats[0], 'het': p_b_stats[1]}]

    if f1_stats is not None:
        style = generation_styles['F1']
        sc = ax.scatter(f1_stats[0], f1_stats[1], color=style['color'], alpha=style['alpha'],
                        marker=style['marker'], s=style['s'], zorder=5)
        scatter_artists.append(sc)
        scatter_data_map[sc] = [{'gen_name': 'F1', 'hi': f1_stats[0], 'het': f1_stats[1]}]

    # Main plotting loop for all generations
    for gen_name, values in all_generations_data.items():
        style = generation_styles.get(gen_name, {'color': 'lightgrey', 'alpha': 0.5, 'marker': 'o', 's': 20})

        if isinstance(values, list):
            hi_values = [v['HI'] for v in values if v['HI'] is not None]
            het_values = [v['HET'] for v in values if v['HET'] is not None]
        else:
            hi_values = [values['HI']] if values['HI'] is not None else []
            het_values = [values['HET']] if values['HET'] is not None else []

        # Filter out any None values
        filtered_hi = []
        filtered_het = []
        point_data_for_current_gen = []

        for i in range(len(hi_values)):
            if hi_values[i] is not None and het_values[i] is not None:
                filtered_hi.append(hi_values[i])
                filtered_het.append(het_values[i])
                point_data_for_current_gen.append({'gen_name': gen_name, 'hi': hi_values[i], 'het': het_values[i]})

        if filtered_hi:
            sc = ax.scatter(filtered_hi, filtered_het,
                            color=style['color'],
                            alpha=style['alpha'],
                            marker=style['marker'],
                            s=style['s'])
            scatter_artists.append(sc)
            scatter_data_map[sc] = point_data_for_current_gen
        else:
            print(f"I'm skipping plotting for {gen_name} as no valid HI/Het data was found.")

    # Draw triangle edges
    triangle_edges = [
        [(0.0, 0.0), (0.5, 1.0)],
        [(0.5, 1.0), (1.0, 0.0)],
        [(0.0, 0.0), (1.0, 0.0)]
    ]
    for (x0, y0), (x1, y1) in triangle_edges:
        ax.plot([x0, x1], [y0, y1], linestyle='-', color='gray', linewidth=1.5, alpha=0.7)

    ax.set_xlim(-0.05, 1.05)
    ax.set_ylim(-0.05, 1.05)
    ax.set_aspect('equal', adjustable='box')

    # Add legend
    ax.legend(handles=legend_elements, loc='upper right', fontsize=10)

    # Hover annotation setup
    annot = ax.annotate("", xy=(0, 0), xytext=(15, 15), textcoords="offset points",
                        bbox=dict(boxstyle="round", fc="w"),
                        arrowprops=dict(arrowstyle="->"))
    annot.set_visible(False)

    def update_annot(scatter, ind):
        pos = scatter.get_offsets()[ind["ind"][0]]
        annot.xy = pos
        texts = []
        for idx in ind["ind"]:
            if idx < len(scatter_data_map[scatter]):
                d = scatter_data_map[scatter][idx]
                texts.append(f"{d['gen_name']}:\nHI = {d['hi']:.3f}\nHET = {d['het']:.3f}")
        annot.set_text("\n\n".join(texts))
        annot.get_bbox_patch().set_facecolor(scatter.get_facecolor()[0])
        annot.get_bbox_patch().set_alpha(0.8)

    def hover(event):
        visible = annot.get_visible()
        if event.inaxes == ax:
            for scatter in scatter_artists:
                cont, ind = scatter.contains(event)
                if cont:
                    update_annot(scatter, ind)
                    annot.set_visible(True)
                    fig.canvas.draw_idle()
                    return
        if visible:
            annot.set_visible(False)
            fig.canvas.draw_idle()

    fig.canvas.mpl_connect("motion_notify_event", hover)

    plt.tight_layout()
    plt.show()


In [35]:
# Call the plotting function (will show plot interactively)
plot_hi_het_triangle_all_generations(all_generations_data)

# Optionally, save the plot by specifying a filename
# plot_hi_het_triangle_all_generations(all_generations_data, p_a_stats, p_b_stats, f1_stats, save_filename='my_hybrid_plot.png')