# In [9]:
# %%
# Recombination Class
from __future__ import annotations  # Enable postponed evaluation of annotations for cleaner syntax

import random
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Union

import numpy as np
import pandas as pd


@dataclass
class Marker:
    """Represents a genetic marker on a chromosome.

    A marker is a named locus with two coordinates on its chromosome: one on
    the physical map and one on the genetic map.
    """
    # Identifier of the marker (the simulator names them 'marker_N').
    id: str
    # Location on the physical map, in the same units as
    # Chromosome.physical_length_bp.
    physical_position: float
    # Location on the genetic map, in the same units as
    # Chromosome.genetic_length_cM; crossover positions are compared to this.
    genetic_position: float


@dataclass
class Chromosome:
    """Represents a chromosome with its properties and markers.

    Holds the chromosome's identifier, its physical and genetic lengths, and
    the list of markers placed on it.
    """
    # Identifier used as the key into an Individual's per-chromosome dicts.
    id: int
    # Physical length of the chromosome (base pairs, per the field name).
    physical_length_bp: float
    # Genetic length of the chromosome (centiMorgans, per the field name).
    genetic_length_cM: float
    # Markers on this chromosome; a fresh empty list per instance by default.
    markers: List[Marker] = field(default_factory=list) # Default to an empty list


@dataclass
class Individual:
    """
    Represents a diploid individual with a pair of homologous chromosomes.

    Each individual carries two copies of each chromosome: one inherited from its
    biological mother and one from its biological father. These are stored in
    `maternal_chroms` and `paternal_chroms` respectively.

    Attributes:
        id (str): A unique identifier for the individual.
        maternal_chroms (Dict[int, Dict]): Stores data for chromosomes inherited from the
                                            individual's biological mother (keyed by chromosome ID).
                                            Each entry contains 'alleles' and 'positions'.
        paternal_chroms (Dict[int, Dict]): Stores data for chromosomes inherited from the
                                            individual's biological father (keyed by chromosome ID).
                                            Each entry contains 'alleles' and 'positions'.
    """
    id: str
    # maternal_chroms and paternal_chroms are initialized with default_factory for mutable defaults
    maternal_chroms: Dict[int, Dict[str, Union[List[int], List[float]]]] = field(default_factory=dict)
    paternal_chroms: Dict[int, Dict[str, Union[List[int], List[float]]]] = field(default_factory=dict)


    def produce_gamete(self,
                       chrom: Chromosome,
                       use_poisson: bool,
                       fixed_crossover_count: int,
                       custom_crossover_counts: Optional[list[int]],
                       custom_crossover_probs: Optional[list[float]],
                       detected_crossovers_list: list[Dict], # List to append detected crossovers
                       all_true_crossovers_list: list[Dict] # List to append all true crossovers
                       ) -> list[int]:
        """
        Simulates gamete production for a single chromosome, including recombination.

        The number of crossovers is drawn (Poisson, custom discrete distribution, or
        fixed count), their positions are sampled uniformly along the genetic map, and
        the gamete is built by walking the markers in list order while switching the
        source homolog at every crossover passed.

        Every sampled crossover is appended to `all_true_crossovers_list`, including
        crossovers that fall beyond the last marker (or on a chromosome without
        markers), which cannot affect any marker allele.

        Args:
            chrom (Chromosome): The chromosome for which to produce a gamete.
            use_poisson (bool): If True, use Poisson distribution for crossovers.
                                If False, use fixed_crossover_count or custom distribution.
            fixed_crossover_count (int): Fixed number of crossovers if use_poisson is False
                                         and no custom distribution is provided.
            custom_crossover_counts (Optional[list[int]]): Custom discrete counts for crossovers.
            custom_crossover_probs (Optional[list[float]]): Probabilities for custom_crossover_counts.
            detected_crossovers_list (list[Dict]): List to accumulate detected crossover info.
            all_true_crossovers_list (list[Dict]): List to accumulate all true crossover info.

        Returns:
            list[int]: A list of alleles representing the haploid gamete chromosome.

        Raises:
            KeyError: If this individual has no entry for `chrom.id`.
        """
        parental_maternal_alleles = self.maternal_chroms[chrom.id]['alleles']
        parental_paternal_alleles = self.paternal_chroms[chrom.id]['alleles']

        # Determine number of crossovers
        num_crossovers = 0
        if use_poisson and chrom.genetic_length_cM > 0:
            lambda_val = chrom.genetic_length_cM / 100.0 # Genetic length in Morgans
            num_crossovers = np.random.poisson(lambda_val)
        elif custom_crossover_counts and custom_crossover_probs:
            num_crossovers = np.random.choice(custom_crossover_counts, p=custom_crossover_probs)
        else: # Fixed crossover count
            num_crossovers = fixed_crossover_count

        # Generate crossover positions (genetic positions), sorted so they can be
        # consumed in a single pass over the markers.
        crossover_positions_cM: list[float] = []
        if num_crossovers > 0 and chrom.genetic_length_cM > 0:
            crossover_positions_cM = sorted(
                [np.random.uniform(0, chrom.genetic_length_cM) for _ in range(num_crossovers)]
            )

        # Randomly choose initial contributing chromosome (0 = maternal, 1 = paternal)
        current_source_chromosome_idx = random.randint(0, 1)

        # Start from a copy of the initial source so the parent's own lists are never mutated
        if current_source_chromosome_idx == 0:
            gamete_alleles = list(parental_maternal_alleles)
        else:
            gamete_alleles = list(parental_paternal_alleles)

        # Marker coordinates are loop-invariant: build the interpolation tables once
        marker_genetic_positions = [m.genetic_position for m in chrom.markers]
        marker_physical_positions = [m.physical_position for m in chrom.markers]

        def record_true_crossover(co_pos_cM: float) -> None:
            """Append one true crossover, interpolating its physical position from the markers."""
            if chrom.markers:
                physical_position = np.interp(co_pos_cM,
                                              marker_genetic_positions,
                                              marker_physical_positions)
            else:
                physical_position = 0.0
            all_true_crossovers_list.append({
                'chromosome': chrom.id,
                'genetic_position': co_pos_cM,
                'physical_position': physical_position,
                'parent_id': self.id # Track which parent this crossover happened in
            })

        # Keep track of the previous marker's allele in the gamete to detect switches
        previous_marker_allele_in_gamete: Optional[int] = None

        next_crossover_event_idx = 0
        for marker_idx, marker in enumerate(chrom.markers):
            # Consume every crossover located at or before this marker: record it
            # and switch the source homolog.
            while next_crossover_event_idx < len(crossover_positions_cM) and \
                  (crossover_positions_cM[next_crossover_event_idx] <= marker.genetic_position):
                record_true_crossover(crossover_positions_cM[next_crossover_event_idx])
                current_source_chromosome_idx = 1 - current_source_chromosome_idx
                next_crossover_event_idx += 1

            # Determine the allele for the current marker based on the *final* source
            if current_source_chromosome_idx == 0: # From this individual's maternal chromosome
                current_marker_allele = parental_maternal_alleles[marker_idx]
            else: # From this individual's paternal chromosome
                current_marker_allele = parental_paternal_alleles[marker_idx]

            gamete_alleles[marker_idx] = current_marker_allele

            # Detect observable crossovers: a switch in allele value between adjacent markers.
            # NOTE(review): this compares allele values, not source homologs, so a
            # parental haplotype whose alleles differ between adjacent markers is
            # logged as "detected" even without a crossover — confirm this is intended.
            if marker_idx > 0 and previous_marker_allele_in_gamete is not None:
                if current_marker_allele != previous_marker_allele_in_gamete:
                    detected_crossovers_list.append({
                        'chromosome': chrom.id,
                        'genetic_position_approx': marker.genetic_position,
                        'physical_position_approx': marker.physical_position,
                        'marker_interval_detection': f"{chrom.markers[marker_idx-1].id}-{marker.id}",
                        'parent_id': self.id # Track which parent this detected crossover came from
                    })

            previous_marker_allele_in_gamete = current_marker_allele

        # Crossovers past the last marker (or on a marker-less chromosome) still
        # happened; record them so the "all true" list is complete.
        for co_pos_cM in crossover_positions_cM[next_crossover_event_idx:]:
            record_true_crossover(co_pos_cM)

        return gamete_alleles


class RecombinationSimulator:
    """
    The main class for setting up and running genetic recombination simulations.

    It manages chromosome definitions, marker placement, allele frequency handling,
    and the simulation of gamete formation and offspring creation through crosses.
    """

    def __init__(self,
                 n_chromosomes: int = 4,
                 chromosome_sizes: list[float] | None = None,
                 n_markers: int = 10,
                 marker_distribution: str = 'uniform',  # 'uniform' or 'random'
                 use_poisson: bool = True,
                 use_centimorgan: bool = True,
                 allele_freq_file: str | None = None,
                 random_seed: int | None = None,
                 fixed_crossover_config: int | Dict[int, int] | None = None,
                 custom_crossover_counts: list[int] | None = None,
                 custom_crossover_probs: list[float] | None = None):

        self.n_chromosomes = n_chromosomes
        self.chromosome_sizes = chromosome_sizes or [1.0] * n_chromosomes
        self.n_markers = n_markers
        self.marker_distribution = marker_distribution
        self.use_poisson = use_poisson
        self.use_centimorgan = use_centimorgan
        self.allele_freq_file = allele_freq_file
        
        # Store fixed crossover configuration and parse it
        self.fixed_crossover_config = fixed_crossover_config
        self._fixed_crossover_uniform_count: int | None = None
        self._fixed_crossover_per_chrom_counts: Dict[int, int] | None = None

        if isinstance(self.fixed_crossover_config, int):
            self._fixed_crossover_uniform_count = self.fixed_crossover_config
            if self._fixed_crossover_uniform_count < 0:
                raise ValueError("Fixed uniform crossover count cannot be negative.")
        elif isinstance(self.fixed_crossover_config, dict):
            self._fixed_crossover_per_chrom_counts = self.fixed_crossover_config
            if any(count < 0 for count in self._fixed_crossover_per_chrom_counts.values()):
                raise ValueError("Fixed per-chromosome crossover counts cannot be negative.")
        elif self.fixed_crossover_config is not None:
            print("Warning: fixed_crossover_config was provided in an unsupported format. It will be ignored.")
            self.fixed_crossover_config = None

        # Store custom discrete distribution for crossover counts
        self._custom_crossover_counts = custom_crossover_counts
        self._custom_crossover_probs = custom_crossover_probs
        if self._custom_crossover_counts is not None and self._custom_crossover_probs is not None:
            if len(self._custom_crossover_counts) != len(self._custom_crossover_probs):
                raise ValueError("Custom crossover counts and probabilities lists must have the same length.")
            # Note: Probability normalization is now handled during argument parsing in main()


        # Set random seeds for reproducibility if provided
        self.random_seed = random_seed # Store the seed passed in
        if self.random_seed is not None:
            random.seed(self.random_seed)
            np.random.seed(self.random_seed)
            print(f"Set random seed to {self.random_seed}")

        # Load allele frequencies from a file or prepare to generate random ones
        self.allele_frequencies = self.load_allele_frequencies()

        # Normalise chromosome sizes so they sum to 1.0, ensuring proportional allocation
        total_size = sum(self.chromosome_sizes)
        if total_size <= 0:
            raise ValueError("Total chromosome size must be greater than zero.")
        self.chromosome_sizes = [s/total_size for s in self.chromosome_sizes]

        self.chromosomes: list[Chromosome] = []
        self.all_true_crossovers: list[Dict] = [] # Stores all actual crossovers that occurred during gamete production (from Poisson or fixed)
        self.detected_crossovers: list[Dict] = [] # Stores only those crossovers that were detectable by a change in marker alleles
        self.blind_spot_crossovers: list[Dict] = [] # Stores crossovers that occurred but were not detected by markers
        self.current_simulation_crossovers_info: Dict[str, Dict[str, list[Dict]]] = {} # To store info for the specific cross just run

    def load_allele_frequencies(self) -> Dict[str, Dict[int, float]]:
        """
        Loads allele frequencies from a CSV file if provided, otherwise generates random ones.

        CSV format: marker_id, allele_0_freq_pop1, allele_0_freq_pop2
        Returns:
            Dict[str, Dict[int, float]]: A dictionary where keys are marker IDs,
                                         and values are dictionaries containing allele 0 frequencies
                                         for population 0 and population 2.
                                         e.g., {'marker_id_1': {0: 0.9, 2: 0.1}}
        """
        if self.allele_freq_file:
            try:
                df = pd.read_csv(self.allele_freq_file)
                if not all(col in df.columns for col in ['marker_id', 'allele_0_freq_pop1', 'allele_0_freq_pop2']):
                    raise ValueError("Allele frequency CSV must contain 'marker_id', 'allele_0_freq_pop1', 'allele_0_freq_pop2' columns.")
                
                allele_freqs = {}
                for _, row in df.iterrows():
                    # Ensure frequencies are between 0 and 1
                    freq_pop1 = max(0.0, min(1.0, row['allele_0_freq_pop1']))
                    freq_pop2 = max(0.0, min(1.0, row['allele_0_freq_pop2']))
                    allele_freqs[row['marker_id']] = {0: freq_pop1, 2: freq_pop2}
                print(f"Loaded allele frequencies from {self.allele_freq_file}")
                return allele_freqs
            except FileNotFoundError:
                print(f"Warning: Allele frequency file '{self.allele_freq_file}' not found. Generating random frequencies.")
                return self._generate_random_allele_frequencies()
            except Exception as e:
                print(f"Error loading allele frequency file: {e}. Generating random frequencies.")
                return self._generate_random_allele_frequencies()
        else:
            return self._generate_random_allele_frequencies()

    def _generate_random_allele_frequencies(self) -> Dict[str, Dict[int, float]]:
        """Generates random allele frequencies for a default set of markers."""
        print("Generating random allele frequencies...")
        allele_freqs = {}
        # Generate marker IDs as 'marker_1', 'marker_2', etc., up to n_markers
        for i in range(self.n_markers): # Generate for the total number of markers
            marker_id = f"marker_{i+1}" 
            allele_freqs[marker_id] = {
                0: np.random.uniform(0.1, 0.9), # Allele 0 freq in Pop A
                2: np.random.uniform(0.1, 0.9)  # Allele 0 freq in Pop B
            }
        return allele_freqs


    def create_chromosomes(self, base_length: float = 100_000_000, base_genetic_length: float = 100.0):
        """
        Rebuild `self.chromosomes` from the configured relative sizes.

        Chromosomes are numbered from 1 and start without markers. Each length is
        the corresponding base length scaled by the chromosome's relative size.

        Args:
            base_length (float): The physical length (in base pairs) for a chromosome
                                 with a relative size of 1.0.
            base_genetic_length (float): The genetic length (in centiMorgans) for a chromosome
                                         with a relative size of 1.0.
        """
        self.chromosomes = []
        for zero_based_index in range(self.n_chromosomes):
            share = self.chromosome_sizes[zero_based_index]

            # With the cM model switched off the genetic length is forced to zero,
            # which disables recombination on this chromosome.
            length_cM = base_genetic_length * share if self.use_centimorgan else 0.0

            new_chromosome = Chromosome(
                id=zero_based_index + 1,
                physical_length_bp=base_length * share,
                genetic_length_cM=length_cM,
                markers=[],
            )
            self.chromosomes.append(new_chromosome)

    def assign_markers_to_chromosomes(self):
        """
        Assigns markers to the created chromosomes based on the specified distribution.
        Markers are evenly distributed or randomly placed.
        Marker IDs are in the format 'marker_N' using a global sequential counter.
        """
        total_markers_per_chromosome = self.n_markers // self.n_chromosomes
        remaining_markers = self.n_markers % self.n_chromosomes

        # Use a global counter for marker IDs to ensure unique, sequential naming across all chromosomes
        current_global_marker_id_counter = 1 

        for chrom in self.chromosomes:
            num_markers_on_this_chrom = total_markers_per_chromosome
            if remaining_markers > 0:
                num_markers_on_this_chrom += 1
                remaining_markers -= 1

            chrom.markers = [] # Clear any existing markers

            if self.marker_distribution == 'uniform':
                for j in range(num_markers_on_this_chrom):
                    divisor = num_markers_on_this_chrom + 1 if num_markers_on_this_chrom > 0 else 1 
                    physical_pos = chrom.physical_length_bp * ((j + 1) / divisor)
                    genetic_pos = chrom.genetic_length_cM * ((j + 1) / divisor)
                    
                    marker_id = f"marker_{current_global_marker_id_counter}" 
                    chrom.markers.append(
                        Marker(id=marker_id,
                               physical_position=physical_pos,
                               genetic_position=genetic_pos)
                    )
                    current_global_marker_id_counter += 1 # Increment global counter
            elif self.marker_distribution == 'random':
                for _ in range(num_markers_on_this_chrom):
                    physical_pos = np.random.uniform(0, chrom.physical_length_bp)
                    genetic_pos = np.random.uniform(0, chrom.genetic_length_cM)
                    
                    marker_id = f"marker_{current_global_marker_id_counter}" 
                    chrom.markers.append(
                        Marker(id=marker_id,
                               physical_position=physical_pos,
                               genetic_position=genetic_pos)
                    )
                    current_global_marker_id_counter += 1 # Increment global counter
                # Sort markers by physical position after random placement
                chrom.markers.sort(key=lambda m: m.physical_position)
            

    def create_ancestral_individual(self, individual_id: str, population_label: int) -> Individual:
        """
        Build a founder whose two homologs are both sampled from one population.

        At every marker, allele 0 is drawn with the population's allele-0 frequency
        (otherwise allele 2), independently for the maternal and then the paternal copy.
        Markers missing from the loaded frequencies fall back to 0.5 with a warning.

        Args:
            individual_id (str): A unique identifier for the individual.
            population_label (int): The population to draw alleles from (0 for Pop A, 2 for Pop B).

        Returns:
            Individual: A new Individual object with sampled genotypes.
        """
        founder = Individual(id=individual_id)

        for chrom in self.chromosomes:
            positions = [marker.physical_position for marker in chrom.markers]
            maternal_copy = []
            paternal_copy = []

            for marker in chrom.markers:
                # Look up the allele-0 frequency for the requested population
                if marker.id in self.allele_frequencies:
                    freq_allele_0 = self.allele_frequencies[marker.id][population_label]
                else:
                    print(f"Warning: Marker {marker.id} not found in loaded allele frequencies. Using default 0.5 for population {population_label}.")
                    freq_allele_0 = 0.5

                # Maternal draw first, then paternal (same population for a "pure" parent)
                maternal_copy.append(0 if np.random.rand() < freq_allele_0 else 2)
                paternal_copy.append(0 if np.random.rand() < freq_allele_0 else 2)

            # Both homologs reference the same positions list
            founder.maternal_chroms[chrom.id] = {'alleles': maternal_copy, 'positions': positions}
            founder.paternal_chroms[chrom.id] = {'alleles': paternal_copy, 'positions': positions}

        return founder


    def calculate_hybrid_index(self, individual: Individual) -> float:
        """
        Return the fraction of the individual's alleles that equal 0.

        Alleles are counted over both homologs of every chromosome present in
        `maternal_chroms`; allele value 0 is taken as Population 0.

        Args:
            individual (Individual): The individual to calculate the hybrid index for.

        Returns:
            float: The proportion of alleles equal to 0, or 0.0 when there are no alleles.
        """
        allele_total = 0
        pop0_total = 0
        for chrom_id, maternal_entry in individual.maternal_chroms.items():
            homologs = (maternal_entry['alleles'],
                        individual.paternal_chroms[chrom_id]['alleles'])
            for homolog in homologs:
                allele_total += len(homolog)
                pop0_total += len([allele for allele in homolog if allele == 0])

        # Guard against division by zero for an individual without markers
        return pop0_total / allele_total if allele_total else 0.0

    def calculate_heterozygosity(self, individual: Individual) -> float:
        """
        Return the fraction of markers at which the two homologs carry different alleles.

        Every marker of every chromosome in `maternal_chroms` is compared with the
        allele at the same index on the paternal homolog (e.g. 0 versus 2 counts as
        heterozygous).

        Args:
            individual (Individual): The individual to calculate heterozygosity for.

        Returns:
            float: The proportion of heterozygous markers, or 0.0 when there are no markers.
        """
        marker_total = 0
        het_total = 0
        for chrom_id, maternal_entry in individual.maternal_chroms.items():
            maternal = maternal_entry['alleles']
            paternal = individual.paternal_chroms[chrom_id]['alleles']

            for locus, maternal_allele in enumerate(maternal):
                marker_total += 1
                if maternal_allele != paternal[locus]:
                    het_total += 1

        # Guard against division by zero for an individual without markers
        return het_total / marker_total if marker_total else 0.0

    def print_summary(self):
        """
        Prints a summary of the current simulation parameters and recombination events.
        This focuses on the most recently simulated cross.
        """
        print("\n--- Simulation Summary ---")
        print(f"Number of Chromosomes: {self.n_chromosomes}")
        print(f"Total Markers: {self.n_markers}")
        print(f"Marker Distribution: {self.marker_distribution}")
        print(f"Crossover Model: {'Poisson' if self.use_poisson else 'Fixed/Custom'}")
        if not self.use_poisson:
            if self._custom_crossover_counts is not None:
                dist_str = ', '.join([f"{c}:{p:.2f}" for c, p in zip(self._custom_crossover_counts, self._custom_crossover_probs or [])])
                print(f"  Custom Crossover Distribution (counts:probabilities): {dist_str}")
            elif self._fixed_crossover_uniform_count is not None:
                print(f"  Fixed Crossovers per Chromosome (Uniform): {self._fixed_crossover_uniform_count}")
            elif self._fixed_crossover_per_chrom_counts:
                print(f"  Fixed Crossovers per Chromosome (Per Chrom ID): {self._fixed_crossover_per_chrom_counts}")
            else:
                print(f"  No Crossovers (Fixed or Custom not set and Poisson is off)")
        print(f"Using cM distances for recombination: {self.use_centimorgan}")
        print(f"Random Seed: {'Not set' if self.random_seed is None else 'Set to ' + str(self.random_seed)}")
        
        print("\n--- Recombination Events (Last Cross) ---")
        print(f"Total true crossovers (occurred): {len(self.all_true_crossovers)}")
        print(f"Total detected crossovers (observable): {len(self.detected_crossovers)}")
        print(f"Total blind spot crossovers (undetected): {len(self.blind_spot_crossovers)}")

        if self.all_true_crossovers:
            df_true = pd.DataFrame(self.all_true_crossovers)
            print("\nTrue Crossovers by Chromosome:")
            print(df_true['chromosome'].value_counts().sort_index())

        if self.detected_crossovers:
            df_detected = pd.DataFrame(self.detected_crossovers)
            print("\nDetected Crossovers by Chromosome:")
            print(df_detected['chromosome'].value_counts().sort_index())


    def simulate_recombination(self, parent1: Individual, parent2: Individual) -> Individual:
        """
        Simulates a genetic cross between two parent individuals to produce one offspring.

        This involves:
        1. Each parent producing a haploid gamete through meiosis.
        2. Combining the two gametes (one from each parent) to form a new diploid offspring.
        All true crossover events are tracked, as are those detected by marker switches.

        The crossover bookkeeping on the simulator (`all_true_crossovers`,
        `detected_crossovers`, `blind_spot_crossovers`,
        `current_simulation_crossovers_info`) is reset and then filled for this cross only.

        A true crossover counts as detected when the marker at which it takes effect
        (the first marker, in list order, whose genetic position is at or beyond the
        crossover) was logged as a detection for the same parent and chromosome.
        Every other true crossover is a blind spot.

        Args:
            parent1 (Individual): The first parent individual.
            parent2 (Individual): The second parent individual.

        Returns:
            Individual: A new `Individual` object representing the offspring.
        """
        # Initialize empty dictionaries to hold the offspring's chromosome data
        offspring_maternal_chroms_data: Dict[int, Dict[str, Union[List[int], List[float]]]] = {}
        offspring_paternal_chroms_data: Dict[int, Dict[str, Union[List[int], List[float]]]] = {}

        # Clear previous simulation's crossover data for a clean slate for the new cross
        self.all_true_crossovers = []
        self.detected_crossovers = []
        self.blind_spot_crossovers = []
        self.current_simulation_crossovers_info = {'parent1': {'all_true': [], 'detected': []},
                                                    'parent2': {'all_true': [], 'detected': []}}

        print(f"  Crossing {parent1.id} with {parent2.id} to produce offspring ...")

        for chrom in self.chromosomes:
            # Determine fixed crossover count for this chromosome
            fixed_count_for_this_chrom = 0
            if self._fixed_crossover_uniform_count is not None:
                fixed_count_for_this_chrom = self._fixed_crossover_uniform_count
            elif self._fixed_crossover_per_chrom_counts is not None:
                fixed_count_for_this_chrom = self._fixed_crossover_per_chrom_counts.get(chrom.id, 0)

            # Parent 1 produces its gamete first, then parent 2 (the order fixes the
            # sequence of random draws).
            gametes: Dict[str, list[int]] = {}
            for parent_key, parent in (('parent1', parent1), ('parent2', parent2)):
                gamete_detected_crossovers: list[Dict] = []
                gamete_all_true_crossovers: list[Dict] = []
                gametes[parent_key] = parent.produce_gamete(
                    chrom,
                    self.use_poisson,
                    fixed_count_for_this_chrom,
                    self._custom_crossover_counts,
                    self._custom_crossover_probs,
                    gamete_detected_crossovers,
                    gamete_all_true_crossovers
                )
                parent_info = self.current_simulation_crossovers_info[parent_key]
                parent_info['all_true'].extend(gamete_all_true_crossovers)
                parent_info['detected'].extend(gamete_detected_crossovers)

            # The offspring inherits one gamete chromosome from each parent. Which
            # gamete is stored as "maternal" and which as "paternal" is random.
            if random.random() < 0.5:
                maternal_gamete, paternal_gamete = gametes['parent1'], gametes['parent2']
            else:
                maternal_gamete, paternal_gamete = gametes['parent2'], gametes['parent1']

            # Each homolog gets its own positions list so editing one cannot
            # silently change the other.
            offspring_maternal_chroms_data[chrom.id] = {
                'alleles': maternal_gamete,
                'positions': [m.physical_position for m in chrom.markers]
            }
            offspring_paternal_chroms_data[chrom.id] = {
                'alleles': paternal_gamete,
                'positions': [m.physical_position for m in chrom.markers]
            }

        # Create the offspring Individual dataclass instance using the populated dictionaries
        offspring = Individual(
            id=f"offspring_{random.randint(1000,9999)}",
            maternal_chroms=offspring_maternal_chroms_data,
            paternal_chroms=offspring_paternal_chroms_data
        )
        print(f"  Offspring created: {offspring.id}")

        # Consolidate crossover data after both gametes have been produced
        self.all_true_crossovers = self.current_simulation_crossovers_info['parent1']['all_true'] + \
                                   self.current_simulation_crossovers_info['parent2']['all_true']

        self.detected_crossovers = self.current_simulation_crossovers_info['parent1']['detected'] + \
                                   self.current_simulation_crossovers_info['parent2']['detected']

        # Calculate blind spots: any true crossover that was not detected.
        # Detections are logged at a marker's genetic position, never at the
        # crossover's own position, so each crossover is first mapped to the marker
        # at which it takes effect and that marker is looked up among the detections.
        # This is done per parent so two parents sharing an id cannot be confused.
        markers_by_chromosome = {chrom.id: chrom.markers for chrom in self.chromosomes}
        self.blind_spot_crossovers = []
        for parent_key in ('parent1', 'parent2'):
            parent_info = self.current_simulation_crossovers_info[parent_key]
            detected_marker_keys = {
                (d['chromosome'], d['genetic_position_approx']) for d in parent_info['detected']
            }
            for tc in parent_info['all_true']:
                effective_marker_position = next(
                    (m.genetic_position
                     for m in markers_by_chromosome.get(tc['chromosome'], [])
                     if m.genetic_position >= tc['genetic_position']),
                    None  # crossover lies beyond every marker: cannot be observed
                )
                if effective_marker_position is None or \
                        (tc['chromosome'], effective_marker_position) not in detected_marker_keys:
                    self.blind_spot_crossovers.append(tc)

        return offspring

# In [10]:
# %%
# Runs a small test simulation 
def main_toy_run(allele_freq_file_path=None, output_filename='toy_simulation_results.json'):
    """
    Run a small, fixed-parameter simulation for testing and debugging in Jupyter.

    Bypasses argparse and sets every simulator parameter directly: one
    chromosome, five uniformly distributed markers, random seed 42, and a
    custom discrete crossover-count distribution (0 crossovers with
    probability 0.2, 1 crossover with probability 0.8).

    The run creates the chromosomes, assigns markers, builds two ancestral
    parents, crosses them once to obtain an F1 offspring, prints the hybrid
    index, heterozygosity and the simulator summary, and finally tries to
    save the results as JSON.

    Args:
        allele_freq_file_path: Path of the allele-frequency CSV handed to the
            simulator. When None (the default), the development path that
            used to be hard-coded in this function is used, so existing
            zero-argument calls behave as before.
        output_filename: Destination of the JSON results file.

    Returns:
        None. Results are printed and written to `output_filename`.
    """
    # Imported locally so the save step does not depend on an earlier
    # notebook cell having brought these names into scope.
    import json
    from dataclasses import asdict

    print("--- Running Toy Simulation (main_toy_run) ---")

    if allele_freq_file_path is None:
        # Historical default: machine-specific test CSV. Pass a path explicitly
        # to run the toy simulation on any other machine.
        allele_freq_file_path = r"C:\Users\sophi\Jupyter_projects\Hybrid_Code\real_world_sim\allele_freq_test.csv"

    # Fixed parameters for the toy run
    n_chromosomes = 1
    n_markers = 5
    chromosome_sizes = [1.0]  # Only one chromosome
    marker_distribution = 'uniform'
    use_poisson = False  # False selects the custom distribution below
    random_seed = 42  # Fixed seed for reproducible toy runs

    # Custom discrete distribution for crossover counts:
    # 20% chance of 0 crossovers, 80% chance of 1 crossover.
    custom_counts_toy = [0, 1]
    custom_probs_toy = [0.2, 0.8]  # Sum should be 1.0 (or close)

    # Initialise the recombination simulator.
    # fixed_crossover_config is not passed because custom_crossover_counts is provided.
    sim = RecombinationSimulator(
        n_chromosomes=n_chromosomes,
        chromosome_sizes=chromosome_sizes,
        n_markers=n_markers,
        marker_distribution=marker_distribution,
        use_poisson=use_poisson,
        random_seed=random_seed,
        custom_crossover_counts=custom_counts_toy,
        custom_crossover_probs=custom_probs_toy,
        allele_freq_file=allele_freq_file_path
    )

    # Step 1: Create the chromosomes (smaller lengths than a full run)
    print("\nCreating chromosomes for toy run...")
    sim.create_chromosomes(base_length=10_000, base_genetic_length=10.0)

    # Step 2: Assign markers
    print("Assigning markers to chromosomes for toy run...")
    sim.assign_markers_to_chromosomes()

    # Step 3: Create founding populations
    print("Creating founding populations (Parent A and Parent B) for toy run using loaded allele frequencies...")
    # NOTE(review): population labels 0 and 2 are passed through unchanged;
    # confirm they match the populations present in the allele-frequency file.
    parent_A = sim.create_ancestral_individual("P_A_Toy", population_label=0)
    parent_B = sim.create_ancestral_individual("P_B_Toy", population_label=2)

    # Step 4: Simulate the F1 hybrid generation
    print("Simulating F1 generation (P_A x P_B) for toy run...")
    offspring_F1 = sim.simulate_recombination(parent_A, parent_B)

    # Metrics for the F1 offspring
    hybrid_index_F1 = sim.calculate_hybrid_index(offspring_F1)
    heterozygosity_F1 = sim.calculate_heterozygosity(offspring_F1)

    print(f"\n--- Toy F1 Generation Results (Offspring {offspring_F1.id}) ---")
    print(f"F1 Hybrid Index (Proportion from Pop 0): {hybrid_index_F1:.3f}")
    print(f"F1 Heterozygosity: {heterozygosity_F1:.3f}")

    # Summary of the simulation parameters and the recombination events of the F1 cross
    sim.print_summary()

    # Saving is best-effort: a failure is reported but must not abort the
    # notebook cell, hence the deliberately broad handler at this top-level boundary.
    try:
        crossover_log = sim.current_simulation_crossovers_info
        results_to_save = {
            'Toy_F1_generation': {
                'offspring_id': offspring_F1.id,
                'hybrid_index': hybrid_index_F1,
                'heterozygosity': heterozygosity_F1,
                'crossovers_info_F1': {
                    # Both parents' crossover info from the F1 cross
                    'parent1_all_true': crossover_log['parent1']['all_true'],
                    'parent1_detected': crossover_log['parent1']['detected'],
                    'parent2_all_true': crossover_log['parent2']['all_true'],
                    'parent2_detected': crossover_log['parent2']['detected'],
                },
                'offspring_chromosomes': {
                    'maternal_chroms': dict(offspring_F1.maternal_chroms),
                    'paternal_chroms': dict(offspring_F1.paternal_chroms),
                },
            },
            'chromosome_info': [asdict(chrom) for chrom in sim.chromosomes],
        }

        # Serialize before opening the file so a serialization error cannot
        # leave a truncated or empty results file behind.
        serialized = json.dumps(results_to_save, indent=2, cls=NpEncoder)
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(serialized)
        print(f"\nToy simulation results saved to '{output_filename}'")
    except Exception as e:
        print(f"\nError saving toy simulation results to JSON: {e}")

    print("--- Toy Simulation Finished ---")

In [11]:
# %%
# Final Execution Block
# This block allows you to run the simulation by calling the main functions.
# In Jupyter, ensure all cells above this have been run before executing this cell.

if __name__ == "__main__":
    # Entry point: exactly one of the invocations below should be active.
    # The toy run is enabled; the others are kept as commented-out usage
    # examples for `main`, which is not defined in this cell (it must come
    # from an earlier cell).

    # 1. Run the small toy example for debugging and testing in Jupyter.
    #    It uses the custom crossover distribution 0:0.2, 1:0.8.
    main_toy_run()

    # 2. Run the full simulation with default command-line parameters (as defined in argparse defaults):
    #    Original note: this would effectively result in 0 crossovers if --use_poisson
    #    is not present (NOTE(review): not verifiable from this cell - confirm against main()).
    # main()

    # 3. Run the full simulation with custom command-line parameters:
    #    Example: Use the custom distribution (20% for 0 CO, 80% for 1 CO) for 4 chromosomes.
    # main(cli_args=[
    #     '--n_chromosomes', '4',
    #     '--n_markers', '50',
    #     # IMPORTANT: Do NOT include '--use_poisson' here to activate the custom distribution
    #     '--random_seed', '123',
    #     '--output', 'custom_crossover_dist_results.json',
    #     '--chromosome_sizes', '1.0,0.8,0.6,0.4',
    #
    #     # Specify the custom discrete crossover distribution
    #     # Format: "count1:prob1,count2:prob2,..."
    #     '--custom_crossover_dist', '0:0.2,1:0.8'
    # ])

    # Example: To force 1 crossover on ALL chromosomes (using the fixed_crossovers argument)
    # main(cli_args=[
    #     '--n_chromosomes', '4',
    #     '--n_markers', '50',
    #     '--random_seed', '123',
    #     '--output', 'fixed_1_crossover_run_results.json',
    #     '--chromosome_sizes', '1.0,0.8,0.6,0.4',
    #     '--fixed_crossovers', '1' # This will set 1 crossover for EACH of your 4 chromosomes
    # ])

--- Running Toy Simulation (main_toy_run) ---
Set random seed to 42
Loaded allele frequencies from C:\Users\sophi\Jupyter_projects\Hybrid_Code\real_world_sim\allele_freq_test.csv

Creating chromosomes for toy run...
Assigning markers to chromosomes for toy run...
Creating founding populations (Parent A and Parent B) for toy run using loaded allele frequencies...
Simulating F1 generation (P_A x P_B) for toy run...
  Crossing P_A_Toy with P_B_Toy to produce offspring ...
  Offspring created: offspring_5012

--- Toy F1 Generation Results (Offspring offspring_5012) ---
F1 Hybrid Index (Proportion from Pop 0): 0.500
F1 Heterozygosity: 0.600

--- Simulation Summary ---
Number of Chromosomes: 1
Total Markers: 5
Marker Distribution: uniform
Crossover Model: Fixed/Custom
  Custom Crossover Distribution (counts:probabilities): 0:0.20, 1:0.80
Using cM distances for recombination: True
Random Seed: Set to 42

--- Recombination Events (Last Cross) ---
Total true crossovers (occurred): 2
Total detec