### Block Data Analysis Script

In [None]:
# This script takes the DataFrame outputs from the simulation and analyses/visualises them.

In [None]:
#Cell 1
import ast
import os # For path joining

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
#Cell 2
# Location of the chromatid_recombination_data.csv written by the simulation
# script. Adjust 'data_folder' / 'recombination_data_filename' if the file was
# saved somewhere else (the default assumes a 'simulation_results' folder).
data_folder = "simulation_results"
recombination_data_filename = "chromatid_recombination_data.csv"
full_path_to_recomb_data = os.path.join(data_folder, recombination_data_filename)

print(f"Attempting to load data from: {full_path_to_recomb_data}")

# Only the read itself sits inside the try so that the handler cannot mask
# an unrelated error raised while printing the preview.
try:
    chromatid_recomb_df = pd.read_csv(full_path_to_recomb_data)
except FileNotFoundError:
    print(f"Error: The file '{full_path_to_recomb_data}' was not found.")
    print("Please ensure you have run your simulation script and saved the recombination data.")
    print("Also, double-check the 'data_folder' and 'recombination_data_filename' variables.")
    # Stop here with a non-zero status: nothing below can run without the data.
    # SystemExit is raised directly instead of calling the interactive-only
    # exit() helper, which would also have reported success (status 0).
    raise SystemExit(1) from None
else:
    print("Data loaded successfully!")
    print("\n--- Initial glance at the DataFrame ---")
    print(chromatid_recomb_df.head())
    print("\n--- DataFrame Info ---")
    chromatid_recomb_df.info()

In [None]:
# Cell 3

# If 'block_lengths' or 'block_alleles' are stored as strings (e.g., "[10, 20, 30]"),
# you'll need to convert them to actual lists.
# This is crucial for correctly analyzing block lengths.
def parse_list_string(s):
    """Parse text holding a Python literal (e.g. "[10, 20, 30]") into its value.

    The text is interpreted with ast.literal_eval, so it can only ever yield
    plain literals (lists, tuples, numbers, strings, ...) and can never run
    code, unlike eval.

    Args:
        s: Text of a Python literal, typically a list. Non-string values
            (for example None or NaN from a missing cell) are accepted.

    Returns:
        The parsed value, or an empty list when `s` is not a string or does
        not contain a valid literal.
    """
    if not isinstance(s, str):
        return []  # Missing / non-text cell: nothing to parse
    try:
        # strip() keeps leading whitespace from being read as indentation
        return ast.literal_eval(s.strip())
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []  # Return empty list if parsing fails

# Convert each list-valued column that was read back as text. The second
# tuple item is the prefix printed before that column's progress message.
for list_column, message_prefix in (('block_lengths', "\n"), ('block_alleles', "")):
    if list_column not in chromatid_recomb_df.columns:
        continue
    column_values = chromatid_recomb_df[list_column]
    # The first value decides whether the column still holds strings; an empty
    # column is skipped because .iloc[0] would raise IndexError on it.
    if column_values.empty or not isinstance(column_values.iloc[0], str):
        continue
    print(f"{message_prefix}Converting '{list_column}' from string to list...")
    chromatid_recomb_df[list_column] = column_values.apply(parse_list_string)


# Ensure 'generation' is treated as a categorical type for consistent ordering in plots
# You might need a custom sort order for generations (e.g., P_A, P_B, F1, F2, BC1A, etc.)
# For now, we'll use a natural sort.
def natural_sort_key(s):
    """Build a sort key that orders embedded numbers by value ("F2" before "F10").

    Args:
        s: The label to build a key for (a string).

    Returns:
        A list alternating lower-cased text chunks and integers, e.g.
        "BC1A" -> ['bc', 1, 'a'].
    """
    # Splitting on a capturing group yields text chunks at even positions and
    # the captured ASCII digit runs at odd positions. Classifying by position
    # (rather than str.isdigit) means characters such as '²', which isdigit()
    # accepts but int() rejects, are safely treated as text.
    chunks = re.split(r'([0-9]+)', s)
    return [
        int(chunk) if position % 2 else chunk.lower()
        for position, chunk in enumerate(chunks)
    ]

import re # Ensure re is imported for natural_sort_key

# Collect the distinct generation labels and put them in natural order.
unique_generations = list(chromatid_recomb_df['generation'].unique())
unique_generations.sort(key=natural_sort_key)

# Re-type the column as an ordered categorical that follows that order.
generation_as_categorical = pd.Categorical(
    chromatid_recomb_df['generation'],
    ordered=True,
    categories=unique_generations,
)
chromatid_recomb_df['generation'] = generation_as_categorical

In [None]:
# Cell 4

print("\n--- Analysis: Average Number of Junctions per Chromatid ---")
# 'generation' is converted to an ordered categorical earlier in this script.
# observed=False is passed explicitly so the result keeps one row per category
# (the behaviour this script has always had) and newer pandas versions do not
# emit a FutureWarning about the changing default.
avg_junctions_per_gen = (
    chromatid_recomb_df
    .groupby('generation', observed=False)['total_junctions']
    .mean()
    .reset_index()
)
print("Average junctions per chromatid per generation:")
print(avg_junctions_per_gen)

In [None]:
# Cell 5

print("\n--- Analysis: Distribution of Junctions per Generation ---")

# One box per generation showing the spread of total_junctions across chromatids
plt.figure(figsize=(12, 6))
sns.boxplot(x='generation', y='total_junctions', data=chromatid_recomb_df)
plt.title('Distribution of Total Junctions per Chromatid Across Generations')
plt.xlabel('Generation')
plt.ylabel('Total Junctions (Crossovers)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Table of how many chromatids have each junction count: one row per
# generation, one column per distinct total_junctions value, 0 where a
# combination does not occur.
# observed=False is passed explicitly because 'generation' is categorical:
# it keeps the existing output and avoids pandas' FutureWarning about the
# changing default.
junction_counts_per_gen = (
    chromatid_recomb_df
    .groupby('generation', observed=False)['total_junctions']
    .value_counts()
    .unstack(fill_value=0)
)
print("\nCounts of junction numbers per generation:")
print(junction_counts_per_gen)


In [None]:
# Cell 6

print("\n--- Analysis: Average Block Lengths per Chromatid ---")

# Explode the list of block lengths so that every block gets its own row;
# the other columns are repeated for each block of the same chromatid.
exploded_blocks_df = chromatid_recomb_df.explode('block_lengths')

# Exploded values are generic objects: coerce them to numbers, turning anything
# unparseable into NaN, then drop those rows.
exploded_blocks_df['block_lengths'] = pd.to_numeric(exploded_blocks_df['block_lengths'], errors='coerce')
exploded_blocks_df = exploded_blocks_df.dropna(subset=['block_lengths'])

# Mean is taken over all blocks of a generation. observed=False is passed
# explicitly because 'generation' is categorical: a generation left without any
# valid block still gets a (NaN) row, as before, and newer pandas versions do
# not emit a FutureWarning about the changing default.
avg_block_length_per_gen = (
    exploded_blocks_df
    .groupby('generation', observed=False)['block_lengths']
    .mean()
    .reset_index()
)
print("Average block length per generation:")
print(avg_block_length_per_gen)

In [None]:
# Cell 7

print("\n--- Analysis: Distribution of Block Lengths per Generation ---")

# Box plot: spread of individual block lengths, one box per generation
plt.figure(figsize=(12, 6))
sns.boxplot(data=exploded_blocks_df, y='block_lengths', x='generation')
plt.title('Distribution of Block Lengths Across Generations')
plt.xlabel('Generation')
plt.ylabel('Block Length (number of loci)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Closer look at a single generation: a histogram (with KDE overlay) of the
# F2 block lengths, drawn only when F2 is present in the data.
generations_present = exploded_blocks_df['generation'].unique()
if 'F2' in generations_present:
    is_f2 = exploded_blocks_df['generation'] == 'F2'
    f2_block_lengths = exploded_blocks_df.loc[is_f2, 'block_lengths']
    plt.figure(figsize=(8, 5))
    sns.histplot(f2_block_lengths, bins=30, kde=True)
    plt.title('Distribution of Block Lengths in F2 Generation')
    plt.xlabel('Block Length (number of loci)')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()