In [1]:
#!/usr/bin/env python
# coding: utf-8
"""
============================================================================
CORRECTED DNAformer Synthetic Data Generator - ALL CONFIGURATIONS
============================================================================

Based on: "Scalable and robust DNA-based storage via coding theory and deep learning"
Nature Machine Intelligence, 2025

This script generates synthetic data for ALL dataset configurations mentioned
in the DNAformer paper and GitHub code.

Dataset configurations extracted from:
- train.py config class comments
- error_rates_setup.py set_values() function
- Supplementary materials

Author: Corrected based on DNAformer paper and GitHub implementation
============================================================================
"""

import os
import random
import copy
import numpy as np
from tqdm import tqdm
from pathlib import Path

# Import from your existing DataGenerator package
from DataGenerator.error_rates_setup import ErrorRates
from DataGenerator.cluster_generator import ClusterGenerator


# ============================================================================
# ALL DATASET CONFIGURATIONS (from train.py and error_rates_setup.py)
# ============================================================================

# Registry of every supported dataset, keyed by dataset name.
#
# Fields shared by all entries:
#   label_length        length (bp) of the random label generated per cluster
#   max_deviation       deviation (bp) added to label_length for "max read length"
#   index_length        index length (bp); reported in the summary table
#   sequencing_tech /   technology names exactly as quoted from train.py
#   synthesis_tech      (spelling kept verbatim)
#   error_rates_method  name of the ErrorRates method that loads the error profile
#   uses_noise_coef     True when that method takes a noise-coefficient dict
#   noise_coef          (optional) default multipliers for such methods
#   description         human-readable summary printed in banners
#   real_cluster_count  cluster count of the real dataset, per train.py comments
DATASET_CONFIGS = {
    # ---------------------------------------------------------------------
    # Public benchmark datasets (values quoted from train.py comments)
    # ---------------------------------------------------------------------

    # train.py "Erlich": cluster 72,000 / label 152 / dev 10 / index 16,
    # sequenced on Illumina miSeq, synthesized by Twist Bioscience.
    "Erlich": dict(
        label_length=152,
        max_deviation=10,
        index_length=16,
        sequencing_tech="Ilumina miSeq",
        synthesis_tech="Twist Bioscience",
        error_rates_method="set_EZ17_values",
        uses_noise_coef=False,
        description="Erlich et al. (2017) - DNA Fountain, Twist + Illumina miSeq",
        real_cluster_count=72000,
    ),

    # train.py "Grass": cluster 4989 / label 117 / dev 11 / index 13,
    # sequenced on Illumina miSeq, synthesized by CustomArray.
    "Grass": dict(
        label_length=117,
        max_deviation=11,
        index_length=13,
        sequencing_tech="Ilumina miSeq",
        synthesis_tech="CustomArray",
        error_rates_method="set_G15_values",
        uses_noise_coef=False,
        description="Grass et al. (2015) - CustomArray + Illumina miSeq",
        real_cluster_count=4989,
    ),

    # train.py lists this one as "Luis": cluster 596,499 / label 110 / dev 5 /
    # index 33, sequenced on Illumina NextSeq, synthesized by Twist Bioscience.
    "Organick": dict(
        label_length=110,
        max_deviation=5,
        index_length=33,
        sequencing_tech="Ilumina NextSeq",
        synthesis_tech="Twist Bioscience",
        error_rates_method="set_O17_values",
        uses_noise_coef=False,
        description="Organick et al. (2018) - Twist + Illumina NextSeq",
        real_cluster_count=596499,
    ),

    # train.py lists this one as "Pfitser": cluster 9984 / label 110 / dev 5 /
    # index 4, sequenced on MinION, synthesized by Twist Bioscience.
    "Srinivasavaradhan": dict(
        label_length=110,
        max_deviation=5,
        index_length=4,
        sequencing_tech="MinION",
        synthesis_tech="Twist Bioscience",
        error_rates_method="set_R21_values",
        uses_noise_coef=False,
        description="Srinivasavaradhan et al. (Pfitser) - Twist + MinION",
        real_cluster_count=9984,
    ),

    # ---------------------------------------------------------------------
    # DNAformer's own pilot and full datasets
    # ---------------------------------------------------------------------

    # train.py "full_illumina": cluster 109,944 / label 128 / dev 4 / index 12,
    # Illumina miSeq-0922 sequencing, Twist Bioscience-0922 synthesis.
    # The label is 128 bp once the 12-bp index is removed; the full 140 bp
    # strand is generated here.
    "DNAformer_Illumina_Full": dict(
        label_length=140,
        max_deviation=4,
        index_length=12,
        sequencing_tech="Ilumina miSeq-0922",
        synthesis_tech="Twist Bioscience-0922",
        error_rates_method="set_BOS22_values",
        uses_noise_coef=False,
        description="DNAformer Full Illumina (2022) - Twist + miSeq",
        real_cluster_count=109944,
    ),

    # train.py "pilot_illumina": cluster 1,000 / label 128 / dev 4 / index 12,
    # Illumina miSeq sequencing, Twist Bioscience synthesis.
    # The pilot reuses the EZ17 error profile.
    "DNAformer_Illumina_Pilot": dict(
        label_length=140,
        max_deviation=4,
        index_length=12,
        sequencing_tech="Ilumina miSeq",
        synthesis_tech="Twist Bioscience",
        error_rates_method="set_EZ17_values",
        uses_noise_coef=False,
        description="DNAformer Pilot Illumina - Twist + miSeq",
        real_cluster_count=1000,
    ),

    # train.py "pilot_nanopore": cluster 1,000 / label 128 / dev 10 / index 12,
    # MinIONShort sequencing, Twist Bioscience synthesis.
    "DNAformer_Nanopore_Pilot": dict(
        label_length=140,
        max_deviation=10,
        index_length=12,
        sequencing_tech="MinIONShort",
        synthesis_tech="Twist Bioscience",
        error_rates_method="set_B22_values",
        uses_noise_coef=False,
        description="DNAformer Pilot Nanopore - Twist + MinION Short",
        real_cluster_count=1000,
    ),

    # train.py "full_nanopore": cluster 109,944 / label 128 / dev 4 / index 12.
    # Relies on Nanopore_pilot_v2_multi, whose setter takes noise multipliers.
    "DNAformer_Nanopore_Full": dict(
        label_length=140,
        max_deviation=4,
        index_length=12,
        sequencing_tech="Nanopore_pilot_v2_multi",
        synthesis_tech="Twist Bioscience_nanopore_pilot_v2_multi",
        error_rates_method="set_BOS22PILOTOMER_values_multi",
        uses_noise_coef=True,
        noise_coef={"del_mult": 1.25, "ins_mult": 1.25, "sub_mult": 1.25},
        description="DNAformer Full Nanopore - Twist + MinION (with multipliers)",
        real_cluster_count=109944,
    ),
}


# ============================================================================
# AUGMENTATION CONFIGURATION (from train.py config class)
# ============================================================================

class AugmentationConfig:
    """
    Augmentation knobs applied while synthesizing clusters.

    The defaults are the ones quoted from the train.py config class:
        generate_data_noise               = 0.1  # [0-1] std from nominal value
        max_false_copies                  = 2    # max copies inserted to cluster
        false_copies_prob                 = 0.3
        min_cluster_size_for_false_copies = 4
    """

    # Bounds on the number of copies per cluster; handed to ClusterGenerator
    # as min_copies / max_copies.
    min_cluster_size: int = 1
    max_cluster_size: int = 16

    # False-copy injection (simulated clustering errors).
    # Paper: "injects random false copies into the training process".
    # A cluster is only eligible when it holds MORE than
    # min_cluster_size_for_false_copies reads.
    false_copies_prob: float = 0.3   # chance that an eligible cluster is altered
    max_false_copies: int = 2        # upper bound on reads replaced per cluster
    min_cluster_size_for_false_copies: int = 4

    # Per-cluster error-rate variation, passed to generate_cluster(delta=...).
    # Paper: "varies the standard deviation of the generated noise statistics".
    generate_data_noise: float = 0.1  # delta = 0.1 (10% std from nominal)


# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def num2dna(seq):
    """Translate an iterable of base codes (0=A, 1=C, 2=G, 3=T) into a DNA string.

    Any code outside 0-3 raises KeyError.
    """
    alphabet = dict(enumerate("ACGT"))
    return "".join(alphabet[code] for code in seq)


def inject_false_copies(noisy_copies, config):
    """
    Overwrite a few reads of a cluster with unrelated random sequences.

    This augmentation imitates clustering errors, i.e. reads that belong to a
    different strand but were assigned to this cluster. The paper's
    supplementary text describes it as: "The first mechanism injects random
    false copies into the training process. This helps the model learn how to
    ignore such cases that occur due to clustering errors."

    The trigger follows the rule quoted from data_loader.py
    get_false_copies(): inject only when a uniform draw is below
    ``config.false_copies_prob`` AND the cluster holds more than
    ``config.min_cluster_size_for_false_copies`` reads.

    Parameters:
    -----------
    noisy_copies : list of str
        Reads of one cluster. Modified IN PLACE.
    config : object
        Provides ``false_copies_prob``, ``min_cluster_size_for_false_copies``
        and ``max_false_copies``.

    Returns:
    --------
    list of str : the same list object that was passed in
    """
    # The uniform draw happens first and unconditionally; the size check is
    # only evaluated when the draw succeeds.
    if not (np.random.random() < config.false_copies_prob):
        return noisy_copies
    if not (len(noisy_copies) > config.min_cluster_size_for_false_copies):
        return noisy_copies

    # How many reads to corrupt: uniform in [1, max_false_copies], capped at
    # the cluster size.
    requested = np.random.randint(1, config.max_false_copies + 1)
    cluster_size = len(noisy_copies)
    victims = np.random.choice(
        cluster_size,
        size=min(requested, cluster_size),
        replace=False,  # distinct positions
    )

    # Each chosen read becomes a uniformly random sequence of the same length.
    for position in victims:
        span = len(noisy_copies[position])
        noisy_copies[position] = num2dna(np.random.randint(4, size=(span,)))

    return noisy_copies


def create_random_label(length):
    """Return a string of ``length`` bases, each drawn from A/C/G/T with ``random.choices``."""
    alphabet = "ACGT"
    picks = random.choices(alphabet, k=length)
    return "".join(picks)


# ============================================================================
# MAIN GENERATOR FUNCTION
# ============================================================================

def generate_synthetic_dataset(
    dataset_name,
    num_clusters,
    output_dir="generated_data_corrected",
    use_false_copies=True,
    use_noise_deviation=True,
    custom_noise_coef=None
):
    """
    Generate a synthetic dataset for one configuration and write it to disk.

    For every cluster a random label is created, a ClusterGenerator produces
    its noisy copies, optional false copies are injected, and the cluster is
    appended to ``<output_dir>/binned_synthetic_<dataset_name.lower()>.txt``.

    Each cluster is written as: the label line, a separator line of
    asterisks, one read per line, then an empty line.

    Parameters:
    -----------
    dataset_name : str
        One of the keys in DATASET_CONFIGS
    num_clusters : int
        Number of clusters to generate
    output_dir : str
        Directory to save generated data (created if missing)
    use_false_copies : bool
        Whether to inject false copies (recommended: True)
    use_noise_deviation : bool
        Whether to pass AugmentationConfig.generate_data_noise as ``delta`` to
        the cluster generator; ``delta=0`` is passed otherwise
        (recommended: True)
    custom_noise_coef : dict, optional
        Custom multipliers for error rates. Only used by datasets whose
        configuration sets ``uses_noise_coef``; for any other dataset a
        warning is printed and the value is ignored.

    Returns:
    --------
    str : Path to generated file

    Raises:
    -------
    ValueError
        If ``dataset_name`` is not a key of DATASET_CONFIGS.
    """
    # Validate dataset name
    if dataset_name not in DATASET_CONFIGS:
        available = list(DATASET_CONFIGS.keys())
        raise ValueError(f"Unknown dataset '{dataset_name}'.\nAvailable: {available}")

    ds_config = DATASET_CONFIGS[dataset_name]
    aug_config = AugmentationConfig()

    # Describe the augmentation from the config object itself so the banner
    # cannot drift from the values that are actually applied.
    if use_noise_deviation:
        noise_status = f"{aug_config.generate_data_noise} (enabled)"
    else:
        noise_status = "disabled"
    if use_false_copies:
        false_copies_status = (
            f"enabled ({aug_config.false_copies_prob:.0%} prob, "
            f"max {aug_config.max_false_copies})"
        )
    else:
        false_copies_status = "disabled"

    # Print configuration
    print("\n" + "="*70)
    print(f"GENERATING: {dataset_name}")
    print("="*70)
    print(f"Description: {ds_config['description']}")
    print(f"\nDataset Parameters:")
    print(f"  Label length:     {ds_config['label_length']} bp")
    print(f"  Max deviation:    {ds_config['max_deviation']} bp")
    print(f"  Max read length:  {ds_config['label_length'] + ds_config['max_deviation']} bp")
    print(f"  Index length:     {ds_config['index_length']} bp")
    print(f"  Sequencing:       {ds_config['sequencing_tech']}")
    print(f"  Synthesis:        {ds_config['synthesis_tech']}")
    print(f"  Error rates:      {ds_config['error_rates_method']}()")
    print(f"\nAugmentation Settings:")
    print(f"  Noise deviation (δ): {noise_status}")
    print(f"  False copies:        {false_copies_status}")
    print(f"  Cluster size:        {aug_config.min_cluster_size}-{aug_config.max_cluster_size}")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    output_filename = f"binned_synthetic_{dataset_name.lower()}.txt"
    output_path = os.path.join(output_dir, output_filename)
    print(f"\nOutput: {output_path}")
    print(f"Clusters to generate: {num_clusters:,}")
    print("="*70)

    # Initialize error rates and look up the setter named in the config
    errors_prob = ErrorRates()
    method_name = ds_config['error_rates_method']
    set_error_rates = getattr(errors_prob, method_name)

    if ds_config.get('uses_noise_coef', False):
        # This setter takes a noise-coefficient dict. Precedence: caller
        # override, then the dataset default, then neutral multipliers.
        noise_coef = custom_noise_coef or ds_config.get('noise_coef', {
            'del_mult': 1.0, 'ins_mult': 1.0, 'sub_mult': 1.0
        })
        # Pass a copy: the dict may be the shared DATASET_CONFIGS entry (or
        # the caller's object), and the setter is external code that is not
        # known to leave its argument untouched.
        noise_coef = dict(noise_coef)
        set_error_rates(noise_coef)
        print(f"  Applied noise coefficients: {noise_coef}")
    else:
        if custom_noise_coef:
            # Make the no-op visible instead of silently dropping the override.
            print(f"  Warning: custom_noise_coef ignored - "
                  f"{method_name}() takes no noise coefficients")
        set_error_rates()

    label_length = ds_config['label_length']
    delta = aug_config.generate_data_noise if use_noise_deviation else 0
    separator = "******************"

    # Generate data
    with open(output_path, 'w', encoding='utf-8') as f:
        for _ in tqdm(range(num_clusters), desc=f"Generating {dataset_name}"):
            # 1. Generate random label sequence
            label = create_random_label(label_length)

            # 2. Create a cluster generator for this label with the loaded
            #    error rates
            generator = ClusterGenerator(
                total_error_rates=errors_prob.general_errors,
                base_error_rates=errors_prob.per_base_errors,
                strand=label,
                min_copies=aug_config.min_cluster_size,
                max_copies=aug_config.max_cluster_size
            )

            # 3. Generate noisy reads. How delta perturbs the error rates is
            #    defined by ClusterGenerator (not visible here).
            generator.generate_cluster(delta=delta)
            # Copy so that in-place injection never touches the generator's list
            reads = list(generator.copies)

            # 4. Inject false copies to simulate clustering errors
            if use_false_copies:
                reads = inject_false_copies(reads, aug_config)

            # 5. Write the cluster in binned format with a single write:
            #    label, separator, reads (one per line), blank line.
            f.write("\n".join([label, separator, *reads]) + "\n\n")

    print(f"\n✅ Successfully generated {num_clusters:,} clusters for {dataset_name}")
    print(f"   Saved to: {output_path}\n")

    return output_path


def generate_all_datasets(
    num_clusters_per_dataset=1500000,
    output_dir="generated_data_corrected",
    datasets_to_generate=None,
    use_false_copies=True,
    use_noise_deviation=True
):
    """
    Generate synthetic data for ALL or selected datasets.

    Datasets are processed one after another. A failure in one dataset is
    reported and recorded as ``None``; the remaining datasets are still
    generated. This also applies to names missing from DATASET_CONFIGS.

    Parameters:
    -----------
    num_clusters_per_dataset : int
        Number of clusters to generate per dataset
    output_dir : str
        Directory to save generated data
    datasets_to_generate : list or str, optional
        Dataset names to generate. If None, generates all. A single name
        given as a plain string is treated as a one-element list.
    use_false_copies : bool
        Whether to inject false copies
    use_noise_deviation : bool
        Whether to vary error rates

    Returns:
    --------
    dict : Mapping of dataset names to output file paths (None on failure)
    """
    if datasets_to_generate is None:
        datasets_to_generate = list(DATASET_CONFIGS.keys())
    elif isinstance(datasets_to_generate, str):
        # A bare string would otherwise be iterated character by character.
        datasets_to_generate = [datasets_to_generate]
    else:
        # Materialize once: the names are iterated twice below, which would
        # exhaust a generator or other one-shot iterable.
        datasets_to_generate = list(datasets_to_generate)

    print("\n" + "="*70)
    print(" DNAformer Synthetic Data Generator - ALL CONFIGURATIONS")
    print("="*70)
    print(f"\nWill generate {num_clusters_per_dataset:,} clusters for each of:")
    for name in datasets_to_generate:
        # Do not index DATASET_CONFIGS directly here: an unknown name must be
        # reported as a failed dataset further down, not abort the batch.
        known_config = DATASET_CONFIGS.get(name)
        if known_config is None:
            description = "<unknown dataset>"
        else:
            description = known_config['description']
        print(f"  - {name}: {description}")
    print(f"\nOutput directory: {output_dir}")
    print("="*70)

    generated_files = {}

    for dataset_name in datasets_to_generate:
        try:
            output_path = generate_synthetic_dataset(
                dataset_name=dataset_name,
                num_clusters=num_clusters_per_dataset,
                output_dir=output_dir,
                use_false_copies=use_false_copies,
                use_noise_deviation=use_noise_deviation
            )
        except Exception as e:
            # Deliberate best-effort batch boundary: report the failure and
            # keep going with the next dataset.
            print(f"❌ Error generating {dataset_name}: {e}")
            generated_files[dataset_name] = None
        else:
            generated_files[dataset_name] = output_path

    # Print summary
    print("\n" + "="*70)
    print(" GENERATION COMPLETE - SUMMARY")
    print("="*70)
    for name, path in generated_files.items():
        status = "✅" if path else "❌"
        print(f"  {status} {name}: {path or 'FAILED'}")
    print("="*70)

    return generated_files


# ============================================================================
# COMPARISON TABLE
# ============================================================================

def print_all_configurations():
    """Print one left-aligned table row per entry of DATASET_CONFIGS."""
    rule_width = 100
    # Column layout shared by the header and every data row; the last
    # column (method name) is left unpadded.
    row_template = "{:<25} {:<7} {:<5} {:<5} {:<20} {:<20} {}"
    column_keys = (
        'label_length',
        'max_deviation',
        'index_length',
        'sequencing_tech',
        'synthesis_tech',
        'error_rates_method',
    )

    print("\n" + "=" * rule_width)
    print(" ALL DATASET CONFIGURATIONS")
    print("=" * rule_width)
    print(row_template.format(
        'Dataset', 'Label', 'Dev', 'Idx', 'Sequencing', 'Synthesis', 'Method'
    ))
    print("-" * rule_width)

    for dataset, settings in DATASET_CONFIGS.items():
        cells = [settings[key] for key in column_keys]
        print(row_template.format(dataset, *cells))
    print("=" * rule_width)


# ============================================================================
# MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":

    # Print all available configurations
    print_all_configurations()

    # ==== CONFIGURATION ====
    # Modify these parameters as needed

    # Clusters generated PER DATASET. Lower this for a quick test run.
    NUM_CLUSTERS = 1500000
    OUTPUT_DIR = "generated_data_corrected"

    # Choose which datasets to generate by assigning DATASETS; the value is
    # passed straight to generate_all_datasets() below.
    # Option 1: Generate ALL datasets
    DATASETS = None

    # Option 2: Generate only public benchmarks
    # DATASETS = ["Erlich", "Grass", "Organick", "Srinivasavaradhan"]

    # Option 3: Generate only DNAformer's own datasets
    # DATASETS = ["DNAformer_Illumina_Full", "DNAformer_Nanopore_Full"]

    # Option 4: Generate a single dataset
    # DATASETS = ["Erlich"]

    # ==== GENERATE ====

    generated = generate_all_datasets(
        num_clusters_per_dataset=NUM_CLUSTERS,
        output_dir=OUTPUT_DIR,
        datasets_to_generate=DATASETS,
        use_false_copies=True,      # Highly recommended
        use_noise_deviation=True,   # Highly recommended
    )

    # ==== USAGE SUMMARY ====

    print("\n" + "="*70)
    print(" NEXT STEPS")
    print("="*70)
    print("""
To use the generated data for training, update your training config:

For Erlich dataset:
    TRAIN_FILE = "generated_data_corrected/binned_synthetic_erlich.txt"
    LABEL_SEQ_LEN = 152
    MAX_READ_LEN = 162  # label_length + max_deviation

For Grass dataset:
    TRAIN_FILE = "generated_data_corrected/binned_synthetic_grass.txt"
    LABEL_SEQ_LEN = 117
    MAX_READ_LEN = 128  # 117 + 11

For Organick dataset:
    TRAIN_FILE = "generated_data_corrected/binned_synthetic_organick.txt"
    LABEL_SEQ_LEN = 110
    MAX_READ_LEN = 115  # 110 + 5

For DNAformer Nanopore:
    TRAIN_FILE = "generated_data_corrected/binned_synthetic_dnaformer_nanopore_full.txt"
    LABEL_SEQ_LEN = 140
    MAX_READ_LEN = 144  # 140 + 4
""")


 ALL DATASET CONFIGURATIONS
Dataset                   Label   Dev   Idx   Sequencing           Synthesis            Method
----------------------------------------------------------------------------------------------------
Erlich                    152     10    16    Ilumina miSeq        Twist Bioscience     set_EZ17_values
Grass                     117     11    13    Ilumina miSeq        CustomArray          set_G15_values
Organick                  110     5     33    Ilumina NextSeq      Twist Bioscience     set_O17_values
Srinivasavaradhan         110     5     4     MinION               Twist Bioscience     set_R21_values
DNAformer_Illumina_Full   140     4     12    Ilumina miSeq-0922   Twist Bioscience-0922 set_BOS22_values
DNAformer_Illumina_Pilot  140     4     12    Ilumina miSeq        Twist Bioscience     set_EZ17_values
DNAformer_Nanopore_Pilot  140     10    12    MinIONShort          Twist Bioscience     set_B22_values
DNAformer_Nanopore_Full   140     4     12    Nan

Generating Erlich: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500000/1500000 [1:19:42<00:00, 313.67it/s]



✅ Successfully generated 1,500,000 clusters for Erlich
   Saved to: generated_data_corrected/binned_synthetic_erlich.txt


GENERATING: Grass
Description: Grass et al. (2015) - CustomArray + Illumina miSeq

Dataset Parameters:
  Label length:     117 bp
  Max deviation:    11 bp
  Max read length:  128 bp
  Index length:     13 bp
  Sequencing:       Ilumina miSeq
  Synthesis:        CustomArray
  Error rates:      set_G15_values()

Augmentation Settings:
  Noise deviation (δ): 0.1 (enabled)
  False copies:        enabled (30% prob, max 2)
  Cluster size:        1-16

Output: generated_data_corrected/binned_synthetic_grass.txt
Clusters to generate: 1,500,000


Generating Grass: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500000/1500000 [1:04:11<00:00, 389.50it/s]



✅ Successfully generated 1,500,000 clusters for Grass
   Saved to: generated_data_corrected/binned_synthetic_grass.txt


GENERATING: Organick
Description: Organick et al. (2018) - Twist + Illumina NextSeq

Dataset Parameters:
  Label length:     110 bp
  Max deviation:    5 bp
  Max read length:  115 bp
  Index length:     33 bp
  Sequencing:       Ilumina NextSeq
  Synthesis:        Twist Bioscience
  Error rates:      set_O17_values()

Augmentation Settings:
  Noise deviation (δ): 0.1 (enabled)
  False copies:        enabled (30% prob, max 2)
  Cluster size:        1-16

Output: generated_data_corrected/binned_synthetic_organick.txt
Clusters to generate: 1,500,000


Generating Organick: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500000/1500000 [58:00<00:00, 430.98it/s]



✅ Successfully generated 1,500,000 clusters for Organick
   Saved to: generated_data_corrected/binned_synthetic_organick.txt


GENERATING: Srinivasavaradhan
Description: Srinivasavaradhan et al. (Pfitser) - Twist + MinION

Dataset Parameters:
  Label length:     110 bp
  Max deviation:    5 bp
  Max read length:  115 bp
  Index length:     4 bp
  Sequencing:       MinION
  Synthesis:        Twist Bioscience
  Error rates:      set_R21_values()

Augmentation Settings:
  Noise deviation (δ): 0.1 (enabled)
  False copies:        enabled (30% prob, max 2)
  Cluster size:        1-16

Output: generated_data_corrected/binned_synthetic_srinivasavaradhan.txt
Clusters to generate: 1,500,000


Generating Srinivasavaradhan: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500000/1500000 [1:09:28<00:00, 359.84it/s]



✅ Successfully generated 1,500,000 clusters for Srinivasavaradhan
   Saved to: generated_data_corrected/binned_synthetic_srinivasavaradhan.txt


GENERATING: DNAformer_Illumina_Full
Description: DNAformer Full Illumina (2022) - Twist + miSeq

Dataset Parameters:
  Label length:     140 bp
  Max deviation:    4 bp
  Max read length:  144 bp
  Index length:     12 bp
  Sequencing:       Ilumina miSeq-0922
  Synthesis:        Twist Bioscience-0922
  Error rates:      set_BOS22_values()

Augmentation Settings:
  Noise deviation (δ): 0.1 (enabled)
  False copies:        enabled (30% prob, max 2)
  Cluster size:        1-16

Output: generated_data_corrected/binned_synthetic_dnaformer_illumina_full.txt
Clusters to generate: 1,500,000


Generating DNAformer_Illumina_Full: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500000/1500000 [1:11:02<00:00, 351.90it/s]



✅ Successfully generated 1,500,000 clusters for DNAformer_Illumina_Full
   Saved to: generated_data_corrected/binned_synthetic_dnaformer_illumina_full.txt


GENERATING: DNAformer_Illumina_Pilot
Description: DNAformer Pilot Illumina - Twist + miSeq

Dataset Parameters:
  Label length:     140 bp
  Max deviation:    4 bp
  Max read length:  144 bp
  Index length:     12 bp
  Sequencing:       Ilumina miSeq
  Synthesis:        Twist Bioscience
  Error rates:      set_EZ17_values()

Augmentation Settings:
  Noise deviation (δ): 0.1 (enabled)
  False copies:        enabled (30% prob, max 2)
  Cluster size:        1-16

Output: generated_data_corrected/binned_synthetic_dnaformer_illumina_pilot.txt
Clusters to generate: 1,500,000


Generating DNAformer_Illumina_Pilot: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500000/1500000 [1:12:18<00:00, 345.74it/s]



✅ Successfully generated 1,500,000 clusters for DNAformer_Illumina_Pilot
   Saved to: generated_data_corrected/binned_synthetic_dnaformer_illumina_pilot.txt


GENERATING: DNAformer_Nanopore_Pilot
Description: DNAformer Pilot Nanopore - Twist + MinION Short

Dataset Parameters:
  Label length:     140 bp
  Max deviation:    10 bp
  Max read length:  150 bp
  Index length:     12 bp
  Sequencing:       MinIONShort
  Synthesis:        Twist Bioscience
  Error rates:      set_B22_values()

Augmentation Settings:
  Noise deviation (δ): 0.1 (enabled)
  False copies:        enabled (30% prob, max 2)
  Cluster size:        1-16

Output: generated_data_corrected/binned_synthetic_dnaformer_nanopore_pilot.txt
Clusters to generate: 1,500,000


Generating DNAformer_Nanopore_Pilot: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500000/1500000 [1:23:23<00:00, 299.78it/s]



✅ Successfully generated 1,500,000 clusters for DNAformer_Nanopore_Pilot
   Saved to: generated_data_corrected/binned_synthetic_dnaformer_nanopore_pilot.txt


GENERATING: DNAformer_Nanopore_Full
Description: DNAformer Full Nanopore - Twist + MinION (with multipliers)

Dataset Parameters:
  Label length:     140 bp
  Max deviation:    4 bp
  Max read length:  144 bp
  Index length:     12 bp
  Sequencing:       Nanopore_pilot_v2_multi
  Synthesis:        Twist Bioscience_nanopore_pilot_v2_multi
  Error rates:      set_BOS22PILOTOMER_values_multi()

Augmentation Settings:
  Noise deviation (δ): 0.1 (enabled)
  False copies:        enabled (30% prob, max 2)
  Cluster size:        1-16

Output: generated_data_corrected/binned_synthetic_dnaformer_nanopore_full.txt
Clusters to generate: 1,500,000
  Applied noise coefficients: {'del_mult': 1.25, 'ins_mult': 1.25, 'sub_mult': 1.25}


Generating DNAformer_Nanopore_Full: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500000/1500000 [1:29:19<00:00, 279.88it/s]


✅ Successfully generated 1,500,000 clusters for DNAformer_Nanopore_Full
   Saved to: generated_data_corrected/binned_synthetic_dnaformer_nanopore_full.txt


 GENERATION COMPLETE - SUMMARY
  ✅ Erlich: generated_data_corrected/binned_synthetic_erlich.txt
  ✅ Grass: generated_data_corrected/binned_synthetic_grass.txt
  ✅ Organick: generated_data_corrected/binned_synthetic_organick.txt
  ✅ Srinivasavaradhan: generated_data_corrected/binned_synthetic_srinivasavaradhan.txt
  ✅ DNAformer_Illumina_Full: generated_data_corrected/binned_synthetic_dnaformer_illumina_full.txt
  ✅ DNAformer_Illumina_Pilot: generated_data_corrected/binned_synthetic_dnaformer_illumina_pilot.txt
  ✅ DNAformer_Nanopore_Pilot: generated_data_corrected/binned_synthetic_dnaformer_nanopore_pilot.txt
  ✅ DNAformer_Nanopore_Full: generated_data_corrected/binned_synthetic_dnaformer_nanopore_full.txt

 NEXT STEPS

To use the generated data for training, update your training config:

For Erlich dataset:
    TRAIN_FILE = "gener


