In [13]:
import math
import time
from Bio.Seq import Seq
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
import numpy as np
import sys

sys.path.insert(0, '..')  
from src.config import KNOWN_RBS_MOTIFS, START_CODONS, STOP_CODONS,  MIN_ORF_LENGTH, LENGTH_REFERENCE_BP
from src.config import SCORE_WEIGHTS, START_CODON_WEIGHTS, START_SELECTION_WEIGHTS, FIRST_FILTER_THRESHOLD, SECOND_FILTER_THRESHOLD
from functools import lru_cache


# ORIGINAL CODE FROM VERSION 1.0

In [14]:

def find_purine_rich_regions(
    sequence: str, 
    min_length: int = 4, 
    min_purine_content: float = 0.6
) -> List[Dict]:
    """Find purine-rich regions (A and G rich) in sequence."""
    purine_regions = []
    
    for start in range(len(sequence)):
        for length in range(min_length, min(9, len(sequence) - start + 1)):
            subseq = sequence[start:start + length]
            
            purines = subseq.count('A') + subseq.count('G')
            purine_fraction = purines / length
            
            if purine_fraction >= min_purine_content:
                purine_regions.append({
                    'sequence': subseq,
                    'start': start,
                    'end': start + length,
                    'purine_content': purine_fraction,
                    'length': length
                })
    
    return purine_regions


def evaluate_spacing_score(spacing: int) -> float:
    """Evaluate spacing between RBS and start codon (optimal: 6-10 nt)."""
    if 6 <= spacing <= 8:
        return 3.0  # Optimal
    elif 5 <= spacing <= 10:
        return 2.5  # Very good
    elif 4 <= spacing <= 12:
        return 1.5  # Good
    elif 3 <= spacing <= 14:
        return 1.0  # Acceptable
    else:
        return 0.2  # Poor


def score_motif_similarity(sequence: str) -> Tuple[float, str]:
    """Score sequence similarity to known RBS motifs."""
    best_score = 0.0
    best_motif = None
    
    for motif in KNOWN_RBS_MOTIFS:
        for offset in range(max(len(sequence), len(motif))):
            matches = 0
            total_positions = 0
            
            for i in range(len(sequence)):
                motif_pos = i + offset
                if 0 <= motif_pos < len(motif):
                    total_positions += 1
                    if sequence[i] == motif[motif_pos]:
                        matches += 1
            
            if total_positions > 0:
                similarity = matches / total_positions
                
                overlap_length = total_positions
                motif_weight = len(motif) / 6.0  # AGGAGG gets weight 1.0
                
                score = similarity * overlap_length * motif_weight
                
                if score > best_score:
                    best_score = score
                    best_motif = motif
    
    return best_score, best_motif


def predict_rbs_simple(sequence: str, orf: Dict, upstream_length: int = 20) -> Dict:
    """Predict RBS using purine content, spacing, and motif similarity."""
    start_pos = orf['start']
    
    if start_pos < upstream_length:
        return {
            'rbs_score': -5.0,
            'spacing_score': 0.0,
            'motif_score': 0.0,
            'best_sequence': None,
            'best_motif': None,
            'spacing': 0,
            'position': 0
        }

    upstream_start = start_pos - upstream_length
    upstream_seq = sequence[upstream_start:start_pos]
    
    purine_regions = find_purine_rich_regions(upstream_seq, min_length=4, min_purine_content=0.6)
    
    best_score = -5.0
    best_prediction = None
    
    for region in purine_regions:
        sd_candidate = region['sequence']
        spacing = len(upstream_seq) - region['end']
        
        if spacing < 4 or spacing > 12:
            continue
        
        spacing_score = evaluate_spacing_score(spacing)
        motif_score, best_motif = score_motif_similarity(sd_candidate)
        purine_bonus = (region['purine_content'] - 0.6) * 2.0
        

        combined_score = (
            spacing_score * 2.0 +    
            motif_score * 1.5 +      
            purine_bonus             
        )
        
        if combined_score > best_score:
            best_score = combined_score
            best_prediction = {
                'rbs_score': combined_score,
                'spacing_score': spacing_score,
                'motif_score': motif_score,
                'best_sequence': sd_candidate,
                'best_motif': best_motif,
                'spacing': spacing,
                'position': region['start'],
                'purine_content': region['purine_content'],
                'length': region['length']
            }
    
    return best_prediction or {
        'rbs_score': -5.0,
        'spacing_score': 0.0,
        'motif_score': 0.0,
        'best_sequence': None,
        'best_motif': None,
        'spacing': 0,
        'position': 0
    }


# =============================================================================
# ORF DETECTION
# =============================================================================

def find_orfs_candidates(sequence: str, min_length: int = 100) -> List[Dict]:
    """Detect all ORF candidates with dual coordinates and RBS scores."""
    orfs = []
    
    sequences = [
        ('forward', sequence),
        ('reverse', str(Seq(sequence).reverse_complement()))
    ]
    seq_len = len(sequence)

    print("Detecting ORFs and calculating RBS...")

    for strand_name, seq in sequences:
        for frame in range(3):
            active_starts = [] 
            
            for i in range(frame, len(seq) - 2, 3):
                codon = seq[i:i+3]
                
                if len(codon) != 3:
                    break
                
                if codon in START_CODONS:
                    active_starts.append((i, codon))
                    
                elif codon in STOP_CODONS and active_starts:
                    for start_pos, start_codon in active_starts:
                        orf_length = i + 3 - start_pos
                        
                        if orf_length >= min_length:
                            # Create ORF with dual coordinates
                            if strand_name == 'forward':
                                orf = {
                                    'start': start_pos + 1,
                                    'end': i + 3,
                                    'genome_start': start_pos + 1,
                                    'genome_end': i + 3,
                                    'length': orf_length,
                                    'frame': frame,
                                    'strand': 'forward',
                                    'start_codon': start_codon,
                                    'sequence': seq[start_pos:i+3]
                                }
                            else:  # reverse strand
                                orf = {
                                    'start': start_pos + 1,
                                    'end': i + 3,
                                    'genome_start': seq_len - (i + 3) + 1,
                                    'genome_end': seq_len - start_pos,
                                    'length': orf_length,
                                    'frame': frame,
                                    'strand': 'reverse',
                                    'start_codon': start_codon,
                                    'sequence': seq[start_pos:i+3]
                                }
                            
                            # Calculate RBS for this ORF
                            rbs_result = predict_rbs_simple(seq, orf, upstream_length=20)
                            orf['rbs_score'] = rbs_result['rbs_score']
                            orf['rbs_motif'] = rbs_result.get('best_motif')
                            orf['rbs_spacing'] = rbs_result.get('spacing', 0)
                            orf['rbs_sequence'] = rbs_result.get('best_sequence')
                            
                            orfs.append(orf)
                    
                    active_starts = []
    
    print(f"Complete: {len(orfs):,} ORFs detected with RBS scores")
    return orfs


# NEW CODE

In [15]:
from functools import lru_cache

# =============================================================================
# RBS (RIBOSOME BINDING SITE) PREDICTION
# =============================================================================

def find_purine_rich_regions_new(
    sequence: str, 
    min_length: int = 4, 
    min_purine_content: float = 0.6
) -> List[Dict]:
    """Find purine-rich regions using sliding window optimization."""
    purine_regions = []
    seq_len = len(sequence)
    
    if seq_len < min_length:
        return purine_regions
    
    is_purine = [1 if base in 'AG' else 0 for base in sequence]
    
    for start in range(seq_len):
        max_length = min(9, seq_len - start + 1)
        
        if max_length > min_length:
            purine_count = sum(is_purine[start:start + min_length])
            
            length = min_length
            if length <= seq_len - start:
                purine_fraction = purine_count / length
                if purine_fraction >= min_purine_content:
                    purine_regions.append({
                        'sequence': sequence[start:start + length],
                        'start': start,
                        'end': start + length,
                        'purine_content': purine_fraction,
                        'length': length
                    })
            
            for length in range(min_length + 1, max_length):
                if start + length > seq_len:
                    break
                
                purine_count += is_purine[start + length - 1]
                
                purine_fraction = purine_count / length
                if purine_fraction >= min_purine_content:
                    purine_regions.append({
                        'sequence': sequence[start:start + length],
                        'start': start,
                        'end': start + length,
                        'purine_content': purine_fraction,
                        'length': length
                    })
    
    return purine_regions


@lru_cache(maxsize=10000)
def score_motif_similarity_new(sequence: str) -> Tuple[float, str]:
    """Score sequence similarity to known RBS motifs."""
    best_score = 0.0
    best_motif = None
    
    for motif in KNOWN_RBS_MOTIFS:
        for offset in range(max(len(sequence), len(motif))):
            matches = 0
            total_positions = 0
            
            for i in range(len(sequence)):
                motif_pos = i + offset
                if 0 <= motif_pos < len(motif):
                    total_positions += 1
                    if sequence[i] == motif[motif_pos]:
                        matches += 1
            
            if total_positions > 0:
                similarity = matches / total_positions
                
                overlap_length = total_positions
                motif_weight = len(motif) / 6.0  
                
                score = similarity * overlap_length * motif_weight
                
                if score > best_score:
                    best_score = score
                    best_motif = motif
    
    return best_score, best_motif


def predict_rbs_simple_new(sequence: str, orf: Dict, upstream_length: int = 20) -> Dict:
    """Predict RBS using purine content, spacing, and motif similarity."""
    start_pos = orf['start']
    
    if start_pos < upstream_length:
        return {
            'rbs_score': -5.0,
            'spacing_score': 0.0,
            'motif_score': 0.0,
            'best_sequence': None,
            'best_motif': None,
            'spacing': 0,
            'position': 0
        }

    upstream_start = start_pos - upstream_length
    upstream_seq = sequence[upstream_start:start_pos]
    
    purine_regions = find_purine_rich_regions_new(upstream_seq, min_length=4, min_purine_content=0.6)
    
    best_score = -5.0
    best_prediction = None
    
    for region in purine_regions:
        sd_candidate = region['sequence']
        spacing = len(upstream_seq) - region['end']
        
        if spacing < 4 or spacing > 12:
            continue
        elif 6 <= spacing <= 8:
            spacing_score= 3.0  # Optimal
        elif 5 <= spacing <= 10:
            spacing_score= 2.5  # good
        elif 4 <= spacing <= 12:
            spacing_score= 1.5  # ok
        
        motif_score, best_motif = score_motif_similarity_new(sd_candidate)
        purine_bonus = (region['purine_content'] - 0.6) * 2.0
        

        combined_score = (
            spacing_score * 2.0 +    
            motif_score * 1.5 +      
            purine_bonus             
        )
        
        if combined_score > best_score:
            best_score = combined_score
            best_prediction = {
                'rbs_score': combined_score,
                'spacing_score': spacing_score,
                'motif_score': motif_score,
                'best_sequence': sd_candidate,
                'best_motif': best_motif,
                'spacing': spacing,
                'position': region['start'],
                'purine_content': region['purine_content'],
                'length': region['length']
            }
    
    return best_prediction or {
        'rbs_score': -5.0,
        'spacing_score': 0.0,
        'motif_score': 0.0,
        'best_sequence': None,
        'best_motif': None,
        'spacing': 0,
        'position': 0
    }


# =============================================================================
# ORF DETECTION
# =============================================================================

def find_orfs_candidates_new(sequence: str, min_length: int = 100) -> List[Dict]:
    """Detect all ORF candidates with dual coordinates and RBS scores."""
    
    if hasattr(score_motif_similarity_new, 'cache_clear'):
        score_motif_similarity_new.cache_clear()
    
    orfs = []
    
    reverse_seq = str(Seq(sequence).reverse_complement())
    
    sequences = [
        ('forward', sequence),
        ('reverse', reverse_seq)
    ]
    seq_len = len(sequence)

    print("Detecting ORFs and calculating RBS...")

    for strand_name, seq in sequences:
        for frame in range(3):
            active_starts = [] 
            
            for i in range(frame, len(seq) - 2, 3):
                codon = seq[i:i+3]
                
                if len(codon) != 3:
                    break
                
                if codon in START_CODONS:
                    active_starts.append((i, codon))
                    
                elif codon in STOP_CODONS and active_starts:
                    for start_pos, start_codon in active_starts:
                        orf_length = i + 3 - start_pos
                        
                        if orf_length >= min_length:
                            # Create ORF with dual coordinates
                            if strand_name == 'forward':
                                orf = {
                                    'start': start_pos + 1,
                                    'end': i + 3,
                                    'genome_start': start_pos + 1,
                                    'genome_end': i + 3,
                                    'length': orf_length,
                                    'frame': frame,
                                    'strand': 'forward',
                                    'start_codon': start_codon,
                                    'sequence': seq[start_pos:i+3]
                                }
                            else:  # reverse strand
                                orf = {
                                    'start': start_pos + 1,
                                    'end': i + 3,
                                    'genome_start': seq_len - (i + 3) + 1,
                                    'genome_end': seq_len - start_pos,
                                    'length': orf_length,
                                    'frame': frame,
                                    'strand': 'reverse',
                                    'start_codon': start_codon,
                                    'sequence': seq[start_pos:i+3]
                                }
                            
                            # Calculate RBS for this ORF
                            rbs_result = predict_rbs_simple_new(seq, orf, upstream_length=20)
                            orf['rbs_score'] = rbs_result['rbs_score']
                            orf['rbs_motif'] = rbs_result.get('best_motif')
                            orf['rbs_spacing'] = rbs_result.get('spacing', 0)
                            orf['rbs_sequence'] = rbs_result.get('best_sequence')
                            
                            orfs.append(orf)
                    
                    active_starts = []
    
    print(f"Complete: {len(orfs):,} ORFs detected with RBS scores")
    return orfs

# ============================================================================
# SIMPLE COMPARISON: Original vs Optimized
# ============================================================================


In [None]:
# ============================================================================
# MULTI-GENOME OPTIMIZATION TEST (Old vs New Comparison)
# ============================================================================

from Bio import Entrez, SeqIO
import time
import gc
from pathlib import Path

# Import config
from src.config import TEST_GENOMES

# Import OPTIMIZED version from production
from src.traditional_methods import find_orfs_candidates

# The OLD version (find_orfs_candidates_new) should already be defined in your notebook
# Make sure you have find_orfs_candidates_new with the OLD (unoptimized) code

# CRITICAL: Set your email for NCBI
Entrez.email = "your.email@example.com"  # CHANGE THIS!

print("="*80)
print("OPTIMIZATION COMPARISON TEST")
print("="*80)
print(f"Testing {len(TEST_GENOMES)} genomes")
print("Comparing: OLD (baseline) vs NEW (optimized)\n")

# Ensure data directory exists
data_dir = Path('data/full_dataset')
data_dir.mkdir(parents=True, exist_ok=True)

results = []

for genome_id in TEST_GENOMES:
    fasta_path = data_dir / f'{genome_id}.fasta'
    
    print(f"\n{'='*80}")
    print(f"Genome: {genome_id}")
    print("="*80)
    
    # Download if not exists
    if not fasta_path.exists():
        print(f"Downloading {genome_id} from NCBI...")
        try:
            handle = Entrez.efetch(
                db="nucleotide",
                id=genome_id,
                rettype="fasta",
                retmode="text"
            )
            
            with open(fasta_path, 'w') as f:
                fasta_content = handle.read()
                f.write(fasta_content)
            handle.close()
            
            file_size = fasta_path.stat().st_size
            print(f"  Downloaded: {file_size:,} bytes")
            
            if file_size < 100:
                print(f"  [ERROR] File too small, download may have failed")
                results.append({
                    'genome': genome_id,
                    'size': 0,
                    'time_old': 0,
                    'time_new': 0,
                    'orfs_old': 0,
                    'orfs_new': 0,
                    'match': False,
                    'status': 'DOWNLOAD FAILED'
                })
                continue
                
        except Exception as e:
            print(f"  [ERROR] Download failed: {e}")
            results.append({
                'genome': genome_id,
                'size': 0,
                'time_old': 0,
                'time_new': 0,
                'orfs_old': 0,
                'orfs_new': 0,
                'match': False,
                'status': f'DOWNLOAD ERROR'
            })
            continue
    else:
        print(f"Using cached: {fasta_path}")
    
    try:
        # Load genome
        record = SeqIO.read(fasta_path, "fasta")
        sequence = str(record.seq)
        print(f"Sequence: {len(sequence):,} bp")
        
        # Test OLD (baseline) version from notebook
        print("\n[1/2] Testing OLD (baseline)...", end=" ", flush=True)
        gc.collect()
        start = time.perf_counter()
        orfs_old = find_orfs_candidates(sequence, min_length=100)
        time_old = time.perf_counter() - start
        print(f"{time_old:.2f}s - {len(orfs_old):,} ORFs")
        
        # Test NEW (optimized) version from production
        print("[2/2] Testing NEW (optimized)...", end=" ", flush=True)
        gc.collect()
        start = time.perf_counter()
        orfs_new = find_orfs_candidates_new(sequence, min_length=100)
        time_new = time.perf_counter() - start
        print(f"{time_new:.2f}s - {len(orfs_new):,} ORFs")
        
        # Verify results match
        orfs_match = len(orfs_old) == len(orfs_new)
        
        # Check RBS scores match (sample first 100)
        rbs_match = True
        if orfs_match:
            sample_size = min(100, len(orfs_old))
            for i in range(sample_size):
                if orfs_old[i].get('rbs_score') != orfs_new[i].get('rbs_score'):
                    rbs_match = False
                    break
        
        match = orfs_match and rbs_match
        
        # Calculate improvement
        saved = time_old - time_new
        percent = (saved / time_old * 100) if time_old > 0 else 0
        
        print(f"\nResults match: {'YES' if match else 'NO'}")
        print(f"Time saved: {saved:.2f}s ({percent:.1f}%)")
        
        # Cache stats
        try:
            from src.traditional_methods import score_motif_similarity
            if hasattr(score_motif_similarity, 'cache_info'):
                cache_info = score_motif_similarity.cache_info()
                if cache_info.hits + cache_info.misses > 0:
                    hit_rate = cache_info.hits / (cache_info.hits + cache_info.misses) * 100
                    print(f"Cache hit rate: {hit_rate:.1f}%")
        except:
            pass
        
        results.append({
            'genome': genome_id,
            'size': len(sequence),
            'time_old': time_old,
            'time_new': time_new,
            'orfs_old': len(orfs_old),
            'orfs_new': len(orfs_new),
            'saved': saved,
            'percent': percent,
            'match': match,
            'status': 'SUCCESS' if match else 'MISMATCH'
        })
        
    except Exception as e:
        print(f"  [ERROR] Processing failed: {e}")
        import traceback
        traceback.print_exc()
        results.append({
            'genome': genome_id,
            'size': 0,
            'time_old': 0,
            'time_new': 0,
            'orfs_old': 0,
            'orfs_new': 0,
            'saved': 0,
            'percent': 0,
            'match': False,
            'status': 'PROCESSING ERROR'
        })

# ============================================================================
# SUMMARY
# ============================================================================

print(f"\n{'='*80}")
print("SUMMARY")
print("="*80)

print(f"\n{'Genome':<20} {'Size (bp)':<12} {'Old (s)':<10} {'New (s)':<10} {'Saved':<10} {'%':<8} {'Match':<8} {'Status'}")
print("-"*110)

for r in results:
    size_str = f"{r['size']:,}" if r['size'] > 0 else "-"
    old_str = f"{r['time_old']:.2f}" if r['time_old'] > 0 else "-"
    new_str = f"{r['time_new']:.2f}" if r['time_new'] > 0 else "-"
    saved_str = f"{r['saved']:.2f}" if r.get('saved', 0) != 0 else "-"
    percent_str = f"{r['percent']:.1f}%" if r.get('percent', 0) != 0 else "-"
    match_str = "✓" if r['match'] else "✗"
    
    status_display = "✓ OK" if r['status'] == 'SUCCESS' else f"✗ {r['status']}"
    print(f"{r['genome']:<20} {size_str:>11} {old_str:>9} {new_str:>9} {saved_str:>9} {percent_str:>7} {match_str:^8} {status_display}")

# Calculate statistics for successful runs
successful = [r for r in results if r['status'] == 'SUCCESS']
failed = [r for r in results if r['status'] != 'SUCCESS']

if successful:
    total_time_old = sum(r['time_old'] for r in successful)
    total_time_new = sum(r['time_new'] for r in successful)
    total_saved = sum(r['saved'] for r in successful)
    avg_percent = sum(r['percent'] for r in successful) / len(successful)
    
    print(f"\nStatistics ({len(successful)} successful):")
    print(f"  Total time OLD: {total_time_old:.2f}s")
    print(f"  Total time NEW: {total_time_new:.2f}s")
    print(f"  Total saved: {total_saved:.2f}s")
    print(f"  Average improvement: {avg_percent:.1f}%")

print(f"\n{'='*80}")
print("VERDICT")
print("="*80)

if failed:
    print(f"\n[WARNING] {len(failed)} genome(s) failed:")
    for r in failed:
        print(f"  - {r['genome']}: {r['status']}")

# Check if all successful runs have matching results
all_match = all(r['match'] for r in successful)

if len(successful) == len(TEST_GENOMES) and all_match:
    print("\n✓ All test genomes processed successfully")
    print("✓ All results identical between old and new")
    print("✓ Optimizations working correctly across all genomes")
    total_saved = sum(r['saved'] for r in successful)
    avg_improvement = sum(r['percent'] for r in successful) / len(successful)
    print(f"✓ Performance: {avg_improvement:.1f}% faster on average ({total_saved:.1f}s saved total)")
    print("\n[READY TO MERGE]")
elif len(successful) > 0 and all_match:
    print(f"\n[MOSTLY SUCCESS] {len(successful)}/{len(TEST_GENOMES)} genomes passed")
    print("✓ All successful runs have matching results")
    print("Some failures may be download issues - review above")
    print("\n[LIKELY READY TO MERGE - review failures]")
elif not all_match:
    print("\n✗ RESULTS DON'T MATCH - DO NOT MERGE")
    print("Old and new versions produce different results!")
    mismatched = [r for r in successful if not r['match']]
    for r in mismatched:
        print(f"  - {r['genome']}: {r['orfs_old']} vs {r['orfs_new']} ORFs")
else:
    print("\n✗ All genomes failed - DO NOT MERGE")
    print("Fix errors before proceeding")

print("="*80)

OPTIMIZATION COMPARISON TEST
Testing 15 genomes
Comparing: OLD (baseline) vs NEW (optimized)


Genome: NC_000913.3
Using cached: data\full_dataset\NC_000913.3.fasta
Sequence: 4,641,652 bp
[1/2] Testing NEW (optimized)... Detecting ORFs and calculating RBS...
Complete: 176,315 ORFs detected with RBS scores
24.60s - 176,315 ORFs

[2/2] Testing OLD (baseline)... Detecting ORFs and calculating RBS...
Complete: 176,315 ORFs detected with RBS scores
90.53s - 176,315 ORFs

Results match: YES
Time saved: -65.93s (-268.0%)

Genome: NC_000964.3
Using cached: data\full_dataset\NC_000964.3.fasta
Sequence: 4,215,606 bp
[1/2] Testing NEW (optimized)... Detecting ORFs and calculating RBS...
Complete: 139,046 ORFs detected with RBS scores
15.50s - 139,046 ORFs

[2/2] Testing OLD (baseline)... Detecting ORFs and calculating RBS...
Complete: 139,046 ORFs detected with RBS scores
50.94s - 139,046 ORFs

Results match: YES
Time saved: -35.44s (-228.6%)

Genome: NC_003197.2
Downloading NC_003197.2 from NCBI