In [None]:
import math
import time
from Bio.Seq import Seq
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
import numpy as np
from src.config import KNOWN_RBS_MOTIFS, START_CODONS, STOP_CODONS,  MIN_ORF_LENGTH, LENGTH_REFERENCE_BP
from src.config import SCORE_WEIGHTS, START_CODON_WEIGHTS, START_SELECTION_WEIGHTS, FIRST_FILTER_THRESHOLD, SECOND_FILTER_THRESHOLD
from src.traditional_methods import find_orfs_candidates
from functools import lru_cache


In [None]:

# =============================================================================
# RBS (RIBOSOME BINDING SITE) PREDICTION
# =============================================================================

def find_purine_rich_regions_new(
    sequence: str, 
    min_length: int = 4, 
    min_purine_content: float = 0.6
) -> List[Dict]:
    """Find purine-rich regions (A and G rich) in sequence."""
    purine_regions = []
    
    for start in range(len(sequence)):
        for length in range(min_length, min(9, len(sequence) - start + 1)):
            subseq = sequence[start:start + length]
            
            purines = subseq.count('A') + subseq.count('G')
            purine_fraction = purines / length
            
            if purine_fraction >= min_purine_content:
                purine_regions.append({
                    'sequence': subseq,
                    'start': start,
                    'end': start + length,
                    'purine_content': purine_fraction,
                    'length': length
                })
    
    return purine_regions


def evaluate_spacing_score_new(spacing: int) -> float:
    """Evaluate spacing between RBS and start codon (optimal: 6-10 nt)."""
    if 6 <= spacing <= 8:
        return 3.0  # Optimal
    elif 5 <= spacing <= 10:
        return 2.5  # Very good
    elif 4 <= spacing <= 12:
        return 1.5  # Good
    elif 3 <= spacing <= 14:
        return 1.0  # Acceptable
    else:
        return 0.2  # Poor


def score_motif_similarity_new(sequence: str) -> Tuple[float, str]:
    """Score sequence similarity to known RBS motifs."""
    best_score = 0.0
    best_motif = None
    
    for motif in KNOWN_RBS_MOTIFS:
        for offset in range(max(len(sequence), len(motif))):
            matches = 0
            total_positions = 0
            
            for i in range(len(sequence)):
                motif_pos = i + offset
                if 0 <= motif_pos < len(motif):
                    total_positions += 1
                    if sequence[i] == motif[motif_pos]:
                        matches += 1
            
            if total_positions > 0:
                similarity = matches / total_positions
                
                overlap_length = total_positions
                motif_weight = len(motif) / 6.0  # AGGAGG gets weight 1.0
                
                score = similarity * overlap_length * motif_weight
                
                if score > best_score:
                    best_score = score
                    best_motif = motif
    
    return best_score, best_motif


def predict_rbs_simple_new(sequence: str, orf: Dict, upstream_length: int = 20) -> Dict:
    """Predict RBS using purine content, spacing, and motif similarity."""
    start_pos = orf['start']
    
    if start_pos < upstream_length:
        return {
            'rbs_score': -5.0,
            'spacing_score': 0.0,
            'motif_score': 0.0,
            'best_sequence': None,
            'best_motif': None,
            'spacing': 0,
            'position': 0
        }

    upstream_start = start_pos - upstream_length
    upstream_seq = sequence[upstream_start:start_pos]
    
    purine_regions = find_purine_rich_regions_new(upstream_seq, min_length=4, min_purine_content=0.6)
    
    best_score = -5.0
    best_prediction = None
    
    for region in purine_regions:
        sd_candidate = region['sequence']
        spacing = len(upstream_seq) - region['end']
        
        if spacing < 4 or spacing > 12:
            continue
        elif 6 <= spacing <= 8:
            spacing_score= 3.0  # Optimal
        elif 5 <= spacing <= 10:
            spacing_score= 2.5  # Very good
        elif 4 <= spacing <= 12:
            spacing_score= 1.5  # Good
        
        motif_score, best_motif = score_motif_similarity_new(sd_candidate)
        purine_bonus = (region['purine_content'] - 0.6) * 2.0
        

        combined_score = (
            spacing_score * 2.0 +    
            motif_score * 1.5 +      
            purine_bonus             
        )
        
        if combined_score > best_score:
            best_score = combined_score
            best_prediction = {
                'rbs_score': combined_score,
                'spacing_score': spacing_score,
                'motif_score': motif_score,
                'best_sequence': sd_candidate,
                'best_motif': best_motif,
                'spacing': spacing,
                'position': region['start'],
                'purine_content': region['purine_content'],
                'length': region['length']
            }
    
    return best_prediction or {
        'rbs_score': -5.0,
        'spacing_score': 0.0,
        'motif_score': 0.0,
        'best_sequence': None,
        'best_motif': None,
        'spacing': 0,
        'position': 0
    }


# =============================================================================
# ORF DETECTION
# =============================================================================

def find_orfs_candidates_new(sequence: str, min_length: int = 100) -> List[Dict]:
    """Detect all ORF candidates with dual coordinates and RBS scores."""
    orfs = []
    
    sequences = [
        ('forward', sequence),
        ('reverse', str(Seq(sequence).reverse_complement()))
    ]
    seq_len = len(sequence)

    print("Detecting ORFs and calculating RBS...")

    for strand_name, seq in sequences:
        for frame in range(3):
            active_starts = [] 
            
            for i in range(frame, len(seq) - 2, 3):
                codon = seq[i:i+3]
                
                if len(codon) != 3:
                    break
                
                if codon in START_CODONS:
                    active_starts.append((i, codon))
                    
                elif codon in STOP_CODONS and active_starts:
                    for start_pos, start_codon in active_starts:
                        orf_length = i + 3 - start_pos
                        
                        if orf_length >= min_length:
                            # Create ORF with dual coordinates
                            if strand_name == 'forward':
                                orf = {
                                    'start': start_pos + 1,
                                    'end': i + 3,
                                    'genome_start': start_pos + 1,
                                    'genome_end': i + 3,
                                    'length': orf_length,
                                    'frame': frame,
                                    'strand': 'forward',
                                    'start_codon': start_codon,
                                    'sequence': seq[start_pos:i+3]
                                }
                            else:  # reverse strand
                                orf = {
                                    'start': start_pos + 1,
                                    'end': i + 3,
                                    'genome_start': seq_len - (i + 3) + 1,
                                    'genome_end': seq_len - start_pos,
                                    'length': orf_length,
                                    'frame': frame,
                                    'strand': 'reverse',
                                    'start_codon': start_codon,
                                    'sequence': seq[start_pos:i+3]
                                }
                            
                            # Calculate RBS for this ORF
                            rbs_result = predict_rbs_simple_new(seq, orf, upstream_length=20)
                            orf['rbs_score'] = rbs_result['rbs_score']
                            orf['rbs_motif'] = rbs_result.get('best_motif')
                            orf['rbs_spacing'] = rbs_result.get('spacing', 0)
                            orf['rbs_sequence'] = rbs_result.get('best_sequence')
                            
                            orfs.append(orf)
                    
                    active_starts = []
    
    print(f"Complete: {len(orfs):,} ORFs detected with RBS scores")
    return orfs


In [1]:
# ============================================================================
# NOTEBOOK TEST CELL: Compare Original vs Optimized ORF Detection
# ============================================================================
# Copy this into your Jupyter notebook to test optimizations

from Bio import Entrez, SeqIO
from pathlib import Path
import time

# Configuration
EMAIL = "your.email@example.com"  # CHANGE THIS
ACCESSION = "NC_000913.3"  # E. coli K-12 MG1655

# ============================================================================
# STEP 1: Download E. coli genome (only runs once)
# ============================================================================

data_dir = Path('test_data')
data_dir.mkdir(exist_ok=True)
fasta_path = data_dir / f'{ACCESSION}.fasta'

if not fasta_path.exists():
    print("Downloading E. coli genome...")
    Entrez.email = EMAIL
    handle = Entrez.efetch(db="nucleotide", id=ACCESSION, rettype="fasta", retmode="text")
    with open(fasta_path, 'w') as f:
        f.write(handle.read())
    handle.close()
    print(f"Downloaded: {fasta_path}")
else:
    print(f"Using cached genome: {fasta_path}")

# Load sequence
record = SeqIO.read(fasta_path, "fasta")
sequence = str(record.seq)
print(f"Sequence: {len(sequence):,} bp\n")

# ============================================================================
# STEP 2: Run both methods
# ============================================================================

print("="*80)
print("COMPARING: find_orfs_candidates() vs find_orfs_candidates_new()")
print("="*80)

# Original method
print("\n[1/2] Running ORIGINAL method...")
start = time.time()
orfs_original = find_orfs_candidates(sequence, min_length=100)
time_original = time.time() - start
print(f"      {len(orfs_original):,} ORFs in {time_original:.2f}s")

# Optimized method  
print("\n[2/2] Running OPTIMIZED method...")
start = time.time()
orfs_optimized = find_orfs_candidates_new(sequence, min_length=100)
time_optimized = time.time() - start
print(f"      {len(orfs_optimized):,} ORFs in {time_optimized:.2f}s")

# ============================================================================
# STEP 3: Compare results
# ============================================================================

print(f"\n{'='*80}")
print("VERIFICATION")
print("="*80)

# Check counts
print(f"\nORF Counts:")
print(f"  Original:  {len(orfs_original):,}")
print(f"  Optimized: {len(orfs_optimized):,}")

if len(orfs_original) != len(orfs_optimized):
    print("  [FAIL] Different counts!")
else:
    print("  [PASS] Counts match!")

# Check individual ORFs
print(f"\nComparing ORF details...")
all_match = True
mismatches = []

for i, (orig, opt) in enumerate(zip(orfs_original, orfs_optimized)):
    for key in orig.keys():
        if key not in opt:
            mismatches.append(f"ORF {i}: missing key '{key}'")
            all_match = False
        elif orig[key] != opt[key]:
            # Handle floating point comparison
            if isinstance(orig[key], float) and isinstance(opt[key], float):
                if abs(orig[key] - opt[key]) > 1e-10:
                    mismatches.append(f"ORF {i} '{key}': {orig[key]:.6f} != {opt[key]:.6f}")
                    all_match = False
            else:
                mismatches.append(f"ORF {i} '{key}': {orig[key]} != {opt[key]}")
                all_match = False

if all_match:
    print("  [PASS] All ORF fields match perfectly!")
else:
    print(f"  [FAIL] Found {len(mismatches)} mismatches!")
    print("\nFirst 5 mismatches:")
    for m in mismatches[:5]:
        print(f"    - {m}")

# ============================================================================
# FINAL RESULTS
# ============================================================================

print(f"\n{'='*80}")
if all_match and len(orfs_original) == len(orfs_optimized):
    print("SUCCESS: Optimization verified - results are IDENTICAL!")
    print("="*80)
    print("\nPerformance:")
    print(f"  Original:  {time_original:.2f}s")
    print(f"  Optimized: {time_optimized:.2f}s")
    
    if time_optimized < time_original:
        speedup = (time_original / time_optimized - 1) * 100
        print(f"  Speedup:   {speedup:.1f}% faster")
    elif time_optimized > time_original:
        slowdown = (time_optimized / time_original - 1) * 100
        print(f"  Note:      {slowdown:.1f}% slower (may vary by run)")
    else:
        print(f"  Note:      Same speed")
        
    print("\n[READY FOR COMMIT]")
else:
    print("FAILED: Results differ - DO NOT COMMIT!")
    print("="*80)
    print("\nFix the issues above before committing.")

Downloading E. coli genome...
Downloaded: test_data\NC_000913.3.fasta
Sequence: 4,641,652 bp

COMPARING: find_orfs_candidates() vs find_orfs_candidates_new()

[1/2] Running ORIGINAL method...


NameError: name 'find_orfs_candidates' is not defined