In [2]:
import math
import time
from Bio.Seq import Seq
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
import numpy as np
import sys

sys.path.insert(0, '..')  
from src.config import KNOWN_RBS_MOTIFS, START_CODONS, STOP_CODONS,  MIN_ORF_LENGTH, LENGTH_REFERENCE_BP
from src.config import SCORE_WEIGHTS, START_CODON_WEIGHTS, START_SELECTION_WEIGHTS, FIRST_FILTER_THRESHOLD, SECOND_FILTER_THRESHOLD
from src.traditional_methods import find_orfs_candidates
from functools import lru_cache


# ORIGINAL CODE FROM VERSION 1.0

In [None]:

def find_purine_rich_regions(
    sequence: str,
    min_length: int = 4,
    min_purine_content: float = 0.6
) -> List[Dict]:
    """Enumerate every window whose purine (A/G) fraction meets the threshold.

    For each start offset, windows of ``min_length`` up to 8 bases (or up to
    the end of the sequence, whichever is shorter) are examined. Only
    upper-case 'A' and 'G' are counted as purines.

    Args:
        sequence: Nucleotide string to scan.
        min_length: Shortest window length to consider.
        min_purine_content: Minimum fraction of A/G bases for a window to be
            reported.

    Returns:
        One dict per qualifying window, ordered by start offset and then by
        length, with keys 'sequence', 'start', 'end' (exclusive),
        'purine_content' and 'length'.
    """
    hits = []
    total = len(sequence)

    for begin in range(total):
        # Longest window allowed here: capped at 8 and at the sequence end.
        longest = min(8, total - begin)

        for size in range(min_length, longest + 1):
            window = sequence[begin:begin + size]
            # Purines are recounted for every window (no running total).
            fraction = sum(window.count(base) for base in ('A', 'G')) / size

            if fraction >= min_purine_content:
                hits.append({
                    'sequence': window,
                    'start': begin,
                    'end': begin + size,
                    'purine_content': fraction,
                    'length': size
                })

    return hits


def evaluate_spacing_score(spacing: int) -> float:
    """Map the distance between an RBS candidate and the start codon to a score.

    Tiers are checked from narrowest to widest and the first match wins:
    6-8 nt -> 3.0, 5-10 nt -> 2.5, 4-12 nt -> 1.5, 3-14 nt -> 1.0.
    Any other spacing scores 0.2.
    """
    # (low, high, score), inclusive bounds, ordered from best to worst tier.
    tiers = (
        (6, 8, 3.0),    # Optimal
        (5, 10, 2.5),   # Very good
        (4, 12, 1.5),   # Good
        (3, 14, 1.0),   # Acceptable
    )
    for low, high, tier_score in tiers:
        if low <= spacing <= high:
            return tier_score
    return 0.2  # Poor


def score_motif_similarity(sequence: str) -> Tuple[float, str]:
    """Find the KNOWN_RBS_MOTIFS entry that best matches the candidate.

    The candidate is compared against each motif at every shift, where shift
    ``s`` lines up ``sequence[0]`` with ``motif[s]``. An alignment scores
    ``(matches / overlap) * overlap * (len(motif) / 6.0)``, so a 6-base motif
    carries weight 1.0.

    Returns:
        ``(best_score, best_motif)``. If no alignment scores above 0.0 the
        result is ``(0.0, None)``.

    NOTE(review): only non-negative shifts are tried, so a motif that begins
    part-way into a longer candidate is never aligned end-to-end. Confirm this
    is intended before changing it; the optimized copy must stay in step.
    """
    seq_len = len(sequence)
    top_score = 0.0
    top_motif = None

    for motif in KNOWN_RBS_MOTIFS:
        motif_len = len(motif)
        weight = motif_len / 6.0  # AGGAGG gets weight 1.0

        for shift in range(max(seq_len, motif_len)):
            # Number of candidate positions that still fall inside the motif.
            overlap = min(seq_len, motif_len - shift)
            if overlap <= 0:
                continue

            hits = 0
            for k in range(overlap):
                if sequence[k] == motif[shift + k]:
                    hits += 1

            candidate_score = hits / overlap * overlap * weight
            if candidate_score > top_score:
                top_score = candidate_score
                top_motif = motif

    return top_score, top_motif


def predict_rbs_simple(sequence: str, orf: Dict, upstream_length: int = 20) -> Dict:
    """Pick the best ribosome-binding-site candidate upstream of an ORF start.

    The ``upstream_length`` bases ending at ``orf['start']`` are scanned for
    purine-rich windows. Windows whose distance to the end of that region is
    outside 4-12 nt are ignored. The rest are ranked by
    ``2.0 * spacing_score + 1.5 * motif_score + purine_bonus``.

    Args:
        sequence: Strand sequence the ORF coordinates refer to.
        orf: ORF record; only ``orf['start']`` is read.
        upstream_length: Size of the upstream region to inspect.

    Returns:
        A dict describing the winning candidate ('rbs_score', 'spacing_score',
        'motif_score', 'best_sequence', 'best_motif', 'spacing', 'position',
        'purine_content', 'length'). If the ORF starts too close to the
        sequence start, or no candidate scores above -5.0, a placeholder is
        returned with 'rbs_score' of -5.0 and without the 'purine_content'
        and 'length' keys.

    NOTE(review): ``orf['start']`` is used directly as a 0-based slice bound,
    while find_orfs_candidates in this file stores it as ``start_pos + 1``.
    With that caller the region includes the first base of the start codon.
    Confirm whether this is intended.
    """
    no_rbs = {
        'rbs_score': -5.0,
        'spacing_score': 0.0,
        'motif_score': 0.0,
        'best_sequence': None,
        'best_motif': None,
        'spacing': 0,
        'position': 0
    }

    codon_start = orf['start']
    if codon_start < upstream_length:
        # Not enough upstream sequence to search.
        return no_rbs

    window = sequence[codon_start - upstream_length:codon_start]
    window_len = len(window)

    top_score = -5.0
    winner = None

    for candidate in find_purine_rich_regions(window, min_length=4, min_purine_content=0.6):
        gap = window_len - candidate['end']
        if not 4 <= gap <= 12:
            continue

        gap_score = evaluate_spacing_score(gap)
        motif_score, motif = score_motif_similarity(candidate['sequence'])
        # Reward purine content above the 0.6 detection threshold.
        bonus = (candidate['purine_content'] - 0.6) * 2.0

        total = gap_score * 2.0 + motif_score * 1.5 + bonus

        if total > top_score:
            top_score = total
            winner = {
                'rbs_score': total,
                'spacing_score': gap_score,
                'motif_score': motif_score,
                'best_sequence': candidate['sequence'],
                'best_motif': motif,
                'spacing': gap,
                'position': candidate['start'],
                'purine_content': candidate['purine_content'],
                'length': candidate['length']
            }

    return winner or no_rbs


# =============================================================================
# ORF DETECTION
# =============================================================================

def find_orfs_candidates(sequence: str, min_length: int = 100) -> List[Dict]:
    """Scan both strands in all three frames for ORF candidates and score each RBS.

    Within a frame, every start codon seen since the previous stop codon is
    remembered. When a stop codon is reached, each remembered start produces
    one candidate, provided the span including the stop codon is at least
    ``min_length`` bases.

    Args:
        sequence: Forward-strand genome sequence.
        min_length: Minimum ORF length in bases, stop codon included.

    Returns:
        A list of ORF dicts. 'start'/'end' are 1-based inclusive coordinates
        on the scanned strand; 'genome_start'/'genome_end' are the same
        interval in forward-strand coordinates. Each dict also carries
        'length', 'frame', 'strand', 'start_codon', 'sequence' and the RBS
        fields 'rbs_score', 'rbs_motif', 'rbs_spacing', 'rbs_sequence'.
    """
    genome_len = len(sequence)
    strands = (
        ('forward', sequence),
        ('reverse', str(Seq(sequence).reverse_complement())),
    )
    candidates = []

    print("Detecting ORFs and calculating RBS...")

    for strand_name, strand_seq in strands:
        on_forward = strand_name == 'forward'

        for frame in range(3):
            pending = []  # (position, codon) of starts awaiting a stop

            # The range bound guarantees every slice is a full codon.
            for pos in range(frame, len(strand_seq) - 2, 3):
                codon = strand_seq[pos:pos + 3]

                if codon in START_CODONS:
                    pending.append((pos, codon))
                    continue
                if codon not in STOP_CODONS or not pending:
                    continue

                stop_end = pos + 3
                for orf_start, start_codon in pending:
                    orf_length = stop_end - orf_start
                    if orf_length < min_length:
                        continue

                    # Map strand-local coordinates back onto the forward genome.
                    if on_forward:
                        genome_start, genome_end = orf_start + 1, stop_end
                    else:
                        genome_start = genome_len - stop_end + 1
                        genome_end = genome_len - orf_start

                    orf = {
                        'start': orf_start + 1,
                        'end': stop_end,
                        'genome_start': genome_start,
                        'genome_end': genome_end,
                        'length': orf_length,
                        'frame': frame,
                        'strand': strand_name,
                        'start_codon': start_codon,
                        'sequence': strand_seq[orf_start:stop_end]
                    }

                    # The RBS search runs on the strand being scanned.
                    rbs = predict_rbs_simple(strand_seq, orf, upstream_length=20)
                    orf['rbs_score'] = rbs['rbs_score']
                    orf['rbs_motif'] = rbs.get('best_motif')
                    orf['rbs_spacing'] = rbs.get('spacing', 0)
                    orf['rbs_sequence'] = rbs.get('best_sequence')

                    candidates.append(orf)

                # A stop codon closes every start collected so far.
                pending = []

    print(f"Complete: {len(candidates):,} ORFs detected with RBS scores")
    return candidates


# NEW CODE

In [None]:
from functools import lru_cache

# =============================================================================
# RBS (RIBOSOME BINDING SITE) PREDICTION
# =============================================================================

def find_purine_rich_regions_new(
    sequence: str,
    min_length: int = 4,
    min_purine_content: float = 0.6,
    max_length: int = 8
) -> List[Dict]:
    """Find purine-rich (A/G) windows using prefix sums instead of recounting.

    For each start offset, windows of ``min_length`` up to ``max_length``
    bases (or up to the end of the sequence) are examined. The purine count
    of a window is read from a prefix-sum table, so no window is rescanned.
    Only upper-case 'A' and 'G' are counted as purines.

    Args:
        sequence: Nucleotide string to scan.
        min_length: Shortest window length to consider.
        min_purine_content: Minimum fraction of A/G bases for a window to be
            reported.
        max_length: Longest window length to consider. The default of 8
            reproduces the previously hard-coded limit.

    Returns:
        One dict per qualifying window, ordered by start offset and then by
        length, with keys 'sequence', 'start', 'end' (exclusive),
        'purine_content' and 'length'.
    """
    purine_regions = []
    seq_len = len(sequence)

    if seq_len < min_length:
        return purine_regions

    # prefix[k] is the number of purines in sequence[:k].
    prefix = [0]
    for base in sequence:
        prefix.append(prefix[-1] + (1 if base in 'AG' else 0))

    for start in range(seq_len):
        longest = min(max_length, seq_len - start)
        if longest < min_length:
            # The longest window only shrinks as start grows, so stop here.
            break

        for length in range(min_length, longest + 1):
            end = start + length
            purine_fraction = (prefix[end] - prefix[start]) / length

            if purine_fraction >= min_purine_content:
                purine_regions.append({
                    'sequence': sequence[start:end],
                    'start': start,
                    'end': end,
                    'purine_content': purine_fraction,
                    'length': length
                })

    return purine_regions


@lru_cache(maxsize=10000)
def score_motif_similarity_new(sequence: str) -> Tuple[float, str]:
    """Return the best (score, motif) match of the candidate against KNOWN_RBS_MOTIFS.

    Results are memoized per candidate string (up to 10000 entries). For each
    motif and each shift ``s``, ``sequence[0]`` is lined up with ``motif[s]``
    and the overlapping positions are compared. An alignment scores
    ``(matches / overlap) * overlap * (len(motif) / 6.0)``.

    Returns:
        ``(best_score, best_motif)``. If no alignment scores above 0.0 the
        result is ``(0.0, None)``.

    NOTE(review): only non-negative shifts are tried, so a motif that begins
    part-way into a longer candidate is never aligned end-to-end. Confirm this
    is intended; the scores are expected to match the v1.0 implementation.
    """
    seq_len = len(sequence)
    top_score, top_motif = 0.0, None

    for motif in KNOWN_RBS_MOTIFS:
        weight = len(motif) / 6.0

        for shift in range(max(seq_len, len(motif))):
            # Part of the motif that the candidate overlaps at this shift.
            aligned = motif[shift:shift + seq_len]
            overlap = len(aligned)
            if not overlap:
                continue

            hits = sum(1 for seq_base, motif_base in zip(sequence, aligned)
                       if seq_base == motif_base)
            alignment_score = hits / overlap * overlap * weight

            if alignment_score > top_score:
                top_score, top_motif = alignment_score, motif

    return top_score, top_motif


def predict_rbs_simple_new(sequence: str, orf: Dict, upstream_length: int = 20) -> Dict:
    """Choose the best RBS candidate upstream of an ORF, with inlined spacing tiers.

    The ``upstream_length`` bases ending at ``orf['start']`` are scanned for
    purine-rich windows. Windows whose distance to the end of that region is
    outside 4-12 nt are skipped. The rest are ranked by
    ``2.0 * spacing_score + 1.5 * motif_score + purine_bonus``, where the
    spacing score is 3.0 for 6-8 nt, 2.5 for 5-10 nt and 1.5 otherwise.

    Args:
        sequence: Strand sequence the ORF coordinates refer to.
        orf: ORF record; only ``orf['start']`` is read.
        upstream_length: Size of the upstream region to inspect.

    Returns:
        A dict describing the winning candidate ('rbs_score', 'spacing_score',
        'motif_score', 'best_sequence', 'best_motif', 'spacing', 'position',
        'purine_content', 'length'). If the ORF starts too close to the
        sequence start, or no candidate scores above -5.0, a placeholder is
        returned with 'rbs_score' of -5.0 and without the 'purine_content'
        and 'length' keys.

    NOTE(review): ``orf['start']`` is used directly as a 0-based slice bound,
    while find_orfs_candidates_new in this file stores it as
    ``start_pos + 1``. With that caller the region includes the first base of
    the start codon. Confirm whether this is intended.
    """
    anchor = orf['start']

    if anchor < upstream_length:
        # Not enough upstream sequence to search.
        return dict(
            rbs_score=-5.0,
            spacing_score=0.0,
            motif_score=0.0,
            best_sequence=None,
            best_motif=None,
            spacing=0,
            position=0,
        )

    upstream = sequence[anchor - upstream_length:anchor]
    upstream_len = len(upstream)
    regions = find_purine_rich_regions_new(upstream, min_length=4, min_purine_content=0.6)

    top_score = -5.0
    top_hit = None

    for region in regions:
        gap = upstream_len - region['end']
        if gap < 4 or gap > 12:
            continue

        # Spacing tiers are inlined; gap is already known to lie in 4..12.
        if 6 <= gap <= 8:
            gap_score = 3.0  # Optimal
        elif 5 <= gap <= 10:
            gap_score = 2.5  # good
        else:
            gap_score = 1.5  # ok

        fragment = region['sequence']
        motif_score, motif = score_motif_similarity_new(fragment)
        # Reward purine content above the 0.6 detection threshold.
        bonus = (region['purine_content'] - 0.6) * 2.0

        total = gap_score * 2.0 + motif_score * 1.5 + bonus

        if total > top_score:
            top_score = total
            top_hit = {
                'rbs_score': total,
                'spacing_score': gap_score,
                'motif_score': motif_score,
                'best_sequence': fragment,
                'best_motif': motif,
                'spacing': gap,
                'position': region['start'],
                'purine_content': region['purine_content'],
                'length': region['length']
            }

    if top_hit:
        return top_hit
    return dict(
        rbs_score=-5.0,
        spacing_score=0.0,
        motif_score=0.0,
        best_sequence=None,
        best_motif=None,
        spacing=0,
        position=0,
    )


# =============================================================================
# ORF DETECTION
# =============================================================================

def find_orfs_candidates_new(sequence: str, min_length: int = 100) -> List[Dict]:
    """Scan both strands in three frames for ORF candidates and attach RBS scores.

    The motif-score cache is cleared on entry. Within a frame, every start
    codon seen since the previous stop codon is remembered. When a stop codon
    is reached, each remembered start produces one candidate, provided the
    span including the stop codon is at least ``min_length`` bases.

    Args:
        sequence: Forward-strand genome sequence.
        min_length: Minimum ORF length in bases, stop codon included.

    Returns:
        A list of ORF dicts. 'start'/'end' are 1-based inclusive coordinates
        on the scanned strand; 'genome_start'/'genome_end' are the same
        interval in forward-strand coordinates. Each dict also carries
        'length', 'frame', 'strand', 'start_codon', 'sequence' and the RBS
        fields 'rbs_score', 'rbs_motif', 'rbs_spacing', 'rbs_sequence'.
    """
    # Start every call with an empty motif-score cache.
    if hasattr(score_motif_similarity_new, 'cache_clear'):
        score_motif_similarity_new.cache_clear()

    found = []
    genome_len = len(sequence)
    strands = {
        'forward': sequence,
        'reverse': str(Seq(sequence).reverse_complement()),
    }

    print("Detecting ORFs and calculating RBS...")

    for strand, strand_seq in strands.items():
        # Last index at which a complete codon can begin.
        last_codon_start = len(strand_seq) - 3

        for frame in range(3):
            open_starts = []

            for idx in range(frame, last_codon_start + 1, 3):
                triplet = strand_seq[idx:idx + 3]

                if triplet in START_CODONS:
                    open_starts.append((idx, triplet))

                elif triplet in STOP_CODONS and open_starts:
                    orf_end = idx + 3

                    for orf_begin, first_codon in open_starts:
                        span = orf_end - orf_begin
                        if span < min_length:
                            continue

                        # Translate strand-local positions to forward-genome coordinates.
                        if strand == 'forward':
                            g_start, g_end = orf_begin + 1, orf_end
                        else:
                            g_start, g_end = genome_len - orf_end + 1, genome_len - orf_begin

                        orf = {
                            'start': orf_begin + 1,
                            'end': orf_end,
                            'genome_start': g_start,
                            'genome_end': g_end,
                            'length': span,
                            'frame': frame,
                            'strand': strand,
                            'start_codon': first_codon,
                            'sequence': strand_seq[orf_begin:orf_end]
                        }

                        # The RBS search runs on the strand being scanned.
                        rbs = predict_rbs_simple_new(strand_seq, orf, upstream_length=20)
                        orf['rbs_score'] = rbs['rbs_score']
                        orf['rbs_motif'] = rbs.get('best_motif')
                        orf['rbs_spacing'] = rbs.get('spacing', 0)
                        orf['rbs_sequence'] = rbs.get('best_sequence')

                        found.append(orf)

                    # The stop codon closes all starts gathered in this frame.
                    open_starts = []

    print(f"Complete: {len(found):,} ORFs detected with RBS scores")
    return found

# ============================================================================
# SIMPLE COMPARISON: Original vs Optimized
# ============================================================================


In [None]:

from Bio import SeqIO
import time
import gc


def _time_runs(finder, genome, runs=3):
    """Warm up ``finder`` once, then time ``runs`` calls on ``genome``.

    Prints progress, average and range. Returns ``(timings, average, result)``
    where ``result`` is the output of the last timed call.
    """
    # Warm-up
    print("Warm-up...", end=" ", flush=True)
    finder(genome, min_length=100)
    print("done\n")

    # Actual runs
    timings = []
    result = None
    for run in range(runs):
        gc.collect()  # Clean memory
        print(f"Run {run+1}...", end=" ", flush=True)

        began = time.perf_counter()
        result = finder(genome, min_length=100)
        elapsed = time.perf_counter() - began

        timings.append(elapsed)
        print(f"{elapsed:.2f}s")

    average = sum(timings) / len(timings)
    print(f"\nAverage: {average:.2f}s")
    print(f"Range:   {min(timings):.2f}s - {max(timings):.2f}s")
    return timings, average, result


# Load genome
record = SeqIO.read("test_data/NC_000913.3.fasta", "fasta")
sequence = str(record.seq)
print(f"Sequence: {len(sequence):,} bp\n")

# ============================================================================
# Test Original
# ============================================================================

print("="*80)
print("ORIGINAL VERSION")
print("="*80)

times_original, avg_original, orfs_original = _time_runs(find_orfs_candidates, sequence)

# ============================================================================
# Test Optimized
# ============================================================================

print(f"\n{'='*80}")
print("OPTIMIZED VERSION")
print("="*80)

times_optimized, avg_optimized, orfs_optimized = _time_runs(find_orfs_candidates_new, sequence)

# ============================================================================
# Verification
# ============================================================================

print(f"\n{'='*80}")
print("VERIFICATION")
print("="*80)

# Check counts
print(f"\nORF counts:")
print(f"  Original:  {len(orfs_original):,}")
print(f"  Optimized: {len(orfs_optimized):,}")

# Check RBS scores pairwise. zip() stops at the shorter list, so a length
# mismatch is reported as a failure below instead of raising IndexError.
rbs_matches = sum(
    1 for orf_a, orf_b in zip(orfs_original, orfs_optimized)
    if orf_a.get('rbs_score') == orf_b.get('rbs_score')
)

print(f"\nRBS score matches: {rbs_matches:,} / {len(orfs_original):,}")

all_match = (len(orfs_original) == len(orfs_optimized) and
             rbs_matches == len(orfs_original))

# ============================================================================
# Results
# ============================================================================

print(f"\n{'='*80}")
print("FINAL RESULTS")
print("="*80)

time_saved = avg_original - avg_optimized
percent_saved = (time_saved / avg_original) * 100

print(f"\nOriginal:  {avg_original:.2f}s")
print(f"Optimized: {avg_optimized:.2f}s")
print(f"Saved:     {time_saved:.2f}s ({percent_saved:.1f}%)")
print(f"Correct:   {'YES' if all_match else 'NO'}")

# Variance check (the spread of each series, max - min, relative to its mean)
var_original = max(times_original) - min(times_original)
var_optimized = max(times_optimized) - min(times_optimized)

print(f"\nVariance:")
print(f"  Original:  {var_original:.2f}s ({var_original/avg_original*100:.1f}%)")
print(f"  Optimized: {var_optimized:.2f}s ({var_optimized/avg_optimized*100:.1f}%)")

if max(var_original/avg_original, var_optimized/avg_optimized) > 0.15:
    print("\n[WARNING] High variance detected (>15%)")
    print("System performance may be unstable")
else:
    print("\n[OK] Acceptable variance")

if all_match:
    if time_saved > 0:
        print(f"\n[SUCCESS] Optimization works and saves {time_saved:.1f}s!")
        print("[READY FOR COMMIT]")
    else:
        print(f"\n[ISSUE] Optimized version is slower by {-time_saved:.1f}s")
else:
    print("\n[FAIL] Results don't match - DO NOT COMMIT")

print("="*80)

Sequence: 4,641,652 bp

ORIGINAL VERSION
Warm-up... Detecting ORFs and calculating RBS...
Complete: 176,315 ORFs detected with RBS scores
done

Run 1... Detecting ORFs and calculating RBS...
