In [1]:
import gzip
import json
import numpy as np
import pandas as pd
import os
import pickle
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm

# Seed NumPy and TensorFlow from one shared constant so repeated runs match
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 1. Load Trained Model

In [2]:
# Locations of the trained CNN: the native Keras archive is preferred,
# the HDF5 export is only used when the former is absent.
model_path = "../models/final_cnn_model.keras"
legacy_model_path = "../models/final_cnn_model_legacy.h5"

print("🔄 Loading trained CNN model...")

try:
    # Work out which file is usable before touching Keras; the legacy path
    # is only probed when the modern file does not exist.
    has_modern = os.path.exists(model_path)
    has_legacy = (not has_modern) and os.path.exists(legacy_model_path)

    if not has_modern and not has_legacy:
        raise FileNotFoundError("No trained model found!")

    if has_modern:
        model = keras.models.load_model(model_path)
        print(f"✅ Successfully loaded model from: {model_path}")
    else:
        model = keras.models.load_model(legacy_model_path)
        print(f"✅ Successfully loaded legacy model from: {legacy_model_path}")

    # Report tensor shapes and parameter count of the loaded network
    print(f"\n📊 Model Architecture:")
    print(f"  Input shape: {model.input_shape}")
    print(f"  Output shape: {model.output_shape}")
    print(f"  Total parameters: {model.count_params():,}")

    # Full layer-by-layer listing
    model.summary()

except Exception as e:
    # Give the reader a hint, then re-raise so later cells do not run
    # against a missing `model`.
    print(f"❌ Error loading model: {e}")
    print("Make sure you have trained the model first using cnn.ipynb")
    raise

🔄 Loading trained CNN model...
✅ Successfully loaded model from: ../models/final_cnn_model.keras

📊 Model Architecture:
  Input shape: (None, 3, 57)
  Output shape: (None, 1)
  Total parameters: 69,089
✅ Successfully loaded model from: ../models/final_cnn_model.keras

📊 Model Architecture:
  Input shape: (None, 3, 57)
  Output shape: (None, 1)
  Total parameters: 69,089


  saveable.load_own_variables(weights_store.get(inner_path))


# 2. Define Feature Extraction Functions

In [3]:
# Feature extraction functions (same as training)
def flatten_deep_data(file_path, estimated_rows=11000000):
    """
    Flatten a gzipped JSON-lines file into a DataFrame with one row per read.

    Every non-blank line is parsed as a JSON object nested as
    ``{transcript_id: {position: {sequence: [[v0, ..., v8], ...]}}}``.
    Each inner 9-value list becomes one output row.

    Rows are buffered in Python lists and copied into pre-allocated NumPy
    arrays in batches; the arrays are grown in place when the estimate turns
    out to be too small.

    Args:
        file_path: Path to the ``.json.gz`` file (read as UTF-8 text).
        estimated_rows: Initial row capacity to pre-allocate. It is only a
            hint: any non-negative value works, including 0.

    Returns:
        pandas.DataFrame with columns ``transcript_id``, ``transcript_position``
        (int32), ``sequence`` and the nine float32 signal columns
        ``dwell_/std_/mean_`` for positions ``-1``, ``0`` and ``+1``.

    Raises:
        ValueError: If a read does not carry exactly nine feature values.
    """
    BATCH_SIZE = 10000
    GROWTH_FACTOR = 1.5
    FEATURE_COLUMNS = [
        'dwell_-1', 'std_-1', 'mean_-1',
        'dwell_0', 'std_0', 'mean_0',
        'dwell_+1', 'std_+1', 'mean_+1',
    ]
    n_features = len(FEATURE_COLUMNS)

    # Pre-allocate arrays
    capacity = max(int(estimated_rows), 0)
    transcript_ids = np.empty(capacity, dtype=object)
    positions = np.empty(capacity, dtype=np.int32)
    seq = np.empty(capacity, dtype=object)
    feature_arrays = [np.empty(capacity, dtype=np.float32) for _ in range(n_features)]

    idx = 0

    # Temporary batch storage
    batch_transcript_ids = []
    batch_positions = []
    batch_seq = []
    batch_features = [[] for _ in range(n_features)]

    def flush_batch():
        """Copy the buffered rows into the main arrays and empty the buffers."""
        nonlocal idx, capacity

        batch_size = len(batch_transcript_ids)
        if batch_size == 0:
            return

        # Grow geometrically, but never to less than what is needed right now.
        # A purely multiplicative step stalls for capacity 0 or 1 because
        # int(capacity * 1.5) == capacity, which used to loop forever.
        needed = idx + batch_size
        if needed > capacity:
            new_capacity = max(int(capacity * GROWTH_FACTOR), needed)
            # refcheck=False: the arrays are also referenced by this closure
            transcript_ids.resize(new_capacity, refcheck=False)
            positions.resize(new_capacity, refcheck=False)
            seq.resize(new_capacity, refcheck=False)
            for arr in feature_arrays:
                arr.resize(new_capacity, refcheck=False)
            capacity = new_capacity

        # Bulk assignment
        transcript_ids[idx:needed] = batch_transcript_ids
        positions[idx:needed] = batch_positions
        seq[idx:needed] = batch_seq
        for arr, values in zip(feature_arrays, batch_features):
            arr[idx:needed] = values

        idx = needed

        # Clear batch
        batch_transcript_ids.clear()
        batch_positions.clear()
        batch_seq.clear()
        for lst in batch_features:
            lst.clear()

    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            data = json.loads(line)
            for transcript_id, positions_dict in data.items():
                for transcript_position, sequences in positions_dict.items():
                    pos_int = int(transcript_position)
                    for sequence, feature_list in sequences.items():
                        for features in feature_list:
                            # Fail early with context instead of an IndexError
                            # or a broadcast error later during the flush.
                            if len(features) != n_features:
                                raise ValueError(
                                    f"Expected {n_features} feature values for "
                                    f"{transcript_id}:{transcript_position}, "
                                    f"got {len(features)}"
                                )

                            # Add to batch
                            batch_transcript_ids.append(transcript_id)
                            batch_positions.append(pos_int)
                            batch_seq.append(sequence)
                            for store, val in zip(batch_features, features):
                                store.append(val)

                            # Flush when batch is full
                            if len(batch_transcript_ids) >= BATCH_SIZE:
                                flush_batch()

                                if idx % 100000 == 0:
                                    print(f"Processed {idx:,} rows...", end='\r')

    # Flush remaining batch
    flush_batch()

    print(f"\nCreating DataFrame with {idx:,} rows...")

    # Trim the over-allocated arrays to the rows actually written
    columns = {
        'transcript_id': transcript_ids[:idx],
        'transcript_position': positions[:idx],
        'sequence': seq[:idx],
    }
    for name, values in zip(FEATURE_COLUMNS, feature_arrays):
        columns[name] = values[:idx]

    df = pd.DataFrame(columns)

    return df

def create_cnn_features(group):
    """
    Build the per-site feature vector from all reads of one site.

    Three families of features are produced, each tagged with the CNN
    position it belongs to (``-1``, ``0`` or ``+1``):

    1. Nine summary statistics (mean, median, std, iqr, skew, min, max,
       q25, q75) for every ``dwell``/``std``/``mean`` signal column.
    2. One-hot nucleotide encoding of a sliding 5-character window over the
       sequence (indices 0-4, 1-5 and 2-6 for the three positions).
    3. Nucleotide and purine counts / frequencies of that same window.

    Args:
        group: DataFrame holding the reads of a single site. It must contain
            the nine signal columns and a ``sequence`` column; only the first
            row's sequence is used. Indices 0-6 of the sequence are read;
            indices beyond the end of a shorter sequence are skipped, which
            yields fewer features.

    Returns:
        pandas.Series keyed by feature name. The insertion order is fixed
        (signal statistics, then one-hot, then composition).

    Note:
        ``std`` and ``skew`` are pandas sample statistics and come back as
        NaN for groups with too few reads. NOTE(review): confirm how the
        training notebook handled such sites before feeding them to the model.
    """
    features = {}

    # Define positions and their feature types
    positions = ['-1', '0', '+1']
    feature_types = ['dwell', 'std', 'mean']

    # 1. ORIGINAL SIGNAL FEATURES (preserving spatial organization)
    for pos in positions:
        for feat_type in feature_types:
            col = f'{feat_type}_{pos}'
            values = group[col]

            # Each quantile is needed twice (for the IQR and on its own);
            # compute it once, this function runs for every site.
            q25 = values.quantile(0.25)
            q75 = values.quantile(0.75)

            features[f'{col}_mean'] = values.mean()
            features[f'{col}_median'] = values.median()
            features[f'{col}_std'] = values.std()
            features[f'{col}_iqr'] = q75 - q25
            features[f'{col}_skew'] = values.skew()
            features[f'{col}_min'] = values.min()
            features[f'{col}_max'] = values.max()
            features[f'{col}_q25'] = q25
            features[f'{col}_q75'] = q75

    # 2. SEQUENCE ONE-HOT ENCODING (sliding window 5-mers)
    consensus_sequence = group['sequence'].iloc[0]
    seq_len = len(consensus_sequence)
    nucleotides = ['A', 'C', 'G', 'T']

    # Sequence indices covered by each CNN position's 5-mer window
    cnn_position_windows = {
        '-1': [0, 1, 2, 3, 4],
        '0':  [1, 2, 3, 4, 5],
        '+1': [2, 3, 4, 5, 6]
    }

    # In-range indices per position, shared by sections 2 and 3
    valid_indices = {
        pos: [i for i in cnn_position_windows[pos] if i < seq_len]
        for pos in positions
    }

    for pos in positions:
        for seq_idx in valid_indices[pos]:
            nucleotide = consensus_sequence[seq_idx]
            seq_pos_label = seq_idx - 3  # index relative to the centre character

            for nt in nucleotides:
                features[f'seq_pos{seq_pos_label}_{nt}_{pos}'] = 1 if nucleotide == nt else 0

    # 3. ADDITIONAL SEQUENCE FEATURES (5-mer composition)
    for pos in positions:
        pos_sequence = ''.join(consensus_sequence[i] for i in valid_indices[pos])
        window_len = len(pos_sequence)

        for nt in nucleotides:
            nt_count = pos_sequence.count(nt)
            features[f'nt_count_{nt}_{pos}'] = nt_count
            features[f'nt_freq_{nt}_{pos}'] = nt_count / window_len if pos_sequence else 0

        # Purines are A and G
        purines = sum(1 for n in pos_sequence if n in ['A', 'G'])
        features[f'purine_count_{pos}'] = purines
        features[f'purine_freq_{pos}'] = purines / window_len if pos_sequence else 0

    return pd.Series(features)

print("✅ Feature extraction functions defined")

✅ Feature extraction functions defined


# 3. Data Preparation Functions

In [4]:
def prepare_inference_data(features_df, scaler=None):
    """
    Arrange a flat feature table into the 3-D tensor fed to the CNN.

    Columns are assigned to a position by substring: names containing
    ``_-1`` go to position -1, names containing ``_+1`` to position +1, and
    names containing ``_0`` (but neither of the other two markers) to
    position 0. Columns matching none of these are dropped. The three groups
    are laid side by side and reshaped to
    ``(n_samples, 3, features_per_position)``.

    Args:
        features_df: DataFrame with one row per sample and position-tagged
            feature columns.
        scaler: Optional object with a ``transform`` method. It is applied to
            the data reshaped to ``(n_samples * 3, features_per_position)``.
            NOTE(review): this assumes the scaler was fitted on the same 2-D
            layout during training; confirm against the training notebook.

    Returns:
        numpy.ndarray of shape ``(n_samples, 3, features_per_position)``,
        scaled when a scaler is given.

    Raises:
        ValueError: If the three positions do not have the same, non-zero
            number of features. Reshaping such a table would either fail with
            an unrelated message or silently produce a tensor with the wrong
            number of samples.
    """
    print(f"🔄 Preparing inference data...")
    print(f"Total features available: {features_df.shape[1]}")

    # Organize features by position for proper CNN spatial structure
    feature_names = list(features_df.columns)

    # Group ALL features by position (-1, 0, +1)
    pos_minus1_features = [f for f in feature_names if '_-1' in f]
    pos_0_features = [f for f in feature_names if '_0' in f and '_-1' not in f and '_+1' not in f]
    pos_plus1_features = [f for f in feature_names if '_+1' in f]

    n_minus1 = len(pos_minus1_features)
    n_0 = len(pos_0_features)
    n_plus1 = len(pos_plus1_features)

    print(f"Features per position: -1={n_minus1}, 0={n_0}, +1={n_plus1}")

    # The reshape below is only meaningful when every position contributes
    # the same number of columns; refuse anything else instead of warning.
    if not (n_minus1 == n_0 == n_plus1) or n_minus1 == 0:
        raise ValueError(
            "Uneven or empty feature distribution across positions: "
            f"-1={n_minus1}, 0={n_0}, +1={n_plus1}"
        )

    features_per_position = n_minus1

    # Create properly ordered feature matrix: [-1 block | 0 block | +1 block]
    ordered_features = pos_minus1_features + pos_0_features + pos_plus1_features
    feature_matrix = features_df[ordered_features].values

    # Reshape to (N_samples, 3_positions, features_per_position)
    feature_matrix = feature_matrix.reshape(-1, 3, features_per_position)
    print(f"Reshaped feature matrix: {feature_matrix.shape} (samples, positions, features_per_position)")

    # Scale features if scaler provided
    if scaler is not None:
        # Reshape to 2D, scale, reshape back
        original_shape = feature_matrix.shape
        flat = feature_matrix.reshape(-1, features_per_position)
        feature_matrix_scaled = scaler.transform(flat).reshape(original_shape)
        print(f"✅ Features scaled using provided scaler")
        return feature_matrix_scaled
    else:
        print("⚠️  No scaler provided - using raw features (may affect performance)")
        return feature_matrix

def create_submission_file(transcript_ids, positions, predictions, output_path):
    """
    Write per-site scores to a CSV file and print a short summary.

    Args:
        transcript_ids: Sequence of transcript identifiers, one per site.
        positions: Sequence of transcript positions, aligned with
            ``transcript_ids``.
        predictions: Model scores, one per site. Any array-like is accepted
            and flattened to 1-D (e.g. an ``(n, 1)`` array).
        output_path: Destination CSV path. Missing parent directories are
            created; a bare file name writes into the current directory.

    Returns:
        pandas.DataFrame with columns ``transcript_id``,
        ``transcript_position`` and ``score`` - the same table that was
        written to disk.
    """
    # Accept lists and (n, 1) arrays alike
    scores = np.asarray(predictions).reshape(-1)

    submission_df = pd.DataFrame({
        'transcript_id': transcript_ids,
        'transcript_position': positions,
        'score': scores
    })

    # Create the output directory if needed. dirname is '' for a bare file
    # name, and os.makedirs('') raises FileNotFoundError, so skip it then.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save submission file
    submission_df.to_csv(output_path, index=False)

    print(f"✅ Submission file saved: {output_path}")
    print(f"   Shape: {submission_df.shape}")
    print(f"   Sample predictions:")
    print(submission_df.head())

    # Summary statistics; 0.5 is the threshold used to call a site positive
    score = submission_df['score']
    is_positive = score > 0.5
    print(f"\n📊 Prediction Statistics:")
    print(f"   Mean score: {score.mean():.4f}")
    print(f"   Std score: {score.std():.4f}")
    print(f"   Min score: {score.min():.4f}")
    print(f"   Max score: {score.max():.4f}")
    print(f"   Predicted positives (>0.5): {is_positive.sum()}")
    print(f"   Predicted positives (%): {is_positive.mean() * 100:.2f}%")

    return submission_df

print("✅ Data preparation functions defined")

✅ Data preparation functions defined


# 4. Inference Pipeline

In [5]:
def run_inference_pipeline(dataset_name, dataset_file, scaler=None, cnn_model=None,
                           output_dir="../predictions"):
    """
    Complete inference pipeline for a dataset.

    Loads the raw reads, aggregates them into one feature row per
    (transcript_id, transcript_position), arranges the features for the CNN,
    predicts a score per site and writes the scores to
    ``<output_dir>/<dataset_name>_predictions_cnn.csv``.

    Args:
        dataset_name: Name for output file (e.g., 'dataset0', 'dataset1')
        dataset_file: Path to dataset JSON.gz file
        scaler: Fitted scaler from training (optional but recommended)
        cnn_model: Model exposing ``predict``. When omitted, the
            notebook-level ``model`` loaded earlier is used, so existing
            calls keep working.
        output_dir: Directory that receives the predictions CSV.

    Returns:
        pandas.DataFrame with the written predictions (see
        ``create_submission_file``).
    """
    # Resolve the model once, before any expensive work, so a missing model
    # fails immediately rather than after feature extraction.
    net = model if cnn_model is None else cnn_model

    print(f"\n🚀 Starting inference pipeline for {dataset_name}")
    print("=" * 60)

    # 1. Load and process data
    print(f"📁 Loading data from {dataset_file}...")
    df = flatten_deep_data(dataset_file)

    # One groupby object serves both the statistics and the feature extraction
    site_groups = df.groupby(['transcript_id', 'transcript_position'])

    print(f"📊 Dataset statistics:")
    print(f"   Total rows: {len(df):,}")
    print(f"   Unique positions: {site_groups.ngroups:,}")

    # 2. Extract features
    print(f"\n🔧 Extracting CNN features...")
    tqdm.pandas()  # registers progress_apply on pandas groupby objects
    cnn_features = site_groups.progress_apply(
        create_cnn_features, include_groups=False
    )

    print(f"✅ Feature extraction complete:")
    print(f"   Feature matrix shape: {cnn_features.shape}")
    print(f"   Features per position: {cnn_features.shape[1] // 3}")

    # 3. Prepare data for inference
    X_inference = prepare_inference_data(cnn_features, scaler=scaler)

    # 4. Generate predictions
    print(f"\n🤖 Generating predictions...")
    predictions = net.predict(X_inference, batch_size=256, verbose=1)

    # 5. Prepare output data: the groupby keys form a two-level index
    transcript_ids = cnn_features.index.get_level_values(0).tolist()
    positions = cnn_features.index.get_level_values(1).tolist()

    # 6. Create submission file
    output_path = os.path.join(output_dir, f"{dataset_name}_predictions_cnn.csv")
    submission_df = create_submission_file(transcript_ids, positions, predictions, output_path)

    print(f"\n✅ Inference pipeline completed for {dataset_name}")
    return submission_df

print("✅ Inference pipeline function defined")

✅ Inference pipeline function defined


# 5. Run Inference on Available Datasets

In [6]:
# Discover available datasets
data_dir = "../data/"
dataset_suffix = ".json.gz"

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the notebook output) reproducible between runs.
# The dataset name is the file name with the trailing suffix sliced off, so a
# name that happens to contain ".json.gz" elsewhere is left intact.
available_datasets = [
    (file[:-len(dataset_suffix)], os.path.join(data_dir, file))
    for file in sorted(os.listdir(data_dir))
    if file.startswith("dataset") and file.endswith(dataset_suffix)
]

print(f"🔍 Found {len(available_datasets)} datasets:")
for name, path in available_datasets:
    print(f"   - {name}: {path}")

if not available_datasets:
    print("⚠️  No datasets found in ../data/ directory")
    print("   Make sure dataset files are named like: dataset0.json.gz, dataset1.json.gz, etc.")
else:
    print(f"\n✅ Ready to run inference on {len(available_datasets)} datasets")

🔍 Found 3 datasets:
   - dataset0: ../data/dataset0.json.gz
   - dataset1: ../data/dataset1.json.gz
   - dataset2: ../data/dataset2.json.gz

✅ Ready to run inference on 3 datasets


In [7]:
# Run inference on all available datasets
submission_files = {}   # dataset name -> predictions DataFrame
failed_datasets = {}    # dataset name -> error description

scaler_path = "../models/cnn_scaler.pkl"
scaler = None

if os.path.exists(scaler_path):
    # NOTE(security): unpickling executes code embedded in the file. Only
    # load a scaler that was produced by your own training run.
    with open(scaler_path, 'rb') as f:
        scaler = pickle.load(f)
    print(f"✅ Loaded fitted scaler from: {scaler_path}")
else:
    print(f"⚠️  No saved scaler found at {scaler_path}")
    print("   Using raw features (may reduce performance)")
    print("   Consider saving the scaler during training for better results")

for dataset_name, dataset_path in available_datasets:
    try:
        print(f"\n{'='*80}")
        print(f"🚀 Processing {dataset_name}")
        print(f"{'='*80}")

        # Run inference pipeline
        submission_df = run_inference_pipeline(dataset_name, dataset_path, scaler=scaler)
        submission_files[dataset_name] = submission_df

        print(f"✅ Successfully processed {dataset_name}")

    except Exception as e:
        # Best effort: one broken dataset must not stop the others. Keep only
        # a string, since holding the exception object would also keep its
        # traceback (and the large frames it references) alive.
        print(f"❌ Error processing {dataset_name}: {e}")
        failed_datasets[dataset_name] = f"{type(e).__name__}: {e}"
        continue

# Do not announce success when some datasets produced no output
if failed_datasets:
    print(f"\n⚠️  Inference finished with {len(failed_datasets)} failed dataset(s):")
    for dataset_name, reason in failed_datasets.items():
        print(f"   - {dataset_name}: {reason}")
else:
    print(f"\n🎉 Inference completed!")

print(f"📁 Generated {len(submission_files)} submission files:")
for dataset_name in submission_files.keys():
    print(f"   - ../predictions/{dataset_name}_predictions_cnn.csv")

✅ Loaded fitted scaler from: ../models/cnn_scaler.pkl

🚀 Processing dataset0

🚀 Starting inference pipeline for dataset0
📁 Loading data from ../data/dataset0.json.gz...
Processed 11,000,000 rows...
Creating DataFrame with 11,027,106 rows...

Creating DataFrame with 11,027,106 rows...
📊 Dataset statistics:
   Total rows: 11,027,106
📊 Dataset statistics:
   Total rows: 11,027,106
   Unique positions: 121,838

🔧 Extracting CNN features...
   Unique positions: 121,838

🔧 Extracting CNN features...


  9%|▉         | 11423/121838 [03:10<30:40, 59.99it/s] 



KeyboardInterrupt: 

# 6. Optional: Single Dataset Inference

Use this cell to run inference on a specific dataset if needed:

In [None]:
# Example: Run inference on a specific dataset
# dataset_path = "../data/dataset0.json.gz"
# submission_df = run_inference_pipeline("dataset0", dataset_path, scaler=scaler)
# print("Predictions saved to: ../predictions/dataset0_predictions_cnn.csv")