In [1]:
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import train_test_split
import gc # Garbage Collector interface
import random # To shuffle the data before subsampling

In [2]:
# Sequence length in base pairs; presumably the fixed input length the model
# expects (the inference example in this notebook uses 2000 bp) -- TODO confirm.
SEQ_LENGTH = 2000

# Per-base one-hot rows, channel order A, C, G, T. 'N' is all zeros.
# The rows are plain Python ints; one_hot_encode stores them as np.int8 by
# default, which is 8x more memory-efficient than NumPy's default float64.
DNA_MAP = {
    'A': [1, 0, 0, 0],
    'C': [0, 1, 0, 0],
    'G': [0, 0, 1, 0],
    'T': [0, 0, 0, 1],
    'N': [0, 0, 0, 0],
}

def one_hot_encode(sequence, dtype=np.int8):
    """One-hot encode a DNA sequence into a ``(len(sequence), 4)`` NumPy array.

    Each symbol is upper-cased individually and looked up in ``DNA_MAP``;
    any symbol that is not a key of ``DNA_MAP`` gets the ``'N'`` row.

    Args:
        sequence: DNA sequence string.
        dtype: NumPy dtype of the returned array (``np.int8`` by default).

    Returns:
        Array with one row per symbol of ``sequence`` and four columns.
    """
    # Upper-case symbol by symbol (not the whole string) so the number of
    # rows always equals len(sequence).
    rows = [DNA_MAP.get(symbol.upper(), DNA_MAP['N']) for symbol in sequence]
    encoded = np.zeros((len(sequence), 4), dtype=dtype)
    if rows:
        # An empty list cannot be broadcast into a (0, 4) array, hence the guard.
        encoded[:] = rows
    return encoded

def process_fasta_file(filepath):
    """Parse a FASTA file and return every record's sequence as a string.

    Args:
        filepath: Location of the FASTA file, passed straight to
            ``SeqIO.parse``.

    Returns:
        List of sequence strings, one per record, in the order the parser
        yields them.
    """
    return [str(entry.seq) for entry in SeqIO.parse(filepath, "fasta")]

In [3]:
from tensorflow.keras.models import load_model

# --- You would do this in a new script or notebook ---
# This cell relies on SEQ_LENGTH and one_hot_encode defined earlier in the
# notebook; copy them along if you move the cell elsewhere.

# Scores above this value are reported as "Promoter".
DECISION_THRESHOLD = 0.5

# Load the saved model
loaded_model = load_model('promoter_classifier_model.h5')
print("Model loaded successfully.")

# Example of a new, unknown DNA sequence (must be SEQ_LENGTH bp long)
new_sequence_str = "NNNNNNNNCGGGTATAANNNNNNNN" * 100  # Replace with a real sequence
new_sequence_str = new_sequence_str[:SEQ_LENGTH]
# Slicing only trims long inputs; a short one would otherwise reach the model
# with the wrong shape, so fail early with a clear message.
if len(new_sequence_str) != SEQ_LENGTH:
    raise ValueError(
        f"Sequence must be {SEQ_LENGTH} bp long, got {len(new_sequence_str)} bp."
    )

# Preprocess it the same way as your training data
one_hot_new_seq = one_hot_encode(new_sequence_str)
# Add a 'batch' dimension: (SEQ_LENGTH, 4) -> (1, SEQ_LENGTH, 4)
model_input = np.expand_dims(one_hot_new_seq, axis=0)

# Make a prediction
prediction_prob = loaded_model.predict(model_input)
# First output of the first (only) batch item; treated here as the promoter
# probability -- NOTE(review): confirm against the model's output layer.
promoter_prob = float(prediction_prob[0][0])
prediction = "Promoter" if promoter_prob > DECISION_THRESHOLD else "Not a Promoter"

print(f"\nThe sequence was predicted as: {prediction} (Probability: {promoter_prob:.4f})")



Model loaded successfully.
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 253ms/step

The sequence was predicted as: Promoter (Probability: 0.9860)
