<a href="https://colab.research.google.com/github/ryrynbob/ust-deep-learning-2026/blob/main/AssignmentWeek4_Ryan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 4

## Ryan Nguyen

## Special Topics

## Imports and Device Info

In [None]:
import time
import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling2D, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split



In [None]:
# Report the runtime environment so the timings further down can be interpreted.
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {bool(gpu_devices)}")


TensorFlow version: 2.19.0
GPU Available: True


## Create Synthetic Data for Method 1

In [None]:
# ======================
# 1. CREATE SYNTHETIC DATA
# ======================
# Size parameters for a synthetic dataset (Method 1). Only the configuration
# lives in this cell; none of the later cells shown in this notebook read it.
num_classes = 3    # presumably the number of target classes
channels = 4       # presumably the number of features per sequence position
seq_length = 60    # presumably the number of positions per sequence
samples = 5_000    # presumably the number of sequences to generate

##  OR

## Download Dataset Method 2

In [None]:
# ======================
# 2. DOWNLOAD DATASET
# ======================
import gdown
import os

In [None]:
# Google Drive file IDs for the dataset, keyed by the local file names that
# load_data() reads ('label.txt' and 'encoded_seq.txt').
DRIVE_FILES = {
    "label.txt": "1QbOSExVJEbPMhjzaua5n2eIXeF3qELQ7",
    "encoded_seq.txt": "1Sh2ce0jo5FVGNsSa9fqLjqcAOWQBFhzz",
}

# Use gdown's Python API (already imported above) rather than shelling out:
# it keeps TLS certificate verification enabled, avoids the deprecated `--id`
# command-line flag, and lets us skip files that are already on disk so that
# "Restart & Run All" does not re-download the data every time.
for filename, file_id in DRIVE_FILES.items():
    if os.path.exists(filename):
        print(f"{filename} already present, skipping download")
        continue
    if gdown.download(id=file_id, output=filename, quiet=False) is None:
        raise RuntimeError(f"Download of {filename} (Drive id {file_id}) failed")

--2026-02-10 00:16:59--  https://drive.google.com/uc?export=download&id=1QbOSExVJEbPMhjzaua5n2eIXeF3qELQ7
Resolving drive.google.com (drive.google.com)... 172.253.118.101, 172.253.118.102, 172.253.118.139, ...
Connecting to drive.google.com (drive.google.com)|172.253.118.101|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1QbOSExVJEbPMhjzaua5n2eIXeF3qELQ7&export=download [following]
--2026-02-10 00:16:59--  https://drive.usercontent.google.com/download?id=1QbOSExVJEbPMhjzaua5n2eIXeF3qELQ7&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 74.125.200.132, 2404:6800:4003:c00::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|74.125.200.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 60000 (59K) [application/octet-stream]
Saving to: ‘label.txt’


2026-02-10 00:17:00 (137 MB/s) - ‘label.txt’ saved [60000/60000]

D

## Load and Format Data

In [None]:
# ======================
# 3. LOAD AND FORMAT DATA
# ======================
def load_data(label_path='label.txt', seq_path='encoded_seq.txt',
              seq_length=400, channels=4):
    """Load the SpliceFinder labels and encoded sequences from text files.

    Each row of the sequence file is a flattened sequence of
    ``seq_length * channels`` values (400 bases x 4 one-hot channels = 1600 by
    default), which is reshaped to ``(seq_length, channels)``.

    Args:
        label_path: Text file with one label per sequence.
        seq_path: Text file with one flattened encoded sequence per row.
        seq_length: Number of positions per sequence.
        channels: Number of values per position.

    Returns:
        Tuple ``(x, labels)`` where ``x`` is a float32 array of shape
        ``(N, seq_length, channels)`` and ``labels`` is a 1-D array of length
        ``N`` with the dtype produced by ``np.loadtxt``.

    Raises:
        ValueError: If the sequence file cannot be split into whole sequences
            of ``seq_length * channels`` values, or if the number of labels
            differs from the number of sequences.
    """
    # np.loadtxt collapses a single-line file to a 0-d array; keep labels 1-D
    # so that a one-sample dataset still has shape (1,).
    labels = np.atleast_1d(np.loadtxt(label_path))
    encoded_seq = np.loadtxt(seq_path)

    values_per_sequence = seq_length * channels
    if encoded_seq.size % values_per_sequence != 0:
        raise ValueError(
            f"{seq_path}: {encoded_seq.size} values cannot be split into "
            f"sequences of {seq_length} x {channels}"
        )

    # (N, seq_length * channels) -> (N, seq_length, channels). The cast only
    # changes the dtype; no value normalization is applied.
    x_reshaped = encoded_seq.reshape(-1, seq_length, channels).astype('float32')

    # Fail here with a clear message rather than later inside the split/fit.
    if labels.shape[0] != x_reshaped.shape[0]:
        raise ValueError(
            f"Found {labels.shape[0]} labels in {label_path} but "
            f"{x_reshaped.shape[0]} sequences in {seq_path}"
        )

    return x_reshaped, labels


In [None]:
# Load the full dataset, then report its shape and the per-class sample counts.
X, y = load_data()
class_counts = np.bincount(y.astype(int))
print(f"Dataset shape - X: {X.shape}, y: {y.shape}")
print(f"Class distribution: {class_counts}")

Dataset shape - X: (30000, 400, 4), y: (30000,)
Class distribution: [10000 10000 10000]


## Train/Val/Test Split

In [None]:
# ======================
# 4. TRAIN/VAL/TEST SPLIT
# ======================
# Two stratified splits: first hold out 20% of all samples as the test set,
# then take 20% of the remainder as the validation set (64/16/20 overall).
holdout_kwargs = dict(test_size=0.2, random_state=42)

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, stratify=y, **holdout_kwargs
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, stratify=y_temp, **holdout_kwargs
)

for split_name, split_X, split_y in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"{split_name}: {split_X.shape}, {split_y.shape}")


Train: (19200, 400, 4), (19200,)
Val: (4800, 400, 4), (4800,)
Test: (6000, 400, 4), (6000,)


## Build Model

In [None]:
# ======================
# 5. BUILD SPLICEFINDER MODEL
# ======================
def build_splicefinder_model(seq_length=400, channels=4, num_classes=3,
                             learning_rate=0.001):
    """Build and compile the 1-D CNN classifier used throughout this notebook.

    The notebook describes this architecture as the one from the SpliceFinder
    paper. With the default arguments the layers are:

    - Input: (400, 4)
    - Conv1D: 50 filters, kernel=9, relu, 'same' padding
    - Flatten
    - Dense: 100 units, relu
    - Dense: 3 units, softmax (EI, IE, N)

    Args:
        seq_length: Number of positions in each input sequence.
        channels: Number of values per position.
        num_classes: Number of output classes (size of the softmax layer).
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``keras.Sequential`` model using sparse categorical
        cross-entropy loss (integer class labels) and an accuracy metric.
    """
    model = keras.Sequential([
        layers.Input(shape=(seq_length, channels), name='input_layer'),
        layers.Conv1D(
            filters=50,
            kernel_size=9,
            activation='relu',
            # 'same' padding keeps the output length equal to seq_length.
            padding='same',
            name='conv1d_layer'
        ),
        layers.Flatten(name='flatten_layer'),
        layers.Dense(100, activation='relu', name='dense_100'),
        layers.Dense(num_classes, activation='softmax', name='output_layer')
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        # "sparse" variant: labels are class indices, not one-hot vectors.
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

In [None]:
# Build one instance just to print the layer/parameter summary.
# The training cells below each build their own fresh model rather than
# reusing this one.
model = build_splicefinder_model()
model.summary()


## Early Stopping Callback

In [None]:
# ======================
# 6. EARLY STOPPING CALLBACK
# ======================
# Stop once val_loss has gone 3 epochs without improving, and roll the model
# back to the weights from its best epoch.
early_stopping_config = {
    'monitor': 'val_loss',
    'patience': 3,
    'restore_best_weights': True,
    'verbose': 1,
}
early_stopping = EarlyStopping(**early_stopping_config)

## Train on CPU

In [None]:
# ======================
# 7. TRAIN ON CPU
# ======================
print("\n" + "="*60)
print("TRAINING ON CPU")
print("="*60)

# Drop any model/graph state left over from earlier cells.
tf.keras.backend.clear_session()

# Seed Python, NumPy and TensorFlow before building the model so the initial
# weights and the batch shuffling order are repeatable across runs. 42 matches
# the random_state used for the train/val/test split.
keras.utils.set_random_seed(42)

with tf.device('/CPU:0'):
    # Build a fresh model so nothing is carried over from previous training.
    model_cpu = build_splicefinder_model()

    # Time only the fit() call. perf_counter() is a monotonic high-resolution
    # clock, unlike time.time(), which can jump if the system clock changes.
    start_time = time.perf_counter()

    history_cpu = model_cpu.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50,
        batch_size=32,
        callbacks=[early_stopping],
        verbose=1
    )

    cpu_time = time.perf_counter() - start_time

# Number of epochs actually run (early stopping may end training before 50).
cpu_epochs = len(history_cpu.history['loss'])
print(f"\nCPU Training completed in {cpu_time:.2f} seconds")
print(f"CPU Training epochs: {cpu_epochs}")



TRAINING ON CPU
Epoch 1/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 33ms/step - accuracy: 0.8039 - loss: 0.4740 - val_accuracy: 0.9712 - val_loss: 0.1005
Epoch 2/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 30ms/step - accuracy: 0.9794 - loss: 0.0688 - val_accuracy: 0.9698 - val_loss: 0.0939
Epoch 3/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 33ms/step - accuracy: 0.9936 - loss: 0.0298 - val_accuracy: 0.9673 - val_loss: 0.1116
Epoch 4/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 30ms/step - accuracy: 0.9966 - loss: 0.0129 - val_accuracy: 0.9685 - val_loss: 0.1267
Epoch 5/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 33ms/step - accuracy: 0.9984 - loss: 0.0068 - val_accuracy: 0.9663 - val_loss: 0.1385
Epoch 5: early stopping
Restoring model weights from the end of the best epoch: 2.

CPU Training completed in 97.61 seconds
CPU Training epochs: 5


## Train on GPU

In [None]:

# ======================
# 8. TRAIN ON GPU (if available)
# ======================
print("\n" + "="*60)
print("TRAINING ON GPU")
print("="*60)

# Drop any model/graph state left over from earlier cells.
tf.keras.backend.clear_session()

# Seed Python, NumPy and TensorFlow before building the model so the initial
# weights and the batch shuffling order are repeatable across runs. 42 matches
# the random_state used for the train/val/test split.
keras.utils.set_random_seed(42)

if tf.config.list_physical_devices('GPU'):
    with tf.device('/GPU:0'):
        # Build a fresh model so nothing is carried over from previous training.
        model_gpu = build_splicefinder_model()

        # Time only the fit() call. perf_counter() is a monotonic
        # high-resolution clock, unlike time.time().
        start_time = time.perf_counter()

        history_gpu = model_gpu.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=50,
            batch_size=32,
            callbacks=[early_stopping],
            verbose=1
        )

        gpu_time = time.perf_counter() - start_time

    # Number of epochs actually run (early stopping may end training before 50).
    gpu_epochs = len(history_gpu.history['loss'])
    print(f"\nGPU Training completed in {gpu_time:.2f} seconds")
    print(f"GPU Training epochs: {gpu_epochs}")
else:
    # Leave explicit "not measured" markers for the results cells below.
    print("No GPU available. Skipping GPU training.")
    gpu_time = None
    gpu_epochs = None



TRAINING ON GPU
Epoch 1/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.8128 - loss: 0.4888 - val_accuracy: 0.9617 - val_loss: 0.1264
Epoch 2/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9783 - loss: 0.0772 - val_accuracy: 0.9690 - val_loss: 0.1006
Epoch 3/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9878 - loss: 0.0416 - val_accuracy: 0.9658 - val_loss: 0.1056
Epoch 4/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9938 - loss: 0.0224 - val_accuracy: 0.9650 - val_loss: 0.1425
Epoch 5/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9979 - loss: 0.0087 - val_accuracy: 0.9665 - val_loss: 0.1413
Epoch 5: early stopping
Restoring model weights from the end of the best epoch: 2.

GPU Training completed in 15.62 seconds
GPU Training epochs: 5


## Compare

In [None]:
# ======================
# 9. COMPARISON WITH 40 EPOCHS (NO EARLY STOPPING)
# ======================
# Section banner for the fixed-length (40 epoch) timing runs that follow.
banner = "=" * 60
print(f"\n{banner}")
print("COMPARISON WITH 40 EPOCHS (NO EARLY STOPPING)")
print(banner)


COMPARISON WITH 40 EPOCHS (NO EARLY STOPPING)


In [None]:
# Clear Keras' global session state before the 40-epoch runs below, so they do
# not build on models created by the early-stopping experiments.
tf.keras.backend.clear_session()


In [None]:
# CPU - 40 epochs (no early stopping, so every run does the same amount of work)
print("\nCPU - Training for 40 epochs...")

# Start from a clean session and a fixed seed, mirroring the GPU cell, so both
# devices train from the same initial weights and shuffle order.
tf.keras.backend.clear_session()
keras.utils.set_random_seed(42)

with tf.device('/CPU:0'):
    model_cpu_40 = build_splicefinder_model()

    # perf_counter() is a monotonic high-resolution clock, unlike time.time().
    start_time = time.perf_counter()
    history_cpu_40 = model_cpu_40.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=40,
        batch_size=32,
        # One summary line per epoch: this run takes many minutes on CPU, and
        # with verbose=0 there is no way to tell whether it is still working.
        verbose=2
    )
    cpu_time_40 = time.perf_counter() - start_time
print(f"CPU - 40 epochs time: {cpu_time_40:.2f} seconds")



CPU - Training for 40 epochs...


KeyboardInterrupt: 

In [None]:
# GPU - 40 epochs (no early stopping, so every run does the same amount of work)
if tf.config.list_physical_devices('GPU'):
    print("\nGPU - Training for 40 epochs...")

    # Start from a clean session and a fixed seed so this run trains from the
    # same initial weights and shuffle order on every execution.
    tf.keras.backend.clear_session()
    keras.utils.set_random_seed(42)

    with tf.device('/GPU:0'):
        model_gpu_40 = build_splicefinder_model()

        # perf_counter() is a monotonic high-resolution clock, unlike time.time().
        start_time = time.perf_counter()
        history_gpu_40 = model_gpu_40.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=40,
            batch_size=32,
            # One summary line per epoch so progress is visible during the run.
            verbose=2
        )
        gpu_time_40 = time.perf_counter() - start_time
    print(f"GPU - 40 epochs time: {gpu_time_40:.2f} seconds")
else:
    # Explicit "not measured" marker for any later reporting cell.
    gpu_time_40 = None


In [None]:
# ======================
# 10. RESULTS COMPARISON
# ======================
print("\n" + "="*60)
print("FINAL RESULTS COMPARISON")
print("="*60)

print(f"\nEARLY STOPPING (patience=3):")
print(f"CPU Time: {cpu_time:.2f} seconds ({cpu_epochs} epochs)")
# gpu_time is None when no GPU was available; test for that explicitly rather
# than relying on the truthiness of a float.
if gpu_time is not None:
    print(f"GPU Time: {gpu_time:.2f} seconds ({gpu_epochs} epochs)")
    speedup_early = cpu_time / gpu_time
    print(f"Speedup (GPU vs CPU): {speedup_early:.2f}x")
else:
    print("GPU: Not available")

# The 40-epoch timings exist only if the long-running cells above finished
# (the CPU run is easy to interrupt), so look them up defensively instead of
# risking a NameError here.
cpu_time_40_measured = globals().get("cpu_time_40")
gpu_time_40_measured = globals().get("gpu_time_40")

print(f"\nFIXED 40 EPOCHS (no early stopping):")
if cpu_time_40_measured is not None:
    print(f"CPU Time: {cpu_time_40_measured:.2f} seconds (40 epochs)")
else:
    print("CPU Time: not measured")
if gpu_time_40_measured is not None:
    print(f"GPU Time: {gpu_time_40_measured:.2f} seconds (40 epochs)")
else:
    print("GPU Time: not measured")
if cpu_time_40_measured is not None and gpu_time_40_measured is not None:
    speedup_40 = cpu_time_40_measured / gpu_time_40_measured
    print(f"Speedup (GPU vs CPU): {speedup_40:.2f}x")





FINAL RESULTS COMPARISON

EARLY STOPPING (patience=3):
CPU Time: 97.61 seconds (5 epochs)
GPU Time: 15.62 seconds (5 epochs)
Speedup (GPU vs CPU): 6.25x


In [None]:
# ======================
# 11. RESULTS SUMMARY
# ======================
# Compact recap of the early-stopping runs (cpu_time / gpu_time from the
# "Train on CPU" and "Train on GPU" cells above).
print("\n" + "="*50)
print("RESULTS SUMMARY")
print("="*50)
print(f"CPU Training Time: {cpu_time:.2f} seconds ({cpu_epochs} epochs)")
# gpu_time is None when no GPU was available; test for that explicitly rather
# than relying on the truthiness of a float.
if gpu_time is not None:
    print(f"GPU Training Time: {gpu_time:.2f} seconds ({gpu_epochs} epochs)")
    speedup = cpu_time / gpu_time
    print(f"Speedup (GPU vs CPU): {speedup:.2f}x")
    print(f"GPU is {speedup:.2f} times faster than CPU")
else:
    print("GPU: Not available")


RESULTS SUMMARY
CPU Training Time: 21.81 seconds (8 epochs)
GPU Training Time: 8.99 seconds (4 epochs)
Speedup (GPU vs CPU): 2.43x
GPU is 2.43 times faster than CPU
