In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os
# Walk the Kaggle input directory and print the full path of every file beneath it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gene-sequences-dataframe/gene_sequences.csv


### Necessary Imports

In [None]:
# necessary imports

!pip install scikeras

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras 
from keras import layers, models
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, LSTM, RepeatVector, Reshape, TimeDistributed, Conv1D, MaxPooling1D, Flatten
from keras.callbacks import EarlyStopping
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
from scipy.stats import ttest_ind

In [None]:
# function definitions 

def onehote(sequence):
    """One-hot encode a nucleotide sequence.

    Every character is turned into a 4-element row holding a single 1.0 in the
    column assigned to A, C, G or T ("U" shares the "T" column).

    Parameters
    ----------
    sequence : iterable of str
        Nucleotide characters. Only the upper-case letters A, C, G, T and U
        are recognised; anything else (including lower case) is invalid.

    Returns
    -------
    numpy.ndarray or None
        Float array of shape ``(len(sequence), 4)``; an empty sequence yields
        an array of shape ``(0, 4)``. ``None`` is returned, after printing a
        warning, as soon as an unrecognised character is met, so the caller
        can discard the whole sequence.
    """
    # Column index of each recognised nucleotide.
    mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "U": 3}

    indices = []
    for nuc in sequence:
        if nuc not in mapping:
            # A single bad character invalidates the whole sequence: the
            # caller receives None and drops it (nothing is skipped in place).
            print(f"Warning: Invalid nucleotide '{nuc}' encountered. Skipping.")
            return None
        indices.append(mapping[nuc])

    # Select all rows of the identity matrix in one indexing step instead of
    # rebuilding np.eye(4) per character. The explicit integer dtype keeps the
    # empty case valid and gives it a consistent (0, 4) shape.
    return np.eye(4)[np.asarray(indices, dtype=np.intp)]

In [None]:
# Check for TPU availability and choose the tf.distribute strategy accordingly.
# NOTE(review): `strategy` is only printed in this cell; the model cell visible
# further down is not built inside `strategy.scope()` — confirm this is intended.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # Detect TPU
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)  # Create TPU strategy
    print("Running on TPU:", tpu.master())
# A ValueError raised by any call in the try body is taken to mean "no TPU";
# other exception types are not caught and will stop the cell.
except ValueError:
    strategy = tf.distribute.get_strategy()  # Default strategy if TPU is not available
    print("Running on default strategy")

# Report how many replicas the selected strategy keeps in sync.
print(f"Number of replicas: {strategy.num_replicas_in_sync}")

In [None]:
# Configure GPU

# Restrict TensorFlow to the first physical GPU (with memory growth enabled)
# when at least one is listed; otherwise just report that none was found.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU found!")
else:
    first_gpu = gpus[0]
    try:
        tf.config.experimental.set_memory_growth(first_gpu, True)
        tf.config.set_visible_devices(first_gpu, 'GPU')
        print("Using GPU:", first_gpu)
    except RuntimeError as err:
        # Configuration failed: print the error and carry on.
        print(err)

### Load Data and Change Structure

In [None]:
# Read the gene sequences CSV into a DataFrame and preview its first rows.
csv_path = "/kaggle/input/gene-sequences-dataframe/gene_sequences.csv"
features_df = pd.read_csv(csv_path)
features_df.head()

In [None]:
# Target: the 'regen' column cast to int, as a NumPy array.
y = features_df['regen'].astype(int).to_numpy()

# Features: every column except the gene identifier.
X = features_df.drop(columns=['gene_name'])

# Show which columns are available before the numeric ones are removed.
X_col = X.columns
print(X_col)

# Numeric descriptor columns that are REMOVED from X below, leaving the raw
# 'sequence' column that the following cells encode. 'regen' is not in this
# list, so it also stays in X.
X_numeric_features = [
    "sequence_length", "gc_content", "at_gc_ratio",
    'kmer_3_GGG', 'kmer_3_GGC', 'kmer_3_GCG', 'kmer_3_CGG', 'kmer_3_GCT', 'kmer_3_CTG',
    'kmer_3_TGC', 'kmer_3_GCC', 'kmer_3_CCG', 'kmer_3_CGC', 'kmer_3_CCA', 'kmer_3_CAG',
    'kmer_3_AGC', 'kmer_3_CCC', 'kmer_3_CTA', 'kmer_3_TAA', 'kmer_3_AAG', 'kmer_3_AGG',
    'kmer_3_CTC', 'kmer_3_TCG', 'kmer_3_GGA', 'kmer_3_GAG', 'kmer_3_AGA', 'kmer_3_TCT',
    'kmer_3_TGT', 'kmer_3_GTT', 'kmer_3_TTT', 'kmer_3_TTC', 'kmer_3_TCC', 'kmer_3_AGT',
    'kmer_3_GTC', 'kmer_3_GTG', 'kmer_3_TGG', 'kmer_3_CTT', 'kmer_3_TTA', 'kmer_3_AAA',
    'kmer_3_CAC', 'kmer_3_ACT', 'kmer_3_CGT', 'kmer_3_CAA', 'kmer_3_AAT', 'kmer_3_ATG',
    'kmer_3_TAT', 'kmer_3_TGA', 'kmer_3_GAC', 'kmer_3_ACA', 'kmer_3_TCA', 'kmer_3_CCT',
    'kmer_3_ACG', 'kmer_3_GAT', 'kmer_3_ATC', 'kmer_3_CGA', 'kmer_3_CAT', 'kmer_3_TAC',
    'kmer_3_AAC', 'kmer_3_GCA', 'kmer_3_GAA', 'kmer_3_ACC', 'kmer_3_GGT', 'kmer_3_ATT',
    'kmer_3_TTG', 'kmer_3_ATA', 'kmer_3_GTA', 'kmer_3_TAG', 'kmer_3_CAN', 'kmer_3_ANN',
    'kmer_3_NNN', 'kmer_3_NNT', 'kmer_3_NTA',
]

X = X.drop(columns=X_numeric_features)
X.head()

In [None]:
# Length of the longest sequence string (0 when the column is empty);
# used as the padding length when the encoded sequences are padded.
max_len = max((len(seq) for seq in X['sequence']), default=0)

In [None]:
# One-hot encode every sequence; keep a label only when its sequence encoded
# successfully (onehote returns None for sequences it rejects).
sequences_ohe = []
updated_Y = []
for label, seq in zip(y, X['sequence']):
    encoded_seq = onehote(seq)
    if encoded_seq is not None:
        sequences_ohe.append(encoded_seq)
        updated_Y.append(label)

# Zero-pad at the end so every encoded sequence has max_len positions.
sequences_X_padded = pad_sequences(
    sequences_ohe, maxlen=max_len, padding='post', dtype='float32', value=0
)

# Labels aligned one-to-one with the rows of sequences_X_padded.
Y = np.array(updated_Y)

# 80/20 train/test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    sequences_X_padded, Y, test_size=0.2, random_state=46
)

In [None]:
# Convert the four NumPy splits to float32 tensors.
X_train, X_test, Y_train, Y_test = (
    tf.convert_to_tensor(split, dtype=tf.float32)
    for split in (X_train, X_test, Y_train, Y_test)
)

# Small batches keep the memory requirement low.
batch_size = 4


def _as_batched_dataset(features, labels):
    """Wrap a (features, labels) pair in a batched, prefetching tf.data pipeline."""
    pairs = tf.data.Dataset.from_tensor_slices((features, labels))
    return pairs.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)


train_dataset = _as_batched_dataset(X_train, Y_train)
validation_dataset = _as_batched_dataset(X_test, Y_test)

# Print the shapes of train inputs, train labels, test inputs and test labels.
for tensor in (X_train, Y_train, X_test, Y_test):
    print(tensor.shape)

# Shape of the training inputs; the model cell reads its sizes from this.
input_shape = X_train.shape


### Model 1: Simple CNN/LSTM

In [None]:
# Encoder: Conv1D + max-pooling over the one-hot sequence, then an LSTM that
# summarises the sequence as a single 32-unit vector.
input_layer = Input(shape=(input_shape[1], input_shape[2]))
x = Conv1D(filters=16, kernel_size=3, activation='relu')(input_layer)
x = MaxPooling1D(pool_size=4)(x)
x = Dropout(0.3)(x)

# LSTM for Encoder
encoded = LSTM(32, activation='tanh', recurrent_activation='sigmoid')(x)

# Decoder: repeat the encoded vector once per input position and run a
# sequence-returning LSTM over it.
decoded = RepeatVector(input_shape[1])(encoded)
decoded = LSTM(32, return_sequences=True, activation='tanh', recurrent_activation='sigmoid')(decoded)
decoded = Dropout(0.3)(decoded)

# Per-position probability, shape (batch, sequence_length, 1).
per_position = TimeDistributed(Dense(1, activation='sigmoid'))(decoded)

# Output Layer
# The labels hold ONE binary value per sequence, so the per-position
# probabilities are averaged into a single probability of shape (batch, 1).
# Without this reduction the loss/metric compare (batch,) labels against a
# (batch, sequence_length, 1) output and predict() cannot be scored per sample.
output_layer = layers.GlobalAveragePooling1D()(per_position)

# Model
autoencoder = Model(inputs=input_layer, outputs=output_layer)

# Define EarlyStopping callback
# NOTE(review): validation_dataset is built from the test split, so early
# stopping is tuned on the same data that is reported as test performance.
early_stopping = EarlyStopping(
    monitor='val_loss',     # Metric to monitor (validation loss in this case)
    patience=3,             # Number of epochs with no improvement after which training stops
    restore_best_weights=True  # Restores model weights from the epoch with the best validation loss
)

# Compile the Model
autoencoder.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the first GPU when one is available, otherwise on the CPU, so the
# cell also runs on machines for which the GPU cell printed "No GPU found!".
train_device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'
with tf.device(train_device):
    # Keep the returned History object: the plotting cell reads it.
    history = autoencoder.fit(
        train_dataset,
        epochs=10,
        validation_data=validation_dataset,
        callbacks=[early_stopping],
        verbose=1
    )


In [None]:
# visualize results

# Keras attaches the History callback of the most recent fit() call to the
# model itself, so the training curves can be read from the trained network.
history = autoencoder.history

# Plot training history
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title("Training History: CNN/LSTM with Encoder and Decoder Network")
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Predict with the trained network (named `autoencoder` in the model cell),
# using the same small batch size as training to keep memory use low.
y_pred = autoencoder.predict(X_test, batch_size=batch_size)
# Reduce to one probability per test sequence: any axes after the first are
# averaged away, so both (n, 1) and (n, positions, 1) outputs are handled.
y_pred = np.asarray(y_pred).reshape(len(y_pred), -1).mean(axis=1)
y_pred_binary = (y_pred >= 0.5).astype(int)

# Y_test is a float32 tensor; scikit-learn is given plain integer labels.
y_true = np.asarray(Y_test).astype(int)

cm = confusion_matrix(y_true, y_pred_binary)
# Plot confusion matrix
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["Not Associated", "Associated"], yticklabels=["Not Associated", "Associated"])
plt.title("Confusion Matrix: CNN/LSTM with Encoder and Decoder Network")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Classification report
print(classification_report(y_true, y_pred_binary, target_names=["Not Associated", "Associated"]))

print("Test Accuracy:", accuracy_score(y_true, y_pred_binary))