<a href="https://www.kaggle.com/code/samithsachidanandan/cafa-6-protein-function-prediction-tf?scriptVersionId=273529044" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

### IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
import os


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from sklearn.model_selection import train_test_split
from keras import regularizers

2025-11-04 20:14:04.462481: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762287244.638837      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762287244.701750      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Report the TensorFlow build and the GPUs it can see before doing any work.
print("TensorFlow version:", tf.__version__)
print("GPU Available:", tf.config.list_physical_devices('GPU'))

TensorFlow version: 2.18.0
GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### SETUP & CONFIGURATION

In [3]:
class config:
    """Notebook-wide settings: dataset location and training hyper-parameters."""

    # Root directory of the competition dataset.
    MAIN_DIR = "/kaggle/input/cafa-6-protein-function-prediction"

    # Number of output labels, and the training-loop settings.
    num_labels = 500
    n_epochs = 20
    batch_size = 64  # alternative noted by the author: 128
    lr = 5e-4  # same value as 0.0005
    weight_decay = 1e-5

    # NOTE(review): weight_decay and use_mixed_precision are not read by any
    # code visible in this notebook.
    use_mixed_precision = True

    # '/GPU:0' when TensorFlow reports at least one GPU, '/CPU:0' otherwise.
    device = '/CPU:0' if not tf.config.list_physical_devices('GPU') else '/GPU:0'
    
    
print(f"Using device: {config.device}")


# Directory name under /kaggle/input/ holding each embedding source.
# NOTE(review): the "EMS2" key presumably refers to ESM-2; it is a runtime
# key used by callers, so it is kept as spelled.
embeds_map = dict(
    T5="t5embeds",
    ProtBERT="protbert-embeddings-for-cafa5",
    EMS2="cafa-5-ems-2-embeddings-numpy",
)

# Embedding width per source; passed to build_cnn_model as input_dim.
embeds_dim = dict(T5=1024, ProtBERT=1024, EMS2=1280)

Using device: /GPU:0


### Loading the Data

In [4]:
def load_protein_data(datatype, embeddings_source):
    """Load the saved embeddings and IDs for one data split.

    Args:
        datatype: Split name used as the file prefix (the notebook passes
            "train" or "test").
        embeddings_source: Key of ``embeds_map`` selecting the input dataset.

    Returns:
        ``(embeds, labels, ids)`` when ``datatype == "train"``; otherwise
        ``(embeds, ids)``.
    """
    dataset_dir = f"/kaggle/input/{embeds_map[embeddings_source]}/"

    # The T5 dataset names its embedding file differently from the others.
    if embeddings_source == "T5":
        embeds_file = f"{datatype}_embeds.npy"
    else:
        embeds_file = f"{datatype}_embeddings.npy"

    embeds = np.load(os.path.join(dataset_dir, embeds_file))
    ids = np.load(os.path.join(dataset_dir, f"{datatype}_ids.npy"))

    if datatype != "train":
        return embeds, ids

    # Training targets live in a separate dataset keyed by the label count.
    labels = np.load(
        f"/kaggle/input/train-targets-top{config.num_labels}/train_targets_top{config.num_labels}.npy"
    )
    return embeds, labels, ids

### MODEL ARCHITECTURE: 1D CNN 

We build a 1D Convolutional Neural Network (CNN) for multi-label classification. The input layer reshapes each embedding vector into a single-channel sequence, as required by the Conv1D layers. A first convolution with 64 filters extracts basic features, and three residual convolutional blocks (128, 256 and 512 filters) then extract more advanced features. A GlobalAveragePooling layer reduces the features to a compact form, followed by dense layers with dropout to reduce overfitting. 

In [5]:
def _residual_block(x, filters, kernel_size, dropout_rate, downsample):
    """Apply one Conv-BN-ReLU-dropout stage with a 1x1-conv shortcut.

    When ``downsample`` is true the shortcut uses stride 2 and the main path
    is max-pooled by 2, so both branches halve the sequence length.
    """
    stride = 2 if downsample else 1
    shortcut = layers.Conv1D(filters, 1, strides=stride, padding='same')(x)
    shortcut = layers.BatchNormalization()(shortcut)

    x = layers.Conv1D(filters, kernel_size, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.SpatialDropout1D(dropout_rate)(x)
    if downsample:
        # padding='same' yields ceil(L/2) steps, matching the strided
        # shortcut even for odd L. With the default 'valid' padding the
        # pooled branch would be floor(L/2) and Add() would fail on odd
        # lengths. For even L the result is unchanged.
        x = layers.MaxPooling1D(2, padding='same')(x)
    x = layers.Add()([x, shortcut])
    return layers.Activation('relu')(x)


def build_cnn_model(input_dim, num_classes):
    """Build the residual 1D-CNN used for multi-label prediction.

    Args:
        input_dim: Length of each input embedding vector.
        num_classes: Number of sigmoid outputs (one per label).

    Returns:
        An uncompiled Keras ``Model`` mapping ``(input_dim,)`` vectors to
        ``num_classes`` independent probabilities.
    """
    inputs = layers.Input(shape=(input_dim,))
    # Conv1D needs a channel axis: treat the embedding as a 1-channel sequence.
    x = layers.Reshape((input_dim, 1))(inputs)

    # Stem convolution.
    x = layers.Conv1D(64, 7, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.SpatialDropout1D(0.1)(x)

    # Three residual stages; the first two halve the sequence length.
    x = _residual_block(x, 128, 5, 0.1, downsample=True)
    x = _residual_block(x, 256, 3, 0.2, downsample=True)
    x = _residual_block(x, 512, 3, 0.2, downsample=False)

    x = layers.GlobalAveragePooling1D()(x)

    # Classification head: two L2-regularised dense layers with dropout.
    x = layers.Dense(256, kernel_regularizer=regularizers.l2(0.001))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.5)(x)

    x = layers.Dense(128, kernel_regularizer=regularizers.l2(0.001))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.4)(x)

    outputs = layers.Dense(num_classes, activation='sigmoid')(x)

    model = models.Model(inputs=inputs, outputs=outputs)

    return model

### CUSTOM METRICS

The F1-score is calculated by tracking true positives, false positives, and false negatives during training. Predictions are converted to binary values using a configurable threshold (default 0.1 in the code below). From these values, precision and recall are computed, and the F1-score is derived. The metric is reset after each epoch so that each epoch is tracked correctly. The metric has configuration methods to ensure it is fully serializable and can be saved and loaded with the model. Keras will automatically serialize and deserialize this metric when the model is trained and reloaded, because the class is decorated with the @keras.utils.register_keras_serializable decorator.

In [6]:
@keras.utils.register_keras_serializable(package="Custom", name="MultilabelF1Score")
class MultilabelF1Score(keras.metrics.Metric):
    """
    F1 score pooled over every (sample, label) entry.

    Predictions are binarised with ``threshold``; the default is deliberately
    low (0.1 rather than 0.5) for compatibility with focal loss.
    """

    def __init__(self, num_labels=500, threshold=0.1, name='f1_score', **kwargs):
        super().__init__(name=name, **kwargs)
        # num_labels is stored for get_config() only; the counts below are
        # global sums and do not depend on it.
        self.num_labels = num_labels
        self.threshold = threshold
        self.true_positives = self.add_weight(name='tp', initializer='zeros')
        self.false_positives = self.add_weight(name='fp', initializer='zeros')
        self.false_negatives = self.add_weight(name='fn', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate TP/FP/FN counts for one batch (sample_weight is ignored)."""
        truth = tf.cast(y_true, tf.float32)
        hits = tf.cast(y_pred > self.threshold, tf.float32)

        self.true_positives.assign_add(tf.reduce_sum(truth * hits))
        self.false_positives.assign_add(tf.reduce_sum((1 - truth) * hits))
        self.false_negatives.assign_add(tf.reduce_sum(truth * (1 - hits)))

    def result(self):
        """Return the F1 score computed from the accumulated counts."""
        eps = tf.keras.backend.epsilon()
        tp = self.true_positives
        precision = tp / (tp + self.false_positives + eps)
        recall = tp / (tp + self.false_negatives + eps)
        return 2 * (precision * recall) / (precision + recall + eps)

    def reset_state(self):
        """Zero all three counters."""
        for counter in (self.true_positives, self.false_positives, self.false_negatives):
            counter.assign(0)

    def get_config(self):
        """Return the base metric config extended with this metric's arguments."""
        settings = super().get_config()
        settings['num_labels'] = self.num_labels
        settings['threshold'] = self.threshold
        return settings

    @classmethod
    def from_config(cls, config):
        """Rebuild the metric from a dict produced by get_config()."""
        return cls(**config)

### THRESHOLD FINDING FUNCTION

This function searches a range of candidate thresholds and returns the one that gives the highest F1-score on the validation set.

In [7]:
def find_best_threshold(model, X_val, y_val, thresholds=np.arange(0.05, 0.55, 0.05)):
    """Scan candidate cut-offs and pick the one with the highest pooled F1.

    Args:
        model: Object exposing ``predict(X, batch_size=..., verbose=...)``.
        X_val: Validation inputs passed to ``model.predict``.
        y_val: Binary target array with the same shape as the predictions.
        thresholds: Candidate cut-offs to evaluate.

    Returns:
        ``(best_f1, best_thresh)``. If no candidate scores above 0 the pair
        stays at its initial value ``(0, 0.5)``.
    """
    probs = model.predict(X_val, batch_size=config.batch_size, verbose=0)

    eps = 1e-7
    top_f1, top_cutoff = 0, 0.5
    rows = []

    for cutoff in thresholds:
        hard = (probs > cutoff).astype(np.float32)

        # Counts are pooled over every (sample, label) entry.
        tp = np.sum(y_val * hard)
        fp = np.sum((1 - y_val) * hard)
        fn = np.sum(y_val * (1 - hard))

        precision = tp / (tp + fp + eps)
        recall = tp / (tp + fn + eps)
        f1 = 2 * (precision * recall) / (precision + recall + eps)

        rows.append({
            'threshold': cutoff,
            'f1': f1,
            'precision': precision,
            'recall': recall
        })

        # Strict '>' keeps the earliest cut-off on ties.
        if f1 > top_f1:
            top_f1, top_cutoff = f1, cutoff

    print("\nThreshold Analysis:")
    print(pd.DataFrame(rows).to_string())

    return top_f1, top_cutoff

In [8]:
def augment_embeddings(embeddings, labels, augment_factor=0.2, noise_std=0.01):
    """Append noisy copies of randomly chosen samples to a training set.

    ``int(len(embeddings) * augment_factor)`` rows are drawn with replacement,
    perturbed with zero-mean Gaussian noise, and appended after the original
    rows. The inputs themselves are not modified.

    Args:
        embeddings: Float array whose first axis indexes samples.
        labels: Array whose first axis matches ``embeddings``; may be 1-D.
        augment_factor: Fraction of the dataset size to add as noisy copies.
        noise_std: Standard deviation of the Gaussian noise (default 0.01,
            the value previously hard-coded).

    Returns:
        ``(X_combined, y_combined)`` with the originals first and the
        augmented rows after them.
    """
    n_augment = int(len(embeddings) * augment_factor)

    # Uses NumPy's global RNG; seed it beforehand for reproducible output.
    indices = np.random.choice(len(embeddings), n_augment, replace=True)

    # Fancy indexing already returns fresh arrays, so the in-place add below
    # never touches the caller's data and keeps the embedding dtype.
    augmented_embeddings = embeddings[indices]
    augmented_labels = labels[indices]

    augmented_embeddings += np.random.normal(0, noise_std, augmented_embeddings.shape)

    # Concatenating along axis 0 matches vstack for 2-D input and also
    # handles 1-D label vectors, which vstack would stack as mismatched rows.
    X_combined = np.concatenate([embeddings, augmented_embeddings], axis=0)
    y_combined = np.concatenate([labels, augmented_labels], axis=0)

    return X_combined, y_combined

### TRAINING FUNCTION

The function trains a convolutional neural network using the specified embedding source. It monitors performance on a validation set, automatically checkpoints the model with the best validation AUC, and then evaluates the F1-score across multiple thresholds. Finally, it returns the trained model, the optimal prediction threshold, and the training history.

In [9]:
def focal_loss(gamma=2.0, alpha=0.25):
    """Create a binary focal-loss function for sigmoid outputs.

    Args:
        gamma: Exponent applied to ``(1 - p_t)``; larger values down-weight
            entries the model already predicts confidently.
        alpha: Weight for entries whose target equals 1; all other entries
            are weighted ``1 - alpha``.

    Returns:
        A ``loss(y_true, y_pred)`` callable returning the mean focal loss.
    """
    def loss(y_true, y_pred):
        # Clip so the logarithm below never sees exactly 0 or 1.
        probs = tf.clip_by_value(y_pred, 1e-7, 1 - 1e-7)
        is_positive = tf.equal(y_true, 1)

        # p_t is the probability the model assigned to the true class.
        p_t = tf.where(is_positive, probs, 1 - probs)
        alpha_t = tf.where(is_positive, alpha, 1 - alpha)

        modulating = alpha_t * tf.pow(1 - p_t, gamma)
        per_entry = -modulating * tf.math.log(p_t)
        return tf.reduce_mean(per_entry)
    return loss



def train_model(embeddings_source, model_type="convolutional", train_size=0.9, use_augmentation=True):
    """Train one CNN on a single train/validation split.

    Args:
        embeddings_source: Key of ``embeds_map`` / ``embeds_dim``.
        model_type: Accepted but not read; the CNN is always built.
        train_size: Fraction of the data used for training.
        use_augmentation: Whether to add noisy copies to the training split.

    Returns:
        ``(model, best_threshold, history)``.
    """
    print("Loading training data...")
    features, targets, _ = load_protein_data("train", embeddings_source)

    # Fixed random_state keeps the split identical between runs.
    X_train, X_val, y_train, y_val = train_test_split(
        features, targets, train_size=train_size, random_state=42
    )

    if use_augmentation:
        print("Applying data augmentation...")
        X_train, y_train = augment_embeddings(X_train, y_train, augment_factor=0.2)
        print(f"Training samples after augmentation: {len(X_train)}")

    print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

    model = build_cnn_model(
        input_dim=embeds_dim[embeddings_source],
        num_classes=config.num_labels,
    )
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.0005),
        loss=focal_loss(gamma=2.0, alpha=0.25),
        metrics=[
            MultilabelF1Score(num_labels=config.num_labels, threshold=0.1),
            keras.metrics.AUC(name='auc', multi_label=True),
        ],
    )

    # Both callbacks track validation AUC: one saves the best model to disk,
    # the other stops after 5 epochs without improvement.
    callbacks = [
        ModelCheckpoint(
            'best_model.keras',
            monitor='val_auc',
            save_best_only=True,
            mode='max',
            verbose=1,
        ),
        EarlyStopping(
            monitor='val_auc',
            patience=5,
            mode='max',
            restore_best_weights=True,
            verbose=1,
        ),
    ]

    history = model.fit(
        X_train, y_train,
        batch_size=config.batch_size,
        epochs=config.n_epochs,
        validation_data=(X_val, y_val),
        callbacks=callbacks,
        verbose=1,
    )

    best_val_f1, best_threshold = find_best_threshold(model, X_val, y_val)
    print(f"\nBest Validation F1: {best_val_f1:.4f} at threshold {best_threshold:.2f}")

    return model, best_threshold, history


# Train a single model on the "EMS2" embeddings with augmentation enabled.
ems2_model, best_threshold, history = train_model(
    "EMS2", model_type="convolutional", use_augmentation=True
)

Loading training data...
Applying data augmentation...
Training samples after augmentation: 153625
Training samples: 153625, Validation samples: 14225


I0000 00:00:1762287271.586349      37 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Epoch 1/20


I0000 00:00:1762287286.661035      97 service.cc:148] XLA service 0x7e599000d2f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1762287286.661725      97 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1762287287.719628      97 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m   5/2401[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:17[0m 32ms/step - auc: 0.4668 - f1_score: 0.0985 - loss: 0.6691  

I0000 00:00:1762287296.686723      97 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m2401/2401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - auc: 0.5010 - f1_score: 0.1082 - loss: 0.0909
Epoch 1: val_auc improved from -inf to 0.50035, saving model to best_model.keras
[1m2401/2401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 35ms/step - auc: 0.5010 - f1_score: 0.1082 - loss: 0.0909 - val_auc: 0.5003 - val_f1_score: 0.0986 - val_loss: 0.0172
Epoch 2/20
[1m2399/2401[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 30ms/step - auc: 0.5015 - f1_score: 0.1078 - loss: 0.0174
Epoch 2: val_auc did not improve from 0.50035
[1m2401/2401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 30ms/step - auc: 0.5015 - f1_score: 0.1078 - loss: 0.0174 - val_auc: 0.4989 - val_f1_score: 0.1034 - val_loss: 0.0170
Epoch 3/20
[1m2399/2401[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 30ms/step - auc: 0.4997 - f1_score: 0.1052 - loss: 0.0171
Epoch 3: val_auc improved from 0.50035 to 0.50545, saving model to best_model.keras
[1m2401/2401

In [10]:
def predict_with_tta(model, X_test, threshold, n_tta=5):
    """Average predictions over several noisy copies of the inputs.

    The first pass uses ``X_test`` unchanged; each later pass adds zero-mean
    Gaussian noise (std 0.005) before predicting.

    Args:
        model: Object exposing ``predict(X, batch_size=..., verbose=...)``.
        X_test: Array of input embeddings.
        threshold: Accepted for interface compatibility; not used here.
        n_tta: Total number of passes, including the clean one. Must be >= 1.

    Returns:
        The element-wise mean of the per-pass prediction arrays.

    Raises:
        ValueError: If ``n_tta`` is smaller than 1 (averaging zero passes
            would otherwise silently yield NaN).
    """
    if n_tta < 1:
        raise ValueError(f"n_tta must be at least 1, got {n_tta}")

    X_test = np.asarray(X_test)
    # np.random.normal returns float64; adding it to float32 embeddings would
    # upcast (and double the size of) every noisy copy.
    match_dtype = np.issubdtype(X_test.dtype, np.floating)

    print(f"Generating predictions with {n_tta} TTA iterations...")
    all_predictions = []

    for i in range(n_tta):
        if i == 0:
            batch_input = X_test
        else:
            noise = np.random.normal(0, 0.005, X_test.shape)
            if match_dtype:
                noise = noise.astype(X_test.dtype, copy=False)
            batch_input = X_test + noise

        preds = model.predict(batch_input, batch_size=config.batch_size, verbose=0)
        all_predictions.append(preds)
        print(f"TTA iteration {i+1}/{n_tta} complete")

    final_predictions = np.mean(all_predictions, axis=0)
    print("TTA averaging complete")

    return final_predictions

In [11]:
def predict_ensemble(models, embeddings_source, thresholds, use_tta=False):
    """Average the predictions of several models and build a submission frame.

    Args:
        models: Non-empty sequence of trained models.
        embeddings_source: Key of ``embeds_map`` selecting the test embeddings.
        thresholds: One threshold per model; their mean is the cut-off
            applied to the averaged probabilities.
        use_tta: If true, each model predicts via ``predict_with_tta`` with
            3 passes instead of a single ``model.predict`` call.

    Returns:
        DataFrame with columns ``Id``, ``GO term`` and ``Confidence`` holding
        every (protein, term) pair whose averaged probability exceeds the
        ensemble threshold. The columns are present even when no pair passes.

    Raises:
        ValueError: If ``models`` is empty or its length differs from
            ``thresholds`` (zip would otherwise drop entries silently).
    """
    if len(models) == 0:
        raise ValueError("predict_ensemble needs at least one model")
    if len(models) != len(thresholds):
        raise ValueError(
            f"got {len(models)} models but {len(thresholds)} thresholds"
        )

    print("\n=== ENSEMBLE PREDICTION ===")
    print(f"Number of models in ensemble: {len(models)}")

    print("\nLoading test data...")
    X_test, test_ids = load_protein_data("test", embeddings_source)

    # Label names are the most frequent terms, most frequent first.
    # NOTE(review): assumes this order matches the columns of the
    # train_targets_top{N}.npy file used for training — confirm.
    labels_df = pd.read_csv(os.path.join(config.MAIN_DIR, "Train/train_terms.tsv"), sep="\t")
    top_terms = labels_df.groupby("term")["EntryID"].count().sort_values(ascending=False)
    labels_names = top_terms.head(config.num_labels).index.values

    all_predictions = []
    for idx, (model, threshold) in enumerate(zip(models, thresholds)):
        print(f"\nModel {idx+1}/{len(models)} - Threshold: {threshold:.2f}")

        if use_tta:
            predictions = predict_with_tta(model, X_test, threshold, n_tta=3)
        else:
            predictions = model.predict(X_test, batch_size=config.batch_size, verbose=1)

        all_predictions.append(predictions)

    print("\nAveraging ensemble predictions...")
    ensemble_predictions = np.mean(all_predictions, axis=0)

    ensemble_threshold = np.mean(thresholds)
    print(f"Using ensemble threshold: {ensemble_threshold:.2f}")

    # Vectorised selection: np.nonzero walks the matrix row by row, so the
    # output order (protein first, then label index) matches a nested loop
    # without building millions of per-row dicts in Python.
    test_ids = np.asarray(test_ids)
    scored = ensemble_predictions[:len(test_ids)]
    rows, cols = np.nonzero(scored > ensemble_threshold)

    submission_df = pd.DataFrame({
        "Id": test_ids[rows],
        "GO term": labels_names[cols],
        "Confidence": scored[rows, cols].astype(np.float64),
    })
    print(f"ENSEMBLE PREDICTIONS COMPLETE. Generated {len(submission_df)} predictions.")

    return submission_df

In [12]:

def predict_in_batches(model, embeddings_source, threshold, batch_size=5000):
    """Predict the test set in chunks and collect above-threshold terms.

    Args:
        model: Trained model exposing ``predict``.
        embeddings_source: Key of ``embeds_map`` selecting the test embeddings.
        threshold: Probability cut-off applied to every label.
        batch_size: Number of proteins handled per outer chunk.

    Returns:
        DataFrame built from one record per (protein, term) pair above
        ``threshold``, with keys ``Id``, ``GO term`` and ``Confidence``.
    """
    import gc

    print("\n=== BATCH PREDICTION (Memory-Efficient) ===")

    print("Loading test data...")
    X_test, test_ids = load_protein_data("test", embeddings_source)

    # Label names are the most frequent terms, most frequent first.
    terms = pd.read_csv(os.path.join(config.MAIN_DIR, "Train/train_terms.tsv"), sep="\t")
    term_counts = terms.groupby("term")["EntryID"].count().sort_values(ascending=False)
    labels_names = term_counts.head(config.num_labels).index.values

    n_samples = len(X_test)
    n_batches = (n_samples + batch_size - 1) // batch_size  # ceiling division
    print(f"Processing {n_samples} samples in {n_batches} batches...")

    results = []
    for batch_no in tqdm(range(n_batches), desc="Batch prediction"):
        lo = batch_no * batch_size
        hi = min(lo + batch_size, n_samples)
        batch_ids = test_ids[lo:hi]

        predictions = model.predict(X_test[lo:hi], batch_size=128, verbose=0)

        for row, protein_id in enumerate(batch_ids):
            protein_probs = predictions[row]
            results.extend(
                {
                    "Id": protein_id,
                    "GO term": labels_names[idx],
                    "Confidence": float(protein_probs[idx])
                }
                for idx in np.flatnonzero(protein_probs > threshold)
            )

        # Every 10th chunk (including the first): free memory and reset
        # Keras' global state.
        if batch_no % 10 == 0:
            del predictions
            gc.collect()
            tf.keras.backend.clear_session()

    submission_df = pd.DataFrame(results)
    print(f"BATCH PREDICTIONS COMPLETE. Generated {len(submission_df)} predictions.")

    return submission_df

In [13]:

def train_with_kfold(embeddings_source, n_folds=5, use_augmentation=True):
    """Train one CNN per fold of a shuffled K-fold split.

    Args:
        embeddings_source: Key of ``embeds_map`` / ``embeds_dim``.
        n_folds: Number of folds (and therefore models).
        use_augmentation: Whether to add noisy copies to each training fold.

    Returns:
        ``(models, thresholds, histories)``, each a list with one entry per
        fold in fold order.
    """
    from sklearn.model_selection import KFold

    rule = "=" * 70

    print("\n" + rule)
    print(f"TRAINING WITH {n_folds}-FOLD CROSS-VALIDATION")
    print(rule)

    print("\nLoading training data...")
    X_full, y_full, _ = load_protein_data("train", embeddings_source)
    print(f"Total samples: {len(X_full)}")

    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    models, thresholds, histories, fold_scores = [], [], [], []

    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(X_full), start=1):
        print("\n" + rule)
        print(f"FOLD {fold_no}/{n_folds}")
        print(rule)

        X_train, y_train = X_full[train_idx], y_full[train_idx]
        X_val, y_val = X_full[val_idx], y_full[val_idx]

        # Only the training part is augmented; validation stays untouched.
        if use_augmentation:
            print("Applying data augmentation...")
            X_train, y_train = augment_embeddings(X_train, y_train, augment_factor=0.2)
            print(f"Augmented training samples: {len(X_train)}")

        print(f"Training: {len(X_train)}, Validation: {len(X_val)}")

        model = build_cnn_model(
            input_dim=embeds_dim[embeddings_source],
            num_classes=config.num_labels,
        )
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=0.0005),
            loss=focal_loss(gamma=2.0, alpha=0.25),
            metrics=[
                MultilabelF1Score(num_labels=config.num_labels, threshold=0.1),
                keras.metrics.AUC(name='auc', multi_label=True),
            ],
        )

        # Every fold writes its checkpoint to the same 'best_model.keras'
        # path, so the file on disk reflects the most recent fold only.
        callbacks = [
            ModelCheckpoint(
                'best_model.keras',
                monitor='val_auc',
                save_best_only=True,
                mode='max',
                verbose=1,
            ),
            EarlyStopping(
                monitor='val_auc',
                patience=5,
                mode='max',
                restore_best_weights=True,
                verbose=1,
            ),
        ]

        history = model.fit(
            X_train, y_train,
            batch_size=config.batch_size,
            epochs=config.n_epochs,
            validation_data=(X_val, y_val),
            callbacks=callbacks,
            verbose=1,
        )

        best_f1, best_thresh = find_best_threshold(model, X_val, y_val)

        print(f"\nFold {fold_no} Results:")
        print(f"  Best F1: {best_f1:.4f}")
        print(f"  Best Threshold: {best_thresh:.2f}")

        models.append(model)
        thresholds.append(best_thresh)
        histories.append(history)
        fold_scores.append(best_f1)

        # Reset Keras' global state before the next fold's model is built.
        tf.keras.backend.clear_session()

    print("\n" + rule)
    print("CROSS-VALIDATION SUMMARY")
    print(rule)
    for fold_no, (score, thresh) in enumerate(zip(fold_scores, thresholds), start=1):
        print(f"Fold {fold_no}: F1={score:.4f}, Threshold={thresh:.2f}")
    print(f"\nMean F1: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
    print(f"Mean Threshold: {np.mean(thresholds):.2f} ± {np.std(thresholds):.4f}")
    print(rule)

    return models, thresholds, histories

In [14]:
def train_single_fold_for_ensemble(embeddings_source, fold_num, train_size=0.9, seed=None):
    """Train one ensemble member on its own seeded train/validation split.

    The best checkpoint (by validation AUC) is written to
    ``ensemble_model_{fold_num}.keras`` and reloaded before the threshold
    search, so the returned model is the checkpointed one.

    Args:
        embeddings_source: Key of ``embeds_map`` / ``embeds_dim``.
        fold_num: Index of this ensemble member; used in the checkpoint file
            name and, when ``seed`` is None, to derive the split seed.
        train_size: Fraction of the data used for training.
        seed: ``random_state`` for the split; defaults to ``fold_num * 42``.

    Returns:
        ``(model, best_thresh)``.
    """
    if seed is None:
        seed = fold_num * 42

    print(f"\n=== Training Ensemble Model {fold_num} (seed={seed}) ===")

    X_train_full, y_train_full, _ = load_protein_data("train", embeddings_source)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full,
        train_size=train_size,
        random_state=seed
    )

    print("Applying data augmentation...")
    X_train, y_train = augment_embeddings(X_train, y_train, augment_factor=0.2)
    print(f"Training samples: {len(X_train)}, Validation: {len(X_val)}")

    model = build_cnn_model(
        input_dim=embeds_dim[embeddings_source],
        num_classes=config.num_labels
    )

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.0005),
        loss=focal_loss(gamma=2.0, alpha=0.25),
        metrics=[
            MultilabelF1Score(num_labels=config.num_labels, threshold=0.1),
            keras.metrics.AUC(name='auc', multi_label=True)
        ]
    )

    # One checkpoint file per ensemble member. Previously the checkpoint was
    # saved as 'best_model.keras' while a different, never-written file name
    # was loaded afterwards, so the reload could not succeed.
    checkpoint_path = f'ensemble_model_{fold_num}.keras'

    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_auc',
        save_best_only=True,
        mode='max',
        verbose=1
    )

    early_stop = EarlyStopping(
        monitor='val_auc',
        patience=5,
        mode='max',
        restore_best_weights=True,
        verbose=1
    )

    model.fit(
        X_train, y_train,
        batch_size=config.batch_size,
        epochs=config.n_epochs,
        validation_data=(X_val, y_val),
        callbacks=[checkpoint, early_stop],
        verbose=1
    )

    # compile=False: the model is only used for prediction from here on, and
    # skipping compilation avoids having to deserialize the focal_loss
    # closure, which is not a registered Keras object.
    model = keras.models.load_model(checkpoint_path, compile=False)
    best_f1, best_thresh = find_best_threshold(model, X_val, y_val)

    print(f"Model {fold_num} - F1: {best_f1:.4f}, Threshold: {best_thresh:.2f}")

    tf.keras.backend.clear_session()

    return model, best_thresh

### GENERATING PREDICTIONS 

In [15]:

# Train five fold models on the "EMS2" embeddings, then average their
# test-time-augmented predictions into a single submission frame.
print("Starting K-Fold Cross-Validation Training...")
kfold_models, kfold_thresholds, kfold_histories = train_with_kfold(
    "EMS2", n_folds=5, use_augmentation=True
)

submission_df = predict_ensemble(
    kfold_models, "EMS2", kfold_thresholds, use_tta=True
)

print("K-FOLD ENSEMBLE PREDICTION COMPLETE!")

Starting K-Fold Cross-Validation Training...

TRAINING WITH 5-FOLD CROSS-VALIDATION

Loading training data...
Total samples: 142246

FOLD 1/5
Applying data augmentation...
Augmented training samples: 136555
Training: 136555, Validation: 28450
Epoch 1/20
[1m2134/2134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - auc: 0.4988 - f1_score: 0.1081 - loss: 0.0979
Epoch 1: val_auc improved from -inf to 0.49798, saving model to best_model.keras
[1m2134/2134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 37ms/step - auc: 0.4988 - f1_score: 0.1081 - loss: 0.0979 - val_auc: 0.4980 - val_f1_score: 0.0963 - val_loss: 0.0186
Epoch 2/20
[1m2133/2134[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 30ms/step - auc: 0.4996 - f1_score: 0.1082 - loss: 0.0175
Epoch 2: val_auc did not improve from 0.49798
[1m2134/2134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 32ms/step - auc: 0.4996 - f1_score: 0.1082 - loss: 0.0175 - val_auc: 0.4979 - val_f1_score: 0.0972 -

Processing ensemble predictions: 100%|██████████| 141864/141864 [00:02<00:00, 64734.25it/s]


ENSEMBLE PREDICTIONS COMPLETE. Generated 2536781 predictions.
K-FOLD ENSEMBLE PREDICTION COMPLETE!


### SUBMISSION FILE GENERATION 

In [16]:
print("\nMerging submission files...")

# Second prediction set, read from a headerless three-column TSV.
submission2 = pd.read_csv(
    '/kaggle/input/blast-quick-sprof-zero-pred/submission.tsv',
    sep='\t',
    header=None,
    names=['Id', 'GO term', 'Confidence2'],
)

# The outer join keeps every (Id, GO term) pair found in either source.
subs = submission_df.merge(submission2, how='outer', on=['Id', 'GO term'])

# Prefer the second file's confidence; fall back to the model's confidence
# where the pair is missing from that file.
subs['Confidence_combined'] = subs['Confidence2'].fillna(subs['Confidence'])

final_submission = subs[['Id', 'GO term', 'Confidence_combined']]
final_submission.to_csv('submission.tsv', sep='\t', header=False, index=False)

print("Submission file 'submission.tsv' created successfully!")
print(f"It contains {len(final_submission)} predictions in total.")


Merging submission files...
Submission file 'submission.tsv' created successfully!
It contains 13699025 predictions in total.


Acknowledgement: - [https://www.kaggle.com/code/momerer/cafa-6-protein-function-prediction-with-1d-cnn](https://www.kaggle.com/code/momerer/cafa-6-protein-function-prediction-with-1d-cnn)