In [1]:
import os

import pandas as pd
from IPython.display import display

from tensorflow import keras

from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

In [2]:
# Step 1: Load your dataset
PROTEIN_ECOLI = 'ECOLI_18332_Proteins.csv'

# Load the dataset
protein_ecoli_data = pd.read_csv(PROTEIN_ECOLI).dropna().drop_duplicates()

In [4]:
# Step 3: Ensure every class is present in both train and validation sets
# Split the duplicated train_set into train and validation sets with stratification

train_set, temp_set = train_test_split(protein_ecoli_data, stratify = protein_ecoli_data['label'], test_size = 0.3, random_state = 42)
test_set, valid_set = train_test_split(temp_set, test_size = 0.5, random_state = 42)


print(f'{len(train_set)} training set records, {len(valid_set)} validation set records, {len(test_set)} test set records.')

12832 training set records, 2750 validation set records, 2750 test set records.


In [5]:
# Extract unique labels from your dataset
all_labels = set(set(train_set['label']).union(set(valid_set['label'])).union(set(test_set['label'])))

# Convert the set of all labels to a sorted list
UNIQUE_LABELS = sorted(list(all_labels))

# A local (non-global) binary output
OUTPUT_TYPE = OutputType(False, 'categorical')
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

In [7]:
# Step 3: Load Pre-trained ProteinBERT Model
pretrained_model_generator, input_encoder = load_pretrained_model()

# Step 4: Create the Fine-tuning Model
model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, 
    pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs, dropout_rate=0.5)

# Step 5: Set up Callbacks for Training
training_callbacks = [
    keras.callbacks.ReduceLROnPlateau(patience = 1, factor = 0.25, min_lr = 1e-05, verbose = 1),
    keras.callbacks.EarlyStopping(patience = 2, restore_best_weights = True),
]

# Step 6: Fine-tune the Model
finetune(model_generator, input_encoder, OUTPUT_SPEC, train_set['seq'], train_set['label'], valid_set['seq'], valid_set['label'], 
    seq_len=512, batch_size=32, max_epochs_per_stage=5, lr=1e-04, begin_with_frozen_pretrained_layers=True, 
    lr_with_frozen_pretrained_layers=1e-02, n_final_epochs=1, final_seq_len=1024, final_lr=1e-05, callbacks=training_callbacks)

# Step 7: Evaluate the Model on the Test Set
results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_set['seq'], test_set['label'], start_seq_len=1024, start_batch_size=32)

print('Test-set performance:')
print(results)
print('Confusion matrix:')
print(confusion_matrix)

[2024_08_17-14:00:22] Training set: Filtered out 1565 of 12832 (12.2%) records of lengths exceeding 510.
[2024_08_17-14:00:23] Validation set: Filtered out 343 of 2750 (12.5%) records of lengths exceeding 510.
[2024_08_17-14:00:23] Training with frozen pretrained layers...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
[2024_08_17-14:48:17] Training the entire fine-tuned model...
[2024_08_17-14:48:38] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[2024_08_17-18:03:43] Training on final epochs of sequence length 1024...
[2024_08_17-18:03:43] Training set: Filtered out 161 of 12832 (1.3%) records of lengths exceeding 1022.
[2024_08_17-18:03:45] Validation set: Filtered out 34 of 2750 (1.2%) records of lengths exceeding 1022.
Test-set performance:
               # records  Accuracy
Model seq len                     
1024              