# Example script for Hackathon

Within each cycle of active learning, you can:

1. Collect training data (original training data + your query data).

2. Train a prediction model to predict the DMS_score for each mutant (e.g., M0A).

3. Use the trained model to predict the score for all mutants in the test set.

4. Select query mutants for the next round based on certain criteria. You may want to make sure you don't query the same mutant twice, as you only have a limited number of queries in total.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, Dataset
import random
from copy import deepcopy
import pandas as pd
from scipy.stats import spearmanr
import argparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
!pip install -q transformers
!pip install -q accelerate
!pip install -q torch torchvision torchaudio

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/MLCB/Hackathon_data

/content/drive/MyDrive/MLCB/Hackathon_data


## 1. collect training data

Upload `sequence.fasta`, `train.csv`, and `test.csv` to the current runtime:

1. click the folder icon on the left

2. click the upload icon and upload the files to the current directory

In [None]:
with open('sequence.fasta', 'r') as f:
    data = f.readlines()

# data[0] is taken to be the FASTA header. FASTA files may wrap the sequence
# over several lines, so gather every line of the first record (up to the next
# '>' header, if any) instead of reading data[1] only.
_residue_lines = []
for _line in data[1:]:
    if _line.startswith('>'):
        break
    _residue_lines.append(_line.strip())

sequence_wt = ''.join(_residue_lines)
if not sequence_wt:
    # Fail loudly rather than continuing with an empty wild-type sequence.
    raise ValueError("No sequence found after the header line of sequence.fasta")
sequence_wt

'MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLREKMRRRLESGDKWFSLEFFPPRTAEGAVNLISRFDRMAAGGPLYIDVTWHPAGDPGSDKETSSMMIASTAVNYCGLETILHMTCCRQRLEEITGHLHKAKQLGLKNIMALRGDPIGDQWEEEEGGFNYAVDLVKHIRSEFGDYFDICVAGYPKGHPEAGSFEADLKHLKEKVSAGADFIITQLFFEADTFFRFVKACTDMGITCPIVPGIFPIQGYHSLRQLVKLSKLEVPQEIKDVIEPIKDNDAAIRNYGIELAVSLCQELLASGLVPGLHFYTLNREMATTEVLKRLGMWTEDPRRPLPWALSAHPKRREEDVRPIFWASRPKSYIYRTQEWDEFPNGRWGNSSSPAFGELKDYYLFYLKSKSPKEELLKMWGEELTSEESVFEVFVLYLSGEPNRNGHKVTCLPWNDEPLAAETSLLKEELLRVNRQGILTINSQPNINGKPSSDPIVGWGPSGGYVFQKAYLEFFTSRETAEALLQVLKKYELRVNYHLVNVKGENITNAPELQPNAVTWGIFPGREIIQPTVVDPVSFMFWKDEAFALWIERWGKLYEEESPSRTIIQYIHDNYFLVNLVDNDFPLDNCLWQVVEDTLELLNRPTQNARETEAP'

In [None]:
len(sequence_wt)

656

In [None]:
# 656 is len(sequence_wt) shown above; the factor 19 presumably counts the
# alternative residues per position (i.e. all single substitutions) — confirm.
656*19


12464

In [None]:
def get_mutated_sequence(mut, sequence_wt):
    """Return ``sequence_wt`` with the single substitution ``mut`` applied.

    Args:
        mut: Mutation code such as ``'M0Y'``. The first character is the
            wild-type residue, the last character is the replacement residue
            and the digits in between are the position, which is used directly
            as a 0-based index into ``sequence_wt``.
        sequence_wt: Wild-type sequence string.

    Returns:
        A new string equal to ``sequence_wt`` except for the residue at the
        mutated position. The wild-type letter in ``mut`` is not checked
        against ``sequence_wt``.

    Raises:
        ValueError: If the characters between the first and the last one do
            not form an integer.
    """
    pos, mt = int(mut[1:-1]), mut[-1]

    # Strings are immutable and slicing already builds a new string, so no
    # defensive copy of the wild-type sequence is required.
    return sequence_wt[:pos] + mt + sequence_wt[pos + 1:]

In [None]:
# Labelled query results stored in query_1.csv (presumably query round 1 — confirm);
# the preview shows the columns mutant, DMS_score and sequence.
query1 = pd.read_csv('query_1.csv')
query1.head(5)

Unnamed: 0,mutant,DMS_score,sequence
0,K355Q,0.913747,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1,R356P,0.574405,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
2,R357G,0.378047,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
3,F365E,0.468621,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
4,S372M,0.64072,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...


In [None]:
# Labelled query results stored in query_2.csv (presumably query round 2 — confirm);
# same columns as query1.
query2 = pd.read_csv('query_2.csv')
query2.head(5)

Unnamed: 0,mutant,DMS_score,sequence
0,G6F,0.796447,MVNEARFNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1,S9E,0.697405,MVNEARGNSELNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
2,N11M,0.805347,MVNEARGNSSLMPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
3,G16R,0.738921,MVNEARGNSSLNPCLERSASSGSESSKDSSRCSTPGLDPERHERLR...
4,S19P,0.91512,MVNEARGNSSLNPCLEGSAPSGSESSKDSSRCSTPGLDPERHERLR...


In [None]:
# Labelled query results stored in query_3.csv (presumably query round 3 — confirm);
# same columns as query1.
query3 = pd.read_csv('query_3.csv')
query3.head(5)

Unnamed: 0,mutant,DMS_score,sequence
0,L350E,0.737047,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1,S351D,0.738005,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
2,H353Q,0.836147,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
3,H353K,0.699721,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
4,R357P,0.68122,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...


In [None]:
# Original labelled training set: one row per mutant code with its DMS_score.
train = pd.read_csv('train.csv')
train

Unnamed: 0,mutant,DMS_score
0,M0Y,0.2730
1,M0W,0.2857
2,M0V,0.2153
3,M0T,0.3122
4,M0S,0.2180
...,...,...
1135,P347D,0.3876
1136,P347C,0.1837
1137,P347A,0.4611
1138,P347M,0.2412


In [None]:
# import pandas as pd

# # Read CSV
# strain = pd.read_csv('train.csv')

# # Randomly select 500 rows
# train = strain.sample(n=500, random_state=42)  # Set seed for reproducibility

# # Optional: reset index
# train = train.reset_index(drop=True)

# # Preview
# print(train.head())

In [None]:
# Candidate pool to be scored: test.csv provides mutant codes only (no DMS_score column
# appears in the preview further down).
df_test = pd.read_csv('test.csv')


In [None]:

# Expand every mutant code of the training set into its full mutated sequence.
train['sequence'] = train['mutant'].map(
    lambda code: get_mutated_sequence(code, sequence_wt)
)
train

Unnamed: 0,mutant,DMS_score,sequence
0,M0Y,0.2730,YVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1,M0W,0.2857,WVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
2,M0V,0.2153,VVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
3,M0T,0.3122,TVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
4,M0S,0.2180,SVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
...,...,...,...
1135,P347D,0.3876,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1136,P347C,0.1837,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1137,P347A,0.4611,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1138,P347M,0.2412,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...


In [None]:
# Stack the original training rows and the three query result frames into a single
# training frame; ignore_index=True renumbers the rows from 0.
df_train = pd.concat([train, query1, query2, query3], ignore_index=True)

In [None]:
df_train

Unnamed: 0,mutant,DMS_score,sequence
0,M0Y,0.273000,YVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1,M0W,0.285700,WVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
2,M0V,0.215300,VVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
3,M0T,0.312200,TVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
4,M0S,0.218000,SVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
...,...,...,...
1435,T638R,0.412769,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1436,E640A,0.966497,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1437,L641C,0.823889,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1438,T646F,0.934512,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...


In [None]:

# Expand every mutant code of the test pool into its full mutated sequence.
df_test['sequence'] = df_test['mutant'].map(
    lambda code: get_mutated_sequence(code, sequence_wt)
)
df_test

Unnamed: 0,mutant,sequence
0,V1D,MDNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1,V1Y,MYNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
2,V1C,MCNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
3,V1A,MANEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
4,V1E,MENEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
...,...,...
11319,P655S,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
11320,P655T,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
11321,P655V,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
11322,P655A,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...


In [None]:
# TODO: integrate the query data that you acquired each round into df_train

## 2. Train a prediction model

Here, we provide a linear regression model that uses one-hot encoding to encode each variant. You will need to build your own model to achieve better performance.

Hint: you can perform cross-validation on the training set to evaluate your predictor before making predictions on the test set.

In [None]:
df_train.sequence.values


array(['YVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLREKMRRRLESGDKWFSLEFFPPRTAEGAVNLISRFDRMAAGGPLYIDVTWHPAGDPGSDKETSSMMIASTAVNYCGLETILHMTCCRQRLEEITGHLHKAKQLGLKNIMALRGDPIGDQWEEEEGGFNYAVDLVKHIRSEFGDYFDICVAGYPKGHPEAGSFEADLKHLKEKVSAGADFIITQLFFEADTFFRFVKACTDMGITCPIVPGIFPIQGYHSLRQLVKLSKLEVPQEIKDVIEPIKDNDAAIRNYGIELAVSLCQELLASGLVPGLHFYTLNREMATTEVLKRLGMWTEDPRRPLPWALSAHPKRREEDVRPIFWASRPKSYIYRTQEWDEFPNGRWGNSSSPAFGELKDYYLFYLKSKSPKEELLKMWGEELTSEESVFEVFVLYLSGEPNRNGHKVTCLPWNDEPLAAETSLLKEELLRVNRQGILTINSQPNINGKPSSDPIVGWGPSGGYVFQKAYLEFFTSRETAEALLQVLKKYELRVNYHLVNVKGENITNAPELQPNAVTWGIFPGREIIQPTVVDPVSFMFWKDEAFALWIERWGKLYEEESPSRTIIQYIHDNYFLVNLVDNDFPLDNCLWQVVEDTLELLNRPTQNARETEAP',
       'WVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLREKMRRRLESGDKWFSLEFFPPRTAEGAVNLISRFDRMAAGGPLYIDVTWHPAGDPGSDKETSSMMIASTAVNYCGLETILHMTCCRQRLEEITGHLHKAKQLGLKNIMALRGDPIGDQWEEEEGGFNYAVDLVKHIRSEFGDYFDICVAGYPKGHPEAGSFEADLKHLKEKVSAGADFIITQLFFEADTFFRFVKACTDMGITCPIVPGIFPIQGYHSLRQLVKLSKLEVPQEIKDVIEPIKDNDAAIRNYGIELAVSLCQELLASGLVPGLHFYTLNR

In [None]:
'''hyperparameters'''

# NOTE: `seed` and `val_ratio` are assigned again (50 and 0.2) in the later cell that
# builds the datasets and the train/validation split, so the values set here are not
# the ones that split uses.
seq_length = 656  # same value as len(sequence_wt) displayed earlier
seed = 0 # seed for splitting the validation set
val_ratio = 0.1 # proportion of validation set

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from transformers import EsmModel, EsmTokenizer
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model.
# Alternative checkpoints are kept commented out; exactly one `model_name` must be active.
# model_name = "facebook/esm1v_t33_650M_UR90S_1"  # ESM-1v
# model_name = "facebook/esm1_t6_43M_UR50S" #esm 1
# model_name = "facebook/esm2_t6_8M_UR50D"
model_name = "facebook/esm2_t30_150M_UR50D"
# model_name = "facebook/esm1v_t33_650M_UR90S_1"

# `tokenizer`, `esm_model` and `device` are module-level names that the
# embed_sequence_* helpers defined below read directly.
tokenizer = EsmTokenizer.from_pretrained(model_name)
esm_model = EsmModel.from_pretrained(model_name).to(device)
# Put the model in evaluation mode; the embedding helpers additionally run under
# torch.no_grad(). The load log reports the pooler weights as newly initialised, but
# the helpers only read last_hidden_state.
esm_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t30_150M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(33, 640, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
    (position_embeddings): Embedding(1026, 640, padding_idx=1)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-29): 30 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=640, out_features=640, bias=True)
            (key): Linear(in_features=640, out_features=640, bias=True)
            (value): Linear(in_features=640, out_features=640, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (rotary_embeddings): RotaryEmbedding()
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=640, out_features=640, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (LayerNorm): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
        )
        (intermediate): EsmIntermediate(
  

In [None]:
import numpy as np
from tqdm import tqdm

@torch.no_grad()
def embed_sequence_esm2(sequence: str):
    """Embed one sequence with the currently loaded ESM model.

    Reads the module-level ``tokenizer``, ``esm_model`` and ``device``.
    Gradient tracking is switched off by the decorator.

    Args:
        sequence: Amino-acid sequence string.

    Returns:
        numpy array holding the model's last hidden state for the sequence
        with the first and last token rows (<cls> and <eos>) removed,
        i.e. shape (L, D).
    """
    encoded = tokenizer(sequence, return_tensors="pt", add_special_tokens=True).to(device)

    # Batch of one: drop the batch axis -> (seq_len + 2, hidden_dim)
    hidden = esm_model(**encoded).last_hidden_state.squeeze(0)

    # Keep only the residue positions (strip <cls> at the front, <eos> at the end)
    residue_states = hidden[1:-1]

    return residue_states.cpu().numpy()


In [None]:
import numpy as np
from tqdm import tqdm

@torch.no_grad()
def embed_sequence_esmv1(sequence: str):
    """Embed one sequence with the currently loaded ESM model.

    Despite its name, this function uses the same module-level ``tokenizer``,
    ``esm_model`` and ``device`` as ``embed_sequence_esm2``; which checkpoint
    is actually applied depends on what was loaded into those names.

    Args:
        sequence: Amino-acid sequence string.

    Returns:
        numpy array of shape (L, D): the last hidden state without the first
        and last token rows (<cls> and <eos>).
    """
    model_inputs = tokenizer(sequence, return_tensors="pt", add_special_tokens=True)
    model_inputs = model_inputs.to(device)

    last_hidden = esm_model(**model_inputs).last_hidden_state
    per_token = last_hidden.squeeze(0)  # (seq_len + 2, hidden_dim)

    # Slice off the special tokens at both ends before leaving the device
    return per_token[1:-1].cpu().numpy()


In [None]:
from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch):
    """Collate ``(sequence, target)`` pairs into one padded batch.

    NOTE: a second ``pad_collate`` is defined in the following cell; when the
    notebook is run top to bottom that later definition replaces this one.

    Args:
        batch: Iterable of ``(sequence, target)`` pairs; each sequence is an
            (L, D) tensor and each target a tensor.

    Returns:
        ``(padded, targets)`` where ``padded`` has shape (B, L_max, D), shorter
        sequences being padded at the end, and ``targets`` is the stack of the
        individual targets.
    """
    seq_tensors, label_tensors = zip(*batch)
    return (
        pad_sequence(seq_tensors, batch_first=True),
        torch.stack(label_tensors),
    )


In [None]:
from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch):
    """Collate ``(sequence, target)`` pairs into a padded batch with a channel axis.

    Args:
        batch: Iterable of ``(sequence, target)`` pairs; each sequence is an
            (L, D) tensor and each target a tensor.

    Returns:
        ``(padded, targets)`` where ``padded`` has shape (B, 1, L_max, D) —
        the sequences padded to a common length with a singleton channel
        axis inserted for Conv2D-style models — and ``targets`` is the stack
        of the individual targets.
    """
    seq_tensors, label_tensors = zip(*batch)

    # (B, L_max, D) after padding, then a channel axis at position 1
    stacked = pad_sequence(seq_tensors, batch_first=True)
    with_channel = stacked[:, None]

    return with_channel, torch.stack(label_tensors)


In [None]:
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

class ProteinDatasetESM(Dataset):
    """Dataset of per-residue ESM embeddings with optional DMS targets.

    Every sequence in ``df`` is embedded eagerly in ``__init__`` and kept in
    memory as a list of (L, D) tensors.
    """

    def __init__(self, df, istrain=True, normalize=False, use_esm2=True):
        """
        Args:
            df (pd.DataFrame): Must contain 'sequence' and, when ``istrain`` is
                True, 'DMS_score'.
            istrain (bool): If True, targets are read from 'DMS_score';
                otherwise every target is a dummy 0.0.
            normalize (bool): Whether to apply per-feature z-score
                normalization in ``__getitem__``. The statistics are computed
                from the embeddings of *this* dataset. Defaults to False: the
                previous implementation unconditionally reset the flag to
                False, so existing calls keep returning raw embeddings while an
                explicit ``normalize=True`` is now honoured.
            use_esm2 (bool): If True, use embed_sequence_esm2, else use embed_sequence_esmv1
        """
        self.df = df.reset_index(drop=True)
        self.istrain = istrain
        self.normalize = normalize
        self.use_esm2 = use_esm2
        self.num_samples = len(df)
        self.embeddings = []

        print(f"Encoding sequences using {'ESM-2' if use_esm2 else 'ESM-1v'}...")

        # Pick the embedding helper once; both are module-level functions.
        embed_fn = embed_sequence_esm2 if use_esm2 else embed_sequence_esmv1
        for seq in tqdm(self.df['sequence'].values):
            self.embeddings.append(torch.tensor(embed_fn(seq)))  # shape (L, D)

        if istrain:
            self.targets = torch.tensor(self.df['DMS_score'].tolist(), dtype=torch.float32)
        else:
            # No labels available: keep dummy targets so items keep the same structure.
            self.targets = torch.zeros(self.num_samples, dtype=torch.float32)

        if self.normalize:
            self.mean, self.std = self._compute_stats(self.embeddings)
        else:
            self.mean = None
            self.std = None

    @staticmethod
    def _compute_stats(embeddings):
        """Per-feature mean and population std over all token rows.

        The embeddings are merged one at a time with the pairwise update for
        mean and sum of squared deviations, which gives the same result as
        concatenating every (L, D) tensor without materialising the
        concatenation.

        Args:
            embeddings: Iterable of (L, D) tensors.

        Returns:
            ``(mean, std)``, each of shape (1, D), or ``(None, None)`` when
            there are no token rows at all.
        """
        count = 0
        mean = None
        m2 = None  # running sum of squared deviations from the running mean

        for emb in embeddings:
            rows = emb.shape[0]
            if rows == 0:
                continue

            chunk_mean = emb.mean(dim=0, keepdim=True)
            chunk_m2 = ((emb - chunk_mean) ** 2).sum(dim=0, keepdim=True)

            if count == 0:
                mean, m2, count = chunk_mean, chunk_m2, rows
                continue

            total = count + rows
            delta = chunk_mean - mean
            mean = mean + delta * (rows / total)
            # The cross term accounts for the shift between the two partial means.
            m2 = m2 + chunk_m2 + delta ** 2 * (count * rows / total)
            count = total

        if count == 0:
            return None, None
        return mean, torch.sqrt(m2 / count)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        emb = self.embeddings[idx]  # (L, D)

        if self.normalize and self.mean is not None and self.std is not None:
            # Small epsilon guards against division by zero for constant features.
            emb = (emb - self.mean) / (self.std + 1e-6)

        return emb, self.targets[idx]


In [None]:
# 11324 is the number of test rows encoded below; 1140 is presumably the number of rows
# in train.csv (confirm). The sum equals the 656*19 computed earlier.
11324+1140

12464

DEEP LEARNING LINEAR LAYER


In [None]:
from torch.utils.data import Subset, DataLoader
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence

# Define val split ratio and seed (these overwrite the values set in the
# hyperparameters cell above)
val_ratio = 0.2
seed = 50

# Step 1: Load datasets with full (L, D) embeddings.
# Every sequence is embedded here and the result is kept in memory.
# NOTE(review): with L = 656 and D = 640 a float32 embedding is about 1.7 MB, i.e. on the
# order of 19 GB for the 11324 test sequences — confirm the runtime has enough RAM, or
# pool the embeddings before storing them.
train_dataset = ProteinDatasetESM(df_train, istrain=True)
test_dataset = ProteinDatasetESM(df_test, istrain=False)

# Step 2: Split by index (cannot use torch.stack on variable-length tensors)
indices = list(range(len(train_dataset)))
train_idx, val_idx = train_test_split(indices, test_size=val_ratio, random_state=seed)

# Both subsets are views onto the same train_dataset, selected by index.
train_subset = Subset(train_dataset, train_idx)
val_subset = Subset(train_dataset, val_idx)


Encoding sequences using ESM-2...


100%|██████████| 1440/1440 [00:55<00:00, 25.98it/s]


Encoding sequences using ESM-2...


100%|██████████| 11324/11324 [07:07<00:00, 26.51it/s]


FINDING TOP10

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import torch

# Step 1: Mean-pool each (L, D) embedding over the residue axis to get one (D,)
# vector per test mutant. A comprehension keeps loop temporaries out of the
# notebook namespace.
test_embeddings = [
    test_dataset[row][0].mean(dim=0).numpy()
    for row in range(len(test_dataset))
]

X_test = np.stack(test_embeddings)  # shape: (N_test, D)

# Step 2: Perform PCA. random_state is fixed so that the projection (and the
# clustering built on top of it) is reproducible even when scikit-learn picks
# a randomised SVD solver for this input size.
pca = PCA(n_components=2, random_state=0)  # or more components if desired
X_test_pca = pca.fit_transform(X_test)

print(f"PCA reduced shape: {X_test_pca.shape}")


PCA reduced shape: (11324, 2)


In [None]:
X_test_pca

array([[-0.06251398, -0.04178166],
       [-0.02578768,  0.00626814],
       [-0.01657686, -0.04770505],
       ...,
       [-0.02435693,  0.0178352 ],
       [-0.00219178,  0.05434316],
       [-0.06265402,  0.0435102 ]], dtype=float32)

In [None]:
# Previously saved test-set predictions: one DMS_score_predicted value per mutant.
# NOTE(review): this is an absolute Drive path; the notebook already changes into this
# folder, so the bare file name would presumably resolve too — confirm before changing.
pred = pd.read_csv("/content/drive/MyDrive/MLCB/Hackathon_data/39_predictions.csv")
pred.head(5)

Unnamed: 0.1,Unnamed: 0,mutant,DMS_score_predicted
0,0,V1D,0.268795
1,1,V1Y,0.229811
2,2,V1C,0.228745
3,3,V1A,0.210878
4,4,V1E,0.227662


In [None]:
import pandas as pd

# Step 1: Make sure PCA and prediction rows align
assert len(pred) == X_test_pca.shape[0], "Mismatch in number of samples!"
# Row i of X_test_pca comes from row i of df_test, and the concat below pairs rows
# purely by position, so the mutant order has to match as well as the length.
assert (pred["mutant"].to_numpy() == df_test["mutant"].to_numpy()).all(), \
    "Prediction rows are not in the same mutant order as df_test!"

# Step 2: Create PCA DataFrame
df_pca = pd.DataFrame(X_test_pca, columns=["PC1", "PC2"])

# Step 3: Concatenate with prediction DataFrame (column-wise, aligned on a fresh 0..N-1 index)
df_combined = pd.concat([pred.reset_index(drop=True), df_pca], axis=1)

# Optional: Save or view
df_combined.to_csv("pca_with_predictions.csv", index=False)
df_combined.head()


Unnamed: 0.1,Unnamed: 0,mutant,DMS_score_predicted,PC1,PC2
0,0,V1D,0.268795,-0.062514,-0.041782
1,1,V1Y,0.229811,-0.025788,0.006268
2,2,V1C,0.228745,-0.016577,-0.047705
3,3,V1A,0.210878,-0.082633,-0.052886
4,4,V1E,0.227662,-0.064264,-0.106119


In [None]:
from sklearn.cluster import KMeans

# Use only PCA columns for clustering
X_pca = df_combined[["PC1", "PC2"]].values

# Run KMeans with 10 clusters; random_state is fixed so the cluster assignment is
# repeatable for the same input.
kmeans = KMeans(n_clusters=10, random_state=42, n_init="auto")
cluster_labels = kmeans.fit_predict(X_pca)

# Add cluster labels to the DataFrame (this modifies df_combined in place)
df_combined["cluster"] = cluster_labels


In [None]:
df_combined.to_csv("pca_with_predictions.csv", index=False)

In [None]:
df_combined.sort_values('DMS_score_predicted', ascending=False).reset_index(drop=True)

Unnamed: 0.1,Unnamed: 0,mutant,DMS_score_predicted,PC1,PC2,cluster
0,11094,N643W,0.921643,0.013074,-0.121483,3
1,8609,E513Y,0.787858,0.000488,-0.033633,1
2,9292,N549W,0.742141,-0.033607,-0.099036,3
3,9298,N549I,0.692308,-0.008963,-0.091999,3
4,8899,L528P,0.689399,-0.005100,0.002477,5
...,...,...,...,...,...,...
11319,3788,D222F,-0.063020,-0.025407,-0.000378,7
11320,3781,D222V,-0.063690,-0.043523,-0.000410,7
11321,3791,D222I,-0.064277,-0.036516,0.011277,7
11322,3799,D222L,-0.064428,-0.033669,0.005012,7


In [None]:
# NOTE: `top_per_cluster` is only assigned in the following cell, so on a fresh kernel
# (Restart & Run All) this cell raises NameError; the output shown here was presumably
# produced by running the cells out of order.
top_per_cluster


Unnamed: 0.1,Unnamed: 0,mutant,DMS_score_predicted,PC1,PC2,cluster
11094,11094,N643W,0.921643,0.013074,-0.121483,3
8609,8609,E513Y,0.787858,0.000488,-0.033633,1
8899,8899,L528P,0.689399,-0.0051,0.002477,5
6815,6815,K418F,0.626406,-0.043696,-0.008975,7
6851,6851,W420I,0.544549,0.047092,-0.019545,8
10219,10219,L597E,0.542112,-0.001725,0.018577,0
10210,10210,L597Q,0.5377,-0.010129,0.019195,9
7037,7037,V430P,0.430937,0.035881,0.028218,4
346,346,S19M,0.359584,0.274792,-0.006384,2
453,453,S24M,0.333015,0.149139,-0.027182,6


In [None]:
# 1. Rank all mutants by predicted score, then keep the first (best) row of each cluster
ranked_by_score = df_combined.sort_values('DMS_score_predicted', ascending=False)
top_per_cluster = ranked_by_score.groupby('cluster').head(1)

# 2. Re-sort the cluster winners and cap the selection at 10
top10_df = top_per_cluster.sort_values('DMS_score_predicted', ascending=False).head(10)

# 3. One mutant per line, each line terminated by a newline
with open("cluster_top10.txt", "w") as f:
    f.writelines(f"{mutant}\n" for mutant in top10_df["mutant"])


In [None]:
# Ten mutants with the highest predicted DMS score, regardless of cluster
top10_df = df_combined.sort_values('DMS_score_predicted', ascending=False).head(10)
mutants = top10_df['mutant'].tolist()

# One mutant per line; joining with "\n" leaves no trailing newline after the last one
with open("top10_0.39_dms.txt", "w") as f:
    f.write("\n".join(f"{name}" for name in mutants))


In [None]:
# import torch

# def save_protein_dataset(dataset, filename):
#     embeddings = []
#     targets = []

#     for i in range(len(dataset)):
#         emb, target = dataset[i]  # emb: (Lᵢ, D), target: scalar
#         embeddings.append(emb)
#         targets.append(target)

#     targets = torch.tensor(targets, dtype=torch.float32)

#     torch.save({
#         'embeddings': embeddings,   # list of tensors (Lᵢ, D)
#         'targets': targets          # tensor (N,)
#     }, filename)

# # Save training and test datasets
# save_protein_dataset(train_dataset, "protein_train_full.pt")
# save_protein_dataset(test_dataset, "protein_test_full.pt")


In [None]:
# import numpy as np

# # Get mean-pooled embeddings (L, D) → (D,)
# train_embeddings = np.stack([emb.mean(dim=0).numpy() for emb in train_dataset.embeddings])
# train_targets = train_dataset.targets.numpy()

# test_embeddings = np.stack([emb.mean(dim=0).numpy() for emb in test_dataset.embeddings])
# np.savez("protein_trainq.npz", embeddings=train_embeddings, targets=train_targets)
# np.savez("protein_testq.npz", embeddings=test_embeddings)


In [None]:
# import numpy as np
# import torch
# from torch.utils.data import TensorDataset, DataLoader, Subset
# from sklearn.model_selection import train_test_split
# from torch.utils.data import Dataset
# val_ratio = 0.2
# seed = 50
# # Load train and test data
# train_data = np.load("protein_trainq.npz")
# test_data = np.load("protein_testq.npz")

# # Extract arrays
# train_embeddings = torch.tensor(train_data['embeddings'], dtype=torch.float32)  # (N, D)
# train_targets = torch.tensor(train_data['targets'], dtype=torch.float32)        # (N,)
# test_embeddings = torch.tensor(test_data['embeddings'], dtype=torch.float32)    # (M, D)


# # Wrap into standard TensorDataset
# full_train_dataset = TensorDataset(train_embeddings, train_targets)

# # Dummy targets for test set if you're only doing prediction
# test_dataset = TensorDataset(test_embeddings, torch.zeros(len(test_embeddings)))
# train_idx, val_idx = train_test_split(
#     np.arange(len(full_train_dataset)), test_size=val_ratio, random_state=seed
# )

# train_subset = Subset(full_train_dataset, train_idx)
# val_subset = Subset(full_train_dataset, val_idx)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

READ EMB

In [None]:
# from torch.utils.data import Dataset

# class LoadedProteinDataset(Dataset):
#     def __init__(self, embeddings, targets):
#         self.embeddings = embeddings  # list of (Lᵢ, D) tensors
#         self.targets = targets        # tensor of shape (N,)

#     def __len__(self):
#         return len(self.embeddings)

#     def __getitem__(self, idx):
#         return self.embeddings[idx], self.targets[idx]
# # For test set (dummy targets)
# class TestProteinDataset(Dataset):
#     def __init__(self, embeddings):
#         self.embeddings = embeddings

#     def __len__(self):
#         return len(self.embeddings)

#     def __getitem__(self, idx):
#         return self.embeddings[idx], 0.0  # dummy target




In [None]:
# import numpy as np
# import torch
# from torch.utils.data import TensorDataset, DataLoader, Subset
# from sklearn.model_selection import train_test_split


# import torch
# import numpy as np
# from torch.utils.data import Subset
# from sklearn.model_selection import train_test_split

# # Load the saved train dataset
# train_data = torch.load("protein_train_full.pt")
# embeddings = train_data['embeddings']  # list of (Lᵢ, D) tensors
# targets = train_data['targets']        # (N,) tensor

# # Wrap into a dataset
# train_dataset = LoadedProteinDataset(embeddings, targets)
# # Define split ratio and seed
# val_ratio = 0.2
# seed = 42

# # Generate train/val indices
# train_idx, val_idx = train_test_split(
#     np.arange(len(train_dataset)), test_size=val_ratio, random_state=seed
# )

# # Create subsets
# train_subset = Subset(train_dataset, train_idx)
# val_subset = Subset(train_dataset, val_idx)



# train_idx, val_idx = train_test_split(np.arange(len(train_dataset)), test_size=val_ratio, random_state=seed)
# train_subset = Subset(train_dataset, train_idx)
# val_subset = Subset(train_dataset, val_idx)
# test_data = torch.load("protein_test_full.pt")
# test_dataset = TestProteinDataset(test_data['embeddings'])

In [None]:
# NOTE(review): `full` is not defined anywhere in the visible part of this notebook, so
# this rebinding raises NameError on a fresh kernel unless it is defined elsewhere —
# confirm or remove. It also replaces the train_dataset built earlier.
train_dataset = full

AttributeError: 'NpzFile' object has no attribute 'shape'

AttributeError: 'tuple' object has no attribute 'shape'

In [None]:
# Take the first item of train_dataset (assumed to be an (embedding, target) pair —
# confirm, since train_dataset is rebound in the previous cell).
tt, labels = next(iter(train_dataset))

# Value range of that single feature tensor
print(f"Min: {tt.min()}, Max: {tt.max()}")

Min: -3.6210861206054688, Max: 28.5362606048584


In [None]:
from torch.utils.data import DataLoader

# Batch the three splits; only the training loader is shuffled.
# NOTE: `pad_collate` resolves to whichever definition was executed last. Run top to
# bottom, that is the second one, which adds a channel axis and therefore yields
# (B, 1, L_max, D) batches.
train_loader = DataLoader(train_subset, batch_size=32, shuffle=True, collate_fn=pad_collate)
val_loader = DataLoader(val_subset, batch_size=32, shuffle=False, collate_fn=pad_collate)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=pad_collate)



In [None]:
# from torch.nn.utils.rnn import pad_sequence

# def pad_collatet(batch):
#     sequences, targets = zip(*batch)  # Each sequence is (L, D)
#     padded = pad_sequence(sequences, batch_first=True)  # (B, L_max, D)
#     # Wrap individual targets in tensors before stacking
#     targets = torch.stack([torch.tensor([t]) for t in targets])  # (B,)
#     return padded, targets
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=pad_collatet)

In [None]:
# import torch
# import torch.nn as nn
# from scipy.stats import spearmanr

# # Setup
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# loss_fn = nn.MSELoss()

# # Initialize model
# embedding_dim = next(iter(train_loader))[0].shape[-1]
# model = DMSPredictor(embedding_dim=embedding_dim, hidden_dim=256, num_layers=2, dropout=0.2).to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


DLL

In [None]:
import torch
import torch.nn as nn

class DMSPredictor(nn.Module):
    """MLP regressor on (mean-pooled) embeddings.

    The network is ``Linear -> LayerNorm -> ReLU -> Dropout`` repeated once for
    the input layer plus ``num_layers - 1`` times for the hidden layers,
    followed by a final ``Linear(hidden_dim, 1)``.

    Args:
        embedding_dim: Size D of the input feature vectors.
        hidden_dim: Width of every hidden layer.
        num_layers: Total number of ``Linear`` blocks before the output layer
            (at least one block is always created).
        dropout: Dropout probability used after every block.
    """

    def __init__(self, embedding_dim, hidden_dim=256, num_layers=2, dropout=0.2):
        super().__init__()

        layers = []

        # Input layer
        layers.append(nn.Linear(embedding_dim, hidden_dim))
        layers.append(nn.LayerNorm(hidden_dim))
        layers.append(nn.ReLU())
        layers.append(nn.Dropout(dropout))

        # Hidden layers
        for _ in range(num_layers - 1):
            layers.append(nn.Linear(hidden_dim, hidden_dim))
            layers.append(nn.LayerNorm(hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))

        # Output layer
        layers.append(nn.Linear(hidden_dim, 1))

        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """
        x: Tensor of shape (B, L, D), (B, 1, L, D) or (B, D)

        Returns a tensor of shape (B,).
        """
        # A collate function that inserts a singleton channel axis for the
        # Conv2D models produces (B, 1, L, D); drop that axis so the pooling
        # below still applies instead of silently returning a wrong shape.
        if x.ndim == 4 and x.size(1) == 1:
            x = x.squeeze(1)  # (B, L, D)

        if x.ndim == 3:
            x = x.mean(dim=1)  # average over the sequence axis -> (B, D)

        return self.mlp(x).squeeze(-1)  # (B,)


In [None]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# class DMSPredictorCNN(nn.Module):
#     def __init__(self, embedding_dim, conv_layers=[(128, 5), (128, 3)], dropout=0.3, task='regression', num_classes=1):
#         """
#         Args:
#             embedding_dim (int): D, the ESM embedding size per residue.
#             conv_layers (list): Each tuple is (num_filters, kernel_size)
#             dropout (float): Dropout rate
#             task (str): 'regression' or 'classification'
#             num_classes (int): Required for classification
#         """
#         super(DMSPredictorCNN, self).__init__()

#         layers = []
#         in_channels = embedding_dim

#         for out_channels, kernel_size in conv_layers:
#             layers.append(nn.Conv1d(in_channels, out_channels, kernel_size, padding=kernel_size // 2))
#             layers.append(nn.BatchNorm1d(out_channels))
#             layers.append(nn.ReLU())
#             layers.append(nn.Dropout(dropout))
#             in_channels = out_channels

#         self.conv_block = nn.Sequential(*layers)
#         self.task = task

#         # Final linear layer
#         if task == 'regression':
#             self.output_layer = nn.Linear(in_channels, 1)
#         elif task == 'classification':
#             self.output_layer = nn.Linear(in_channels, num_classes)
#         else:
#             raise ValueError("task must be 'regression' or 'classification'")

#     def forward(self, x):
#         """
#         x: Tensor of shape (B, L, D)
#         """
#         x = x.transpose(1, 2)              # → (B, D, L) for Conv1d
#         x = self.conv_block(x)            # → (B, C, L)
#         x = F.adaptive_max_pool1d(x, 1)   # → (B, C, 1)
#         x = x.squeeze(-1)                 # → (B, C)

#         out = self.output_layer(x)        # → (B, 1) or (B, num_classes)
#         return out.squeeze(-1) if self.task == 'regression' else out


In [None]:
class DMSPredictorCNN2D(nn.Module):
    """2-D CNN over the (L, D) embedding matrix treated as a one-channel image.

    Three ``Conv2d -> BatchNorm2d -> ReLU`` stages (32, 64 and 128 filters,
    3x3 kernels, dropout after the first two) are followed by global average
    pooling and a linear output layer.

    Args:
        in_channels: Number of input channels of the first convolution.
        dropout: Dropout probability used after the first two stages.
        task: ``'regression'`` gives one output per sample; any other value
            gives ``num_classes`` outputs per sample.
        num_classes: Output size when ``task`` is not ``'regression'``.
    """

    def __init__(self, in_channels=1, dropout=0.3, task='regression', num_classes=1):
        super().__init__()

        self.task = task

        self.conv_block = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=(3, 3), padding=(1, 1)),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Conv2d(32, 64, kernel_size=(3, 3), padding=(1, 1)),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Conv2d(64, 128, kernel_size=(3, 3), padding=(1, 1)),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1))  # Output shape: (B, 128, 1, 1)
        )

        self.output_layer = nn.Linear(128, 1 if task == 'regression' else num_classes)

    def forward(self, x):
        """
        x: Tensor of shape (B, L, D) or, with the channel axis already present,
           (B, C, L, D).

        Returns (B,) for regression, otherwise (B, num_classes).
        """
        # Only add the channel axis when it is missing. The channel-adding
        # collate function already delivers (B, 1, L, D); unsqueezing that
        # again would create a 5-D tensor that Conv2d rejects.
        if x.ndim == 3:
            x = x.unsqueeze(1)  # (B, 1, L, D)
        x = self.conv_block(x)  # → (B, 128, 1, 1)
        x = torch.flatten(x, 1)  # Flatten → (B, 128)
        out = self.output_layer(x)
        return out.squeeze(-1) if self.task == 'regression' else out


In [None]:
# Convolution stack configuration passed to DMSPredictorCNN(conv_layers=...) below:
# one (num_filters, kernel_size) tuple per layer, in order.
conv_layers = [
    (256, 9),   # Layer 1: 256 filters, kernel size 9
    (256, 7),   # Layer 2
    (128, 5),   # Layer 3
    (128, 3),   # Layer 4
    (64, 3),    # Layer 5
    (64, 3)     # Layer 6
]
# conv_layers = [
#     (256, 9),  # Wide first layer for long-range patterns
#     (128, 5),  # Mid-sized kernel
#     (64, 3)    # Narrow layer for fine-grained features
# ]
# conv_layers = [
#     (256, 9),
#     (256, 7),
#     (128, 5),
#     (128, 3),
#     (64, 3),
#     (64, 3)
# ]
# conv_layers = [
#     (256, 9),
#     (256, 7),
#     (128, 5),
#     (128, 3),
#     (64, 3),
#     (64, 3),
#     (32, 3),   # New Layer 7
#     (16, 3)    # New Layer 8
# ]


In [None]:
# NOTE: the definition of DMSPredictorCNN is commented out in the cell above, so on a
# fresh kernel this cell raises NameError unless that definition is restored. `model` is
# also reassigned by the next cell; `loss_fn` defined at the bottom is kept.
# input_dim = next(iter(train_loader))[0].shape[-1]
model = DMSPredictorCNN(
    embedding_dim=640,         # or 320,768 / 1280 depending on your ESM model (640 matches the loaded esm2_t30_150M checkpoint)
    conv_layers=conv_layers,
    dropout=0.2,

    task='regression'          # or 'classification', with num_classes if needed
).to(device)

# Mean-squared-error objective for the regression target
loss_fn = nn.MSELoss()

In [None]:
# Build the 2-D CNN regressor and move it to `device`. This rebinds `model`, replacing
# whatever the previous cell assigned to that name.
model = DMSPredictorCNN2D(
    in_channels=1,
    dropout=0.3,
    task='regression'  # or 'classification'
).to(device)


LSTM

In [None]:
# import torch
# import torch.nn as nn

# class DMSPredictorLSTM(nn.Module):
#     def __init__(self, embedding_dim, hidden_dim=128, num_layers=2, dropout=0.3, bidirectional=True):
#         """
#         Args:
#             embedding_dim (int): D — ESM embedding dimension
#             hidden_dim (int): Hidden size of LSTM
#             num_layers (int): Number of LSTM layers
#             dropout (float): Dropout between LSTM layers
#             bidirectional (bool): Use bidirectional LSTM
#         """
#         super(DMSPredictorLSTM, self).__init__()
#         self.lstm = nn.LSTM(
#             input_size=embedding_dim,
#             hidden_size=hidden_dim,
#             num_layers=num_layers,
#             dropout=dropout if num_layers > 1 else 0.0,
#             bidirectional=bidirectional,
#             batch_first=True
#         )

#         lstm_output_dim = hidden_dim * (2 if bidirectional else 1)
#         self.fc = nn.Linear(lstm_output_dim, 1)

#     def forward(self, x):
#         """
#         x: Tensor of shape (B, L, D)
#         """
#         lstm_out, _ = self.lstm(x)              # (B, L, H)
#         last_hidden = lstm_out[:, -1, :]        # Use final output token (B, H)
#         out = self.fc(last_hidden)              # (B, 1)
#         return out.squeeze(-1)                  # (B,)


In [None]:
# sample_batch = next(iter(train_loader))[0]

# model = DMSPredictorLSTM(
#     embedding_dim=sample_batch.shape[-1],     # or 768 / 1280 depending on ESM
#     hidden_dim=128,
#     num_layers=10,
#     dropout=0.3,
#     bidirectional=True
# ).to(device)

# loss_fn = nn.MSELoss()


Train test

In [None]:
def train_epoch(model, loader, optimizer, loss_fn, device):
    """Run one optimisation pass over ``loader`` and return the mean per-sample loss.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to predictions.
        loader: Iterable yielding ``(X, y)`` tensor pairs (a ``DataLoader`` or a
            plain list of batches).
        optimizer: Optimizer bound to ``model``'s parameters.
        loss_fn: Callable ``loss_fn(preds, y)`` returning a scalar loss. The
            per-batch value is weighted by batch size, so it is assumed to be
            a batch mean.
        device: Device (or device string) the batches are moved to.

    Returns:
        float: Sum of ``loss * batch_size`` divided by the number of samples
        actually processed, or ``nan`` if the loader produced no batches.
    """
    model.train()
    total_loss = 0.0
    n_seen = 0
    for X, y in loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        preds = model(X)
        # A (B, 1) output against a (B,) target would silently broadcast to (B, B)
        # inside the loss; align the shapes whenever the element counts agree.
        if preds.shape != y.shape and preds.numel() == y.numel():
            preds = preds.reshape(y.shape)
        loss = loss_fn(preds, y)
        loss.backward()
        optimizer.step()
        batch_size = X.size(0)
        total_loss += loss.item() * batch_size
        n_seen += batch_size
    # Divide by the samples seen rather than len(loader.dataset): the two differ
    # with drop_last / custom samplers, and a plain iterable has no .dataset.
    return total_loss / n_seen if n_seen else float("nan")

def evaluate(model, loader, device, loss_fn=None):
    """Score ``model`` on ``loader`` without gradient tracking.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to predictions.
        loader: Iterable yielding ``(X, y)`` tensor pairs.
        device: Device (or device string) the batches are moved to.
        loss_fn: Optional callable ``loss_fn(preds, y)`` returning a scalar
            batch-mean loss. When ``None`` no loss is computed.

    Returns:
        tuple: ``(rho, avg_loss, preds_all, targets_all)`` where ``rho`` is the
        Spearman correlation between targets and predictions (``nan`` when
        either side is constant, e.g. placeholder targets), ``avg_loss`` is the
        mean per-sample loss (``None`` if ``loss_fn`` is ``None``; ``nan`` if no
        samples were seen), and the two lists hold per-sample predictions and
        targets in loader order.
    """
    model.eval()
    preds_all, targets_all = [], []
    total_loss = 0.0
    n_seen = 0
    with torch.no_grad():
        for X, y in loader:
            X, y = X.to(device), y.to(device)
            preds = model(X)
            # Align e.g. (B, 1) or 0-d outputs with the target shape: this avoids
            # silent (B, B) broadcasting in the loss and keeps `extend` iterable
            # when the final batch holds a single sample.
            if preds.shape != y.shape and preds.numel() == y.numel():
                preds = preds.reshape(y.shape)
            preds_all.extend(preds.cpu().numpy())
            targets_all.extend(y.cpu().numpy())

            batch_size = X.size(0)
            n_seen += batch_size
            # `is not None` rather than truthiness: a callable container with
            # __len__ == 0 would otherwise be skipped.
            if loss_fn is not None:
                total_loss += loss_fn(preds, y).item() * batch_size

    rho, _ = spearmanr(targets_all, preds_all)
    if loss_fn is None:
        avg_loss = None
    else:
        # Average over the samples actually evaluated, not len(loader.dataset).
        avg_loss = total_loss / n_seen if n_seen else float("nan")
    return rho, avg_loss, preds_all, targets_all


In [None]:
# import torch
# import torch.nn as nn
# from scipy.stats import spearmanr

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Auto-detect embedding dimension from batch
# sample_batch = next(iter(train_loader))[0]

# # input_dim = sample_batch.shape[-1]
# # model = DMSPredictor(input_dim, hidden_dim=256, num_layers=8, dropout=0).to(device)

# # input_dim = next(iter(train_loader))[0].shape[-1]
# # model = DMSPredictorCNN(
# #     embedding_dim=input_dim,            # same as D in (L, D)
# #     conv_layers=conv_layers,   # customizable
# #     dropout=0.3,
# #     task='regression'                   # or 'classification' if needed
# # ).to(device)

# # # Optimizer and loss
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# loss_fn = nn.MSELoss()

# best_val_rho = -1
# best_model_state = None

# num_epochs = 50
# early_stop_patience = 10
# patience_counter = 0

# for epoch in range(1, num_epochs + 1):
#     train_loss = train_epoch(model, train_loader, optimizer, loss_fn, device)
#     val_rho, val_loss, _, _ = evaluate(model, val_loader, device, loss_fn)

#     print(f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Spearman ρ: {val_rho:.4f}")

#     if val_rho > best_val_rho:
#         best_val_rho = val_rho
#         best_model_state = model.state_dict()
#         patience_counter = 0
#     else:
#         patience_counter += 1

#     # Optional: Early stopping
#     if patience_counter >= early_stop_patience:
#         print(f"⏹️ Early stopping triggered at epoch {epoch}")
#         break

# # Load best model before testing
# model.load_state_dict(best_model_state)


In [None]:
# test_rho, test_loss, test_preds, test_targets = evaluate(model, test_loader, device, loss_fn)
# print(f"Final Test Spearman ρ = {test_rho:.4f}, Test Loss = {test_loss:.4f}")


Final Test Spearman ρ = nan, Test Loss = 0.0044


  rho, _ = spearmanr(targets_all, preds_all)


In [None]:
# Score every test-set batch with the current `model`; the second element yielded
# by `test_loader` is ignored.
model.eval()
y_test_pred = []

with torch.no_grad():
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        preds = model(X_batch)
        # With one score per sample, flatten (B, 1) or 0-d outputs to (B,) so the list
        # holds scalars (a 0-d tensor from a final batch of one would break `extend`).
        if preds.numel() == X_batch.size(0):
            preds = preds.reshape(-1)
        y_test_pred.extend(preds.cpu().numpy())

FINAL RESULTS


In [None]:
# Attach the predictions as a new column of df_test (mutates the frame in place).
# NOTE(review): assumes `test_loader` iterated df_test in row order (no shuffling),
# so that y_test_pred[i] belongs to row i — confirm against the loader definition.
df_test['DMS_score_predicted'] = y_test_pred
df_test



Unnamed: 0,mutant,sequence,DMS_score_predicted
0,V1D,MDNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.351017
1,V1Y,MYNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.351534
2,V1C,MCNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.350759
3,V1A,MANEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.350950
4,V1E,MENEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.348138
...,...,...,...
11319,P655S,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.350717
11320,P655T,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.351852
11321,P655V,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.351509
11322,P655A,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.349968


In [None]:
# Export the mutant id and its predicted score; the frame's index is written as the leading column.
df_test.to_csv('lpredictions.csv', columns=['mutant', 'DMS_score_predicted'])

In [None]:
# Display df_test again.
# NOTE(review): the scores in the saved output differ from the earlier display of the
# same column, so this output presumably comes from a different run/model — confirm
# with Restart & Run All.
df_test

Unnamed: 0,mutant,sequence,DMS_score_predicted
0,V1D,MDNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.265076
1,V1Y,MYNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.295726
2,V1C,MCNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.282834
3,V1A,MANEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.245963
4,V1E,MENEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.249447
...,...,...,...
11319,P655S,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.326763
11320,P655T,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.291519
11321,P655V,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.310278
11322,P655A,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.359911


## 3. Select queries for the next round

In [None]:
# Preview the 100 rows with the highest predicted score, best first.
df_test.sort_values(by='DMS_score_predicted', ascending=False).iloc[:100]

Unnamed: 0,mutant,sequence,DMS_score_predicted
4537,L273T,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.429854
4522,L273A,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.418521
4523,L273C,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.415500
4533,L273S,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.413150
4284,G260N,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.401102
...,...,...,...
4823,A291V,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.360348
4285,G260M,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.360204
4929,S303M,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.360185
4812,A291H,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.360180


In [None]:
# dftest.to_csv("pred_test.csv")

In [None]:
# Rank the test set by predicted score (descending) and keep the ten best mutants.
# NOTE(review): nothing in this cell filters out mutants queried in earlier rounds —
# confirm that is handled upstream.
top10_df = df_test.sort_values(by='DMS_score_predicted', ascending=False).iloc[:10]
top10_mutants = top10_df['mutant'].tolist()

# One mutant per line, with no newline after the final entry.
with open("ltop10.txt", "w") as f:
    f.write("\n".join(f"{mutant}" for mutant in top10_mutants))

In [None]:
# Display df_test.
# NOTE(review): the saved output has no DMS_score_predicted column, so this cell was
# presumably executed on a freshly loaded frame, out of order with the cells above.
df_test

Unnamed: 0,mutant,sequence
0,V1D,MDNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1,V1Y,MYNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
2,V1C,MCNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
3,V1A,MANEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
4,V1E,MENEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
...,...,...
11319,P655S,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
11320,P655T,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
11321,P655V,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
11322,P655A,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...


K-Fold validation

In [None]:
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Subset

# 5-fold split of the training set; shuffling uses a fixed random_state.
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# fold_loaders[i] is the (train_loader, val_loader) pair for fold i.
fold_loaders = []

# NOTE(review): this loop rebinds the notebook-level `train_loader` / `val_loader`;
# once the cell has run they refer to the last fold's loaders.
for fold, (train_idx, val_idx) in enumerate(kf.split(train_dataset)):
    print(f"Fold {fold + 1}/{k_folds}")

    train_subset, val_subset = Subset(train_dataset, train_idx), Subset(train_dataset, val_idx)

    # Only the training split is shuffled; the validation split keeps a fixed order.
    train_loader = DataLoader(train_subset, collate_fn=pad_collate, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_subset, collate_fn=pad_collate, batch_size=32, shuffle=False)

    fold_loaders += [(train_loader, val_loader)]


Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5


In [None]:
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# loss_fn = nn.MSELoss()

# best_val_rho = -1
# best_model_state = None

# num_epochs = 100
# early_stop_patience = 10
# patience_counter = 0
# for fold, (train_loader, val_loader) in enumerate(fold_loaders):
#     print(f"\n🌀 Training on Fold {fold + 1}")

#     for epoch in range(num_epochs):
#         train_loss = train_epoch(model, train_loader, optimizer, loss_fn, device)
#         val_rho, _, _, _ = evaluate(model, val_loader, device) # unpacks all 4 values
#         print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Spearman = {val_rho:.4f}")

In [None]:
# K-fold training with early stopping on validation loss.
# NOTE(review): `model` and `optimizer` are created once and shared by every fold, so
# each fold keeps training weights that have already been fitted on samples now used
# for validation. Per-fold metrics after the first fold are therefore optimistic;
# re-create the model per fold if independent cross-validation estimates are needed.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

num_epochs = 100
early_stop_patience = 10

for fold, (train_loader, val_loader) in enumerate(fold_loaders):
    print(f"\n🌀 Training on Fold {fold + 1}")

    best_val_loss = float("inf")
    best_val_rho = -1
    best_model_state = None
    patience_counter = 0

    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, optimizer, loss_fn, device)
        val_rho, val_loss, _, _ = evaluate(model, val_loader, device, loss_fn)

        print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Spearman = {val_rho:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_rho = val_rho
            # state_dict() hands back references to the live parameter tensors, which
            # later optimizer steps overwrite; deepcopy freezes a real checkpoint.
            best_model_state = deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1

        # `>` means training stops after early_stop_patience + 1 consecutive
        # epochs without a validation-loss improvement.
        if patience_counter > early_stop_patience:
            print(f"⏹️ Early stopping at epoch {epoch + 1}")
            break

    # Roll back to this fold's best checkpoint; previously the checkpoint was
    # recorded but never restored, leaving the last-epoch weights in `model`.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)



🌀 Training on Fold 1
Epoch 1: Train Loss = 0.8581, Val Loss = 0.1382, Val Spearman = 0.2009
Epoch 2: Train Loss = 0.1842, Val Loss = 0.1395, Val Spearman = 0.1977
Epoch 3: Train Loss = 0.1431, Val Loss = 0.0930, Val Spearman = 0.2890
Epoch 4: Train Loss = 0.1302, Val Loss = 0.1108, Val Spearman = 0.3649
Epoch 5: Train Loss = 0.1185, Val Loss = 0.0907, Val Spearman = 0.3152
Epoch 6: Train Loss = 0.1063, Val Loss = 0.0941, Val Spearman = 0.2912
Epoch 7: Train Loss = 0.1065, Val Loss = 0.0963, Val Spearman = 0.2690
Epoch 8: Train Loss = 0.1030, Val Loss = 0.0920, Val Spearman = 0.3229
Epoch 9: Train Loss = 0.1010, Val Loss = 0.0901, Val Spearman = 0.3824
Epoch 10: Train Loss = 0.1014, Val Loss = 0.0914, Val Spearman = 0.4308
Epoch 11: Train Loss = 0.1029, Val Loss = 0.0988, Val Spearman = 0.4219
Epoch 12: Train Loss = 0.0974, Val Loss = 0.0904, Val Spearman = 0.4313
Epoch 13: Train Loss = 0.0946, Val Loss = 0.0905, Val Spearman = 0.3536
Epoch 14: Train Loss = 0.0983, Val Loss = 0.0904, V