In [1]:
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Torch version: 2.5.0+cu124
CUDA available: True
Using device: cuda


In [2]:
import zipfile
import os

if not os.path.exists("data/"):
	# path to zip file
	zip_path = "cafa-6-protein-function-prediction.zip"

	# folder to extract to
	extract_dir = "data"

	# make sure the folder exists
	os.makedirs(extract_dir, exist_ok=True)

	# extract
	with zipfile.ZipFile(zip_path, 'r') as zip_ref:
		zip_ref.extractall(extract_dir)

	print(f"Extracted all files to: {extract_dir}")



In [3]:
from Bio import SeqIO
import pandas as pd

# Load sequences
train_sequences = list(SeqIO.parse("data/Train/train_sequences.fasta", "fasta"))
print(f"Total sequences: {len(train_sequences)}")

# Convert to DataFrame
seq_df = pd.DataFrame({
    "entry_id": [record.id for record in train_sequences],
    "sequence": [str(record.seq) for record in train_sequences],
})
seq_df["seq_len"] = seq_df["sequence"].str.len()
seq_df["entry_id"] = seq_df["entry_id"].str.extract(r"sp\|([^|]+)\|")[0]
#seq_df.head()

# Load taxonomy and GO terms
tax_df = pd.read_csv("data/Train/train_taxonomy.tsv", sep="\t",header=None)
tax_df.columns = ["entry_id", "taxonomy"]
terms_df = pd.read_csv("data/Train/train_terms.tsv", sep="\t")
terms_df.columns = terms_df.columns.str.lower()
terms_df.rename(columns={"entryid": "entry_id"}, inplace=True)

# Merge all
train_df =  (
    seq_df
    .merge(tax_df, on="entry_id", how="left")
    .merge(terms_df, on="entry_id", how="left")
)
train_df.head()


Total sequences: 82404


Unnamed: 0,entry_id,sequence,seq_len,taxonomy,term,aspect
0,A0A0C5B5G6,MRWQEMGYIFYPRKLR,16,9606,GO:0001649,P
1,A0A0C5B5G6,MRWQEMGYIFYPRKLR,16,9606,GO:0033687,P
2,A0A0C5B5G6,MRWQEMGYIFYPRKLR,16,9606,GO:0005615,C
3,A0A0C5B5G6,MRWQEMGYIFYPRKLR,16,9606,GO:0005634,C
4,A0A0C5B5G6,MRWQEMGYIFYPRKLR,16,9606,GO:0005739,C


In [4]:
from Bio import SeqIO
import pandas as pd

records = list(SeqIO.parse("data/Test/testsuperset.fasta", "fasta"))

entry_ids = []
tax_ids = []
sequences = []

for r in records:
    parts = r.description.split()
    entry_ids.append(parts[0])        # e.g., A0A0C5B5G6
    tax_ids.append(parts[1])          # e.g., 9606
    sequences.append(str(r.seq))

test_df = pd.DataFrame({
    "entry_id": entry_ids,
    "tax_id": tax_ids,
    "sequence": sequences
})

test_df["seq_len"] = test_df["sequence"].str.len()


In [5]:
# Check data sizes and overlaps
print(f"Test sequences: {test_df.shape[0]}")
print(f"Train protein-function pairs: {train_df.shape[0]}")
print(f"Unique proteins in train: {train_df['entry_id'].nunique()}")
print(f"Unique GO terms: {train_df['term'].nunique()}")

# Check for sequence overlap between train and test
train_proteins = set(train_df['entry_id'].unique())
test_proteins = set(test_df['entry_id'].unique())
overlap = train_proteins.intersection(test_proteins)
print(f"Proteins in both train and test: {len(overlap)}")

Test sequences: 224309
Train protein-function pairs: 537027
Unique proteins in train: 82404
Unique GO terms: 26125
Proteins in both train and test: 82404


In [6]:
# Analyze GO term frequency
term_counts = train_df['term'].value_counts()
print("GO term frequency distribution:")
print(term_counts.describe())

# Focus on frequent terms 
min_occurrence = 1
frequent_terms = term_counts[term_counts >= min_occurrence].index
print(f"Terms with ≥{min_occurrence} occurrences: {len(frequent_terms)}")

# Filter training data to frequent terms
filtered_train_df = train_df[train_df['term'].isin(frequent_terms)]

GO term frequency distribution:
count    26125.000000
mean        20.556057
std        268.143836
min          1.000000
25%          2.000000
50%          4.000000
75%         12.000000
max      33713.000000
Name: count, dtype: float64
Terms with ≥1 occurrences: 26125


In [7]:
import pandas as pd
import numpy as np
import torch
import esm
from Bio import SeqIO
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
import scipy.sparse as sp
import warnings
warnings.filterwarnings('ignore')

print("Loading ESM model...")
esm_model, alphabet = esm.pretrained.esm2_t30_150M_UR50D()
batch_converter = alphabet.get_batch_converter()
esm_model = esm_model.to(device)
esm_model.eval()
print("ESM model loaded successfully!")


Loading ESM model...
ESM model loaded successfully!


In [8]:
print("=== DATA SUMMARY ===")
print(f"Train df shape: {train_df.shape}")
print(f"Test df shape: {test_df.shape}")

print("\n=== TRAIN DATA PREVIEW ===")
print(train_df.head())
print("\n=== TEST DATA PREVIEW ===")
print(test_df.head())

print("\n=== COLUMN NAMES ===")
print(f"Train columns: {train_df.columns.tolist()}")
print(f"Test columns: {test_df.columns.tolist()}")

=== DATA SUMMARY ===
Train df shape: (537027, 6)
Test df shape: (224309, 4)

=== TRAIN DATA PREVIEW ===
     entry_id          sequence  seq_len  taxonomy        term aspect
0  A0A0C5B5G6  MRWQEMGYIFYPRKLR       16      9606  GO:0001649      P
1  A0A0C5B5G6  MRWQEMGYIFYPRKLR       16      9606  GO:0033687      P
2  A0A0C5B5G6  MRWQEMGYIFYPRKLR       16      9606  GO:0005615      C
3  A0A0C5B5G6  MRWQEMGYIFYPRKLR       16      9606  GO:0005634      C
4  A0A0C5B5G6  MRWQEMGYIFYPRKLR       16      9606  GO:0005739      C

=== TEST DATA PREVIEW ===
     entry_id tax_id                                           sequence  \
0  A0A0C5B5G6   9606                                   MRWQEMGYIFYPRKLR   
1  A0A1B0GTW7   9606  MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKS...   
2      A0JNW5   9606  MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...   
3      A0JP26   9606  MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...   
4      A0PK11   9606  MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...   

In [9]:
class ProteinDataset(Dataset):
    def __init__(self, df, max_length=1024):
        """
        df: DataFrame with 'entry_id' and 'sequence' columns
        max_length: Truncate sequences longer than this
        """
        self.df = df.reset_index(drop=True)
        self.max_length = max_length
        
    def __len__(self):
        return len(self.df)
    
    def __hash__(self):
        df_str = self.df.to_string(index=False, header=True)
        return hash(df_str)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        entry_id = row['entry_id']
        sequence = row['sequence'][:self.max_length]  # Truncate if too long
        return entry_id, sequence

import os
import torch
import pickle
from torch.utils.data import DataLoader

def get_esm_embeddings(dataset, batch_size=8, show_progress=True, cache_file="esm_embeddings.pkl"):
    """
    Extract ESM embeddings for all sequences in dataset.
    If embeddings are cached, load them from the cache file.
    
    Returns: dict {entry_id: embedding_vector}
    """
    # Check if the embeddings are already cached
    print("cache file: ", cache_file)
    if os.path.exists(cache_file):
        print(f"Loading cached embeddings from {cache_file}")
        with open(cache_file, 'rb') as f:
            embeddings = pickle.load(f)
        return embeddings
    
    print("Extracting embeddings...")

    # Initialize dataloader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    embeddings = {}

    if show_progress:
        from tqdm import tqdm
        iterator = tqdm(dataloader, desc="Extracting embeddings")
    else:
        iterator = dataloader
    
    with torch.no_grad():
        for batch in iterator:
            entry_ids, sequences = batch
            
            # Prepare batch for ESM
            batch_data = [(entry_id, sequence) for entry_id, sequence in zip(entry_ids, sequences)]
            batch_labels, batch_strs, batch_tokens = batch_converter(batch_data)
            batch_tokens = batch_tokens.to(device)
            
            # Get embeddings 
            results = esm_model(batch_tokens, repr_layers=[esm_model.num_layers], return_contacts=False)
            token_representations = results["representations"][esm_model.num_layers]
            
            # Create protein embedding (mean of all tokens except CLS and PAD)
            for i, entry_id in enumerate(entry_ids):
                seq_len = len(batch_strs[i])
                embedding = token_representations[i, 1:seq_len+1].mean(dim=0)  # Exclude CLS token
                embeddings[entry_id] = embedding.cpu().numpy()

            # Clear GPU memory
            torch.cuda.empty_cache()

    # Cache the embeddings for future use
    with open(cache_file, 'wb') as f:
        pickle.dump(embeddings, f)
        print(f"Embeddings cached to {cache_file}")
    
    return embeddings


def create_label_matrix(train_df, protein_list, term_list):
    """
    Create binary label matrix for multi-label classification
    Returns: sparse matrix (proteins x terms), protein_to_idx mapping
    """
    protein_to_idx = {pid: idx for idx, pid in enumerate(protein_list)}
    term_to_idx = {term: idx for idx, term in enumerate(term_list)}
    
    rows, cols = [], []
    for _, row in train_df.iterrows():
        if row['entry_id'] in protein_to_idx and row['term'] in term_to_idx:
            rows.append(protein_to_idx[row['entry_id']])
            cols.append(term_to_idx[row['term']])
    
    data = np.ones(len(rows))
    label_matrix = sp.csr_matrix((data, (rows, cols)), 
                                shape=(len(protein_list), len(term_list)))
    
    return label_matrix, protein_to_idx

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
import scipy.sparse as sp
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

class MultiOutputNN(nn.Module):
	def __init__(self, input_size, output_size, hidden_size=1000):
		super(MultiOutputNN, self).__init__()
		self.fc1 = nn.Linear(input_size, hidden_size)
		self.fc2 = nn.Linear(hidden_size, hidden_size)
		self.fc3 = nn.Linear(hidden_size, hidden_size)
		self.fc4 = nn.Linear(hidden_size, output_size)
		self.relu = nn.ReLU()
		self.sigmoid = nn.Sigmoid()
	
	def forward(self, x):
		x = self.relu(self.fc1(x))
		x = self.relu(self.fc2(x))
		x = self.relu(self.fc3(x))
		x = self.fc4(x)
		return self.sigmoid(x)


def training(train_df, test_df, term_counts, min_occurrence, batch_size=16):
	# Select GO terms with enough occurrences
	terms = term_counts[term_counts >= min_occurrence].index
	train = train_df[train_df['term'].isin(terms)]
		
	# Select proteins present in test_df
	# common_proteins = set(train['entry_id']).intersection(set(test_df['entry_id']))
	proteins = list(set(train['entry_id']))
		
	train = train[train['entry_id'].isin(proteins)]
	train = train.drop_duplicates(subset=['entry_id'])
	test = test_df[test_df['entry_id'].isin(proteins)]
	
	print(f"Training examples: {len(train)}")
	print(f"Unique proteins: {len(proteins)}")
	print(f"GO terms: {len(terms)}")
	# Create dataset and extract embeddings
	print("Creating ProteinDataset and extracting embeddings...")
	dataset = ProteinDataset(test)
	embeddings = get_esm_embeddings(dataset, batch_size=1, cache_file=f"generated/esm_embeddings_train.pkl")
		
	# Build sparse label matrix
	terms_2 = term_counts[(term_counts > 20)].index
	terms = terms_2
	y_train, _protein_map = create_label_matrix(train, proteins, terms)
	X_train = np.array([embeddings[pid] for pid in proteins])
	
	print(f"Feature matrix: X={X_train.shape}, Label matrix: y={y_train.shape}")
	
	def train_safe_classifier(X, y, terms, test_size=0.2, lr=0.001, epochs=10):
		X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=42)
		
		# Identify columns (GO terms) with at least 1 positive sample
		valid_cols = np.where(y_train.sum(axis=0) > 0)[0]
		if len(valid_cols) < y_train.shape[1]:
			removed_terms = [terms[i] for i in range(len(terms)) if i not in valid_cols]
			print(f"Removing {len(removed_terms)} GO terms with no positives in training: {removed_terms[:10]}{'...' if len(removed_terms) > 10 else ''}")
		
		# Filter y to valid columns (GO terms)
		y_train_filtered = y_train[:, valid_cols]
		y_val_filtered = y_val[:, valid_cols]
		filtered_terms = [terms[i] for i in valid_cols]
		
		# Train multi-output neural network model
		# base_clf = MLPClassifier(random_state=42, max_iter=100, solver='adam', hidden_layer_sizes=(100,100), activation='relu', verbose=True)
		# clf = MultiOutputClassifier(base_clf)
		# clf.fit(X_train, y_train_filtered)
		
		# y_pred = clf.predict(X_val)
		# f1 = f1_score(y_val_filtered, y_pred, average='micro')
		# precision = precision_score(y_val_filtered, y_pred, average='micro')
		# recall = recall_score(y_val_filtered, y_pred, average='micro')
		
		# print("=== VALIDATION RESULTS ===")
		# print(f"Micro F1-score: {f1:.4f}")
		# print(f"Micro Precision: {precision:.4f}")
		# print(f"Micro Recall: {recall:.4f}")
		# print(f"Predicted labels: {y_pred.sum()} / {y_val_filtered.sum()} actual")
		
		# return clf, (f1, precision, recall), filtered_terms

		X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
		y_train_tensor = torch.tensor(y_train_filtered, dtype=torch.float32).to(device)
		X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
		y_val_tensor = torch.tensor(y_val_filtered, dtype=torch.float32).to(device)
		
		train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
		train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
		
		# Initialize the model
		input_size = X_train.shape[1]
		output_size = y_train_filtered.shape[1]
		model = MultiOutputNN(input_size, output_size)
		model = model.to(device)
		
		# Define loss function and optimizer
		criterion = nn.BCELoss()  # Use BCE loss for multi-label classification
		optimizer = optim.Adam(model.parameters(), lr=lr)

		# Training loop
		for epoch in range(epochs):
			model.train()
			epoch_loss = 0.0
			with tqdm(train_loader, desc=f"Epoch [{epoch+1}/{epochs}]", unit="batch", ncols=100) as t:
				for batch_idx, (X_batch, y_batch) in enumerate(t):
					optimizer.zero_grad()

					# Forward pass
					y_pred_train = model(X_batch)

					# Compute loss
					loss = criterion(y_pred_train, y_batch)
					epoch_loss += loss.item()

					# Backward pass and optimization
					loss.backward()
					optimizer.step()

					# Update the tqdm progress bar with current loss
					t.set_postfix(loss=epoch_loss / (batch_idx + 1))
			
			avg_loss = epoch_loss / len(train_loader)
			print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.8f}")

		model.eval()
		with torch.no_grad():
			y_pred_val = model(X_val_tensor)
			# print(y_pred_val)
			y_pred_val = (y_pred_val > 0.01).float()  # Convert logits to binary predictions (0 or 1)

			f1 = f1_score(y_val_tensor.cpu(), y_pred_val.cpu(), average='micro')
			precision = precision_score(y_val_tensor.cpu(), y_pred_val.cpu(), average='micro')
			recall = recall_score(y_val_tensor.cpu(), y_pred_val.cpu(), average='micro')

			print("=== VALIDATION RESULTS ===")
			print(f"Micro F1-score: {f1:.4f}")
			print(f"Micro Precision: {precision:.4f}")
			print(f"Micro Recall: {recall:.4f}")
			print(f"Predicted labels: {y_pred_val.sum()} / {y_val_tensor.sum()} actual")
		
		return model, (f1, precision, recall), filtered_terms
		
	# Train and evaluate
	classifier, metrics, filtered_terms = train_safe_classifier(X_train, y_train.toarray(), terms)
		
	return classifier, metrics, filtered_terms, X_train, y_train

classifier, metrics, filtered_terms, X_train_after, y_train_after = training(
	filtered_train_df, test_df, term_counts,
	min_occurrence=1,
	batch_size=128
)


Training examples: 82404
Unique proteins: 82404
GO terms: 26125
Creating ProteinDataset and extracting embeddings...
cache file:  generated/esm_embeddings_train.pkl
Loading cached embeddings from generated/esm_embeddings_train.pkl
Feature matrix: X=(82404, 640), Label matrix: y=(82404, 22561)
Removing 16234 GO terms with no positives in training: ['GO:0006325', 'GO:0000165', 'GO:0070588', 'GO:0051402', 'GO:0031647', 'GO:0061844', 'GO:0007611', 'GO:0071346', 'GO:0006094', 'GO:0007052']...


: 

In [None]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

def predict(classifier, test_df, term_list, batch_size=1000, esm_batch_size=8, output_file="predictions.csv"):
	"""
	Predict GO terms for a large set of proteins.
	Saves predictions incrementally to CSV to avoid memory issues.
			
	Parameters:
		classifier : trained multi-output classifier
		test_df   : DataFrame with 'entry_id' and 'sequence'
		term_list  : list of terms used during training
		batch_size : number of proteins per batch
		esm_batch_size : batch size for ESM embeddings
		output_file: path to save predictions CSV
	"""

	classifier.eval()
	# Initialize CSV with headers
	columns = ['entry_id', 'GO_term', 'probability']
	pd.DataFrame(columns=columns).to_csv(output_file, index=False)
		
	# Process in batches
	num_batches = (len(test_df) - 1) // batch_size + 1
	for i in range(num_batches):
		batch_df = test_df.iloc[i*batch_size : (i+1)*batch_size]
		batch_dataset = ProteinDataset(batch_df)
		
		print(f"Processing batch {i+1}/{num_batches} ({len(batch_df)} proteins)...")
		
		# Extract embeddings
		cache_file = f"generated/esm_embeddings_test_{i}.pkl"
		batch_embeddings = get_esm_embeddings(batch_dataset, batch_size=esm_batch_size, show_progress=True, cache_file=cache_file)
		batch_proteins = list(batch_embeddings.keys())
		X_batch = np.array([batch_embeddings[pid] for pid in batch_proteins])
		
		# Predictions
		X_batch_tensor = torch.tensor(X_batch, dtype=torch.float32).to(device)
		y_pred_batch = classifier(X_batch_tensor)
		y_pred_batch = y_pred_batch.cpu()
		# Prepare rows to save
		rows = []
		for j, pid in enumerate(batch_proteins):
			prob_list = []
			for k, term in enumerate(term_list):
				prob = y_pred_batch[j, k].item()

				if (prob > 1e-4):
					prob_list.append((term, prob))
			sorted_prob_list = sorted(prob_list, key=lambda x: -x[1])[:20]
			for term, prob in sorted_prob_list:
				rows.append({'entry_id': pid, 'GO_term': term, 'probability': prob})

		# Append to CSV
		pd.DataFrame(rows).to_csv(output_file, mode='a', header=False, index=False)
		
	print(f"Predictions saved to {output_file}")
	return output_file

# Predict on all proteins (~225k)
predict(
    classifier=classifier,
    test_df=test_df,
    term_list=filtered_terms,
    batch_size=1000,      # process 1k proteins per loop
    esm_batch_size=1,     # ESM embeddings batch size for GPU
    output_file="protein_predictions1.csv"
)

Processing batch 1/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_0.pkl
Loading cached embeddings from generated/esm_embeddings_test_0.pkl
Processing batch 2/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_1.pkl
Loading cached embeddings from generated/esm_embeddings_test_1.pkl
Processing batch 3/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_2.pkl
Loading cached embeddings from generated/esm_embeddings_test_2.pkl
Processing batch 4/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_3.pkl
Loading cached embeddings from generated/esm_embeddings_test_3.pkl
Processing batch 5/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_4.pkl
Loading cached embeddings from generated/esm_embeddings_test_4.pkl
Processing batch 6/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_5.pkl
Loading cached embeddings from generated/esm_embeddings_test_5.pkl
Processing batch 7/225 (1000 proteins)...
cache file

Extracting embeddings: 100%|██████████| 1000/1000 [03:48<00:00,  4.38it/s]


Embeddings cached to generated/esm_embeddings_test_30.pkl
Processing batch 32/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_31.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:30<00:00,  4.75it/s]


Embeddings cached to generated/esm_embeddings_test_31.pkl
Processing batch 33/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_32.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s]


Embeddings cached to generated/esm_embeddings_test_32.pkl
Processing batch 34/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_33.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:29<00:00,  4.77it/s]


Embeddings cached to generated/esm_embeddings_test_33.pkl
Processing batch 35/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_34.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:40<00:00,  6.24it/s]


Embeddings cached to generated/esm_embeddings_test_34.pkl
Processing batch 36/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_35.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:39<00:00,  6.25it/s]


Embeddings cached to generated/esm_embeddings_test_35.pkl
Processing batch 37/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_36.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:15<00:00,  5.12it/s]


Embeddings cached to generated/esm_embeddings_test_36.pkl
Processing batch 38/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_37.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:17<00:00,  5.07it/s]


Embeddings cached to generated/esm_embeddings_test_37.pkl
Processing batch 39/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_38.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:13<00:00,  5.16it/s]


Embeddings cached to generated/esm_embeddings_test_38.pkl
Processing batch 40/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_39.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:35<00:00,  4.65it/s]


Embeddings cached to generated/esm_embeddings_test_39.pkl
Processing batch 41/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_40.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:40<00:00,  6.23it/s]


Embeddings cached to generated/esm_embeddings_test_40.pkl
Processing batch 42/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_41.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:35<00:00,  6.41it/s]


Embeddings cached to generated/esm_embeddings_test_41.pkl
Processing batch 43/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_42.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [04:08<00:00,  4.02it/s]


Embeddings cached to generated/esm_embeddings_test_42.pkl
Processing batch 44/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_43.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:26<00:00,  4.85it/s]


Embeddings cached to generated/esm_embeddings_test_43.pkl
Processing batch 45/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_44.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:26<00:00,  6.84it/s]


Embeddings cached to generated/esm_embeddings_test_44.pkl
Processing batch 46/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_45.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:01<00:00,  5.51it/s]


Embeddings cached to generated/esm_embeddings_test_45.pkl
Processing batch 47/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_46.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:41<00:00,  6.17it/s]


Embeddings cached to generated/esm_embeddings_test_46.pkl
Processing batch 48/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_47.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:29<00:00,  4.77it/s]


Embeddings cached to generated/esm_embeddings_test_47.pkl
Processing batch 49/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_48.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:12<00:00,  5.20it/s]


Embeddings cached to generated/esm_embeddings_test_48.pkl
Processing batch 50/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_49.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:53<00:00,  4.29it/s]


Embeddings cached to generated/esm_embeddings_test_49.pkl
Processing batch 51/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_50.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:13<00:00,  5.18it/s]


Embeddings cached to generated/esm_embeddings_test_50.pkl
Processing batch 52/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_51.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:56<00:00,  4.22it/s]


Embeddings cached to generated/esm_embeddings_test_51.pkl
Processing batch 53/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_52.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:17<00:00,  5.06it/s] 


Embeddings cached to generated/esm_embeddings_test_52.pkl
Processing batch 54/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_53.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:06<00:00,  5.36it/s]


Embeddings cached to generated/esm_embeddings_test_53.pkl
Processing batch 55/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_54.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:43<00:00,  6.11it/s]


Embeddings cached to generated/esm_embeddings_test_54.pkl
Processing batch 56/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_55.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:53<00:00,  5.76it/s]


Embeddings cached to generated/esm_embeddings_test_55.pkl
Processing batch 57/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_56.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:38<00:00,  6.30it/s]


Embeddings cached to generated/esm_embeddings_test_56.pkl
Processing batch 58/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_57.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:43<00:00,  6.10it/s]


Embeddings cached to generated/esm_embeddings_test_57.pkl
Processing batch 59/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_58.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:51<00:00,  5.84it/s]


Embeddings cached to generated/esm_embeddings_test_58.pkl
Processing batch 60/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_59.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:59<00:00,  5.58it/s]


Embeddings cached to generated/esm_embeddings_test_59.pkl
Processing batch 61/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_60.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:57<00:00,  5.63it/s]


Embeddings cached to generated/esm_embeddings_test_60.pkl
Processing batch 62/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_61.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:39<00:00,  6.28it/s]


Embeddings cached to generated/esm_embeddings_test_61.pkl
Processing batch 63/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_62.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:11<00:00,  5.22it/s]


Embeddings cached to generated/esm_embeddings_test_62.pkl
Processing batch 64/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_63.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:49<00:00,  5.90it/s]


Embeddings cached to generated/esm_embeddings_test_63.pkl
Processing batch 65/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_64.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:40<00:00,  6.23it/s]


Embeddings cached to generated/esm_embeddings_test_64.pkl
Processing batch 66/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_65.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:51<00:00,  5.84it/s]


Embeddings cached to generated/esm_embeddings_test_65.pkl
Processing batch 67/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_66.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:34<00:00,  6.49it/s]


Embeddings cached to generated/esm_embeddings_test_66.pkl
Processing batch 68/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_67.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:17<00:00,  7.26it/s]


Embeddings cached to generated/esm_embeddings_test_67.pkl
Processing batch 69/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_68.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:37<00:00,  6.36it/s]


Embeddings cached to generated/esm_embeddings_test_68.pkl
Processing batch 70/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_69.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:29<00:00,  6.69it/s]


Embeddings cached to generated/esm_embeddings_test_69.pkl
Processing batch 71/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_70.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:32<00:00,  6.54it/s]


Embeddings cached to generated/esm_embeddings_test_70.pkl
Processing batch 72/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_71.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:15<00:00,  7.37it/s]


Embeddings cached to generated/esm_embeddings_test_71.pkl
Processing batch 73/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_72.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:24<00:00,  6.94it/s]


Embeddings cached to generated/esm_embeddings_test_72.pkl
Processing batch 74/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_73.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:14<00:00,  5.14it/s]


Embeddings cached to generated/esm_embeddings_test_73.pkl
Processing batch 75/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_74.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:11<00:00,  7.63it/s]


Embeddings cached to generated/esm_embeddings_test_74.pkl
Processing batch 76/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_75.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:41<00:00,  6.20it/s]


Embeddings cached to generated/esm_embeddings_test_75.pkl
Processing batch 77/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_76.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:47<00:00,  5.96it/s]


Embeddings cached to generated/esm_embeddings_test_76.pkl
Processing batch 78/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_77.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:33<00:00,  6.52it/s]


Embeddings cached to generated/esm_embeddings_test_77.pkl
Processing batch 79/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_78.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:05<00:00,  7.99it/s]


Embeddings cached to generated/esm_embeddings_test_78.pkl
Processing batch 80/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_79.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:59<00:00,  5.57it/s]


Embeddings cached to generated/esm_embeddings_test_79.pkl
Processing batch 81/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_80.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:08<00:00,  5.30it/s]


Embeddings cached to generated/esm_embeddings_test_80.pkl
Processing batch 82/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_81.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:14<00:00,  7.43it/s]


Embeddings cached to generated/esm_embeddings_test_81.pkl
Processing batch 83/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_82.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:15<00:00,  7.38it/s]


Embeddings cached to generated/esm_embeddings_test_82.pkl
Processing batch 84/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_83.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:40<00:00,  6.21it/s]


Embeddings cached to generated/esm_embeddings_test_83.pkl
Processing batch 85/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_84.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:57<00:00,  5.63it/s]


Embeddings cached to generated/esm_embeddings_test_84.pkl
Processing batch 86/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_85.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:48<00:00,  9.19it/s]


Embeddings cached to generated/esm_embeddings_test_85.pkl
Processing batch 87/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_86.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:40<00:00,  6.24it/s]


Embeddings cached to generated/esm_embeddings_test_86.pkl
Processing batch 88/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_87.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:58<00:00,  5.59it/s]


Embeddings cached to generated/esm_embeddings_test_87.pkl
Processing batch 89/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_88.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:38<00:00,  6.32it/s]


Embeddings cached to generated/esm_embeddings_test_88.pkl
Processing batch 90/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_89.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:25<00:00,  6.90it/s]


Embeddings cached to generated/esm_embeddings_test_89.pkl
Processing batch 91/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_90.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:59<00:00,  5.58it/s]


Embeddings cached to generated/esm_embeddings_test_90.pkl
Processing batch 92/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_91.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:22<00:00,  7.02it/s]


Embeddings cached to generated/esm_embeddings_test_91.pkl
Processing batch 93/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_92.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:12<00:00,  7.57it/s]


Embeddings cached to generated/esm_embeddings_test_92.pkl
Processing batch 94/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_93.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:31<00:00,  6.61it/s]


Embeddings cached to generated/esm_embeddings_test_93.pkl
Processing batch 95/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_94.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:18<00:00,  7.19it/s]


Embeddings cached to generated/esm_embeddings_test_94.pkl
Processing batch 96/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_95.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:46<00:00,  6.01it/s]


Embeddings cached to generated/esm_embeddings_test_95.pkl
Processing batch 97/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_96.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:27<00:00,  6.77it/s]


Embeddings cached to generated/esm_embeddings_test_96.pkl
Processing batch 98/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_97.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:19<00:00,  7.19it/s]


Embeddings cached to generated/esm_embeddings_test_97.pkl
Processing batch 99/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_98.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:30<00:00,  6.67it/s]


Embeddings cached to generated/esm_embeddings_test_98.pkl
Processing batch 100/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_99.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:16<00:00,  7.32it/s]


Embeddings cached to generated/esm_embeddings_test_99.pkl
Processing batch 101/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_100.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:09<00:00,  7.69it/s]


Embeddings cached to generated/esm_embeddings_test_100.pkl
Processing batch 102/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_101.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:55<00:00,  8.64it/s]


Embeddings cached to generated/esm_embeddings_test_101.pkl
Processing batch 103/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_102.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:07<00:00,  5.33it/s]


Embeddings cached to generated/esm_embeddings_test_102.pkl
Processing batch 104/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_103.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:32<00:00,  4.70it/s]


Embeddings cached to generated/esm_embeddings_test_103.pkl
Processing batch 105/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_104.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:28<00:00,  4.79it/s]


Embeddings cached to generated/esm_embeddings_test_104.pkl
Processing batch 106/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_105.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:34<00:00,  6.48it/s]


Embeddings cached to generated/esm_embeddings_test_105.pkl
Processing batch 107/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_106.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:15<00:00,  5.11it/s]


Embeddings cached to generated/esm_embeddings_test_106.pkl
Processing batch 108/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_107.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [03:03<00:00,  5.45it/s]


Embeddings cached to generated/esm_embeddings_test_107.pkl
Processing batch 109/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_108.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:24<00:00,  6.94it/s]


Embeddings cached to generated/esm_embeddings_test_108.pkl
Processing batch 110/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_109.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:09<00:00,  7.74it/s]


Embeddings cached to generated/esm_embeddings_test_109.pkl
Processing batch 111/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_110.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:51<00:00,  5.84it/s]


Embeddings cached to generated/esm_embeddings_test_110.pkl
Processing batch 112/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_111.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:22<00:00,  7.01it/s]


Embeddings cached to generated/esm_embeddings_test_111.pkl
Processing batch 113/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_112.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:24<00:00,  6.94it/s]


Embeddings cached to generated/esm_embeddings_test_112.pkl
Processing batch 114/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_113.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:44<00:00,  6.09it/s]


Embeddings cached to generated/esm_embeddings_test_113.pkl
Processing batch 115/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_114.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:29<00:00,  6.67it/s]


Embeddings cached to generated/esm_embeddings_test_114.pkl
Processing batch 116/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_115.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:46<00:00,  9.40it/s]


Embeddings cached to generated/esm_embeddings_test_115.pkl
Processing batch 117/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_116.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:34<00:00, 10.54it/s]


Embeddings cached to generated/esm_embeddings_test_116.pkl
Processing batch 118/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_117.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:13<00:00, 13.65it/s]


Embeddings cached to generated/esm_embeddings_test_117.pkl
Processing batch 119/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_118.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:08<00:00,  7.76it/s]


Embeddings cached to generated/esm_embeddings_test_118.pkl
Processing batch 120/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_119.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:14<00:00,  7.43it/s]


Embeddings cached to generated/esm_embeddings_test_119.pkl
Processing batch 121/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_120.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:10<00:00,  7.66it/s]


Embeddings cached to generated/esm_embeddings_test_120.pkl
Processing batch 122/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_121.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:45<00:00,  9.46it/s]


Embeddings cached to generated/esm_embeddings_test_121.pkl
Processing batch 123/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_122.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:26<00:00,  6.82it/s]


Embeddings cached to generated/esm_embeddings_test_122.pkl
Processing batch 124/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_123.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [02:07<00:00,  7.82it/s]


Embeddings cached to generated/esm_embeddings_test_123.pkl
Processing batch 125/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_124.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:47<00:00,  9.28it/s]


Embeddings cached to generated/esm_embeddings_test_124.pkl
Processing batch 126/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_125.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:00<00:00, 16.65it/s]


Embeddings cached to generated/esm_embeddings_test_125.pkl
Processing batch 127/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_126.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:02<00:00, 15.94it/s]


Embeddings cached to generated/esm_embeddings_test_126.pkl
Processing batch 128/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_127.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:24<00:00, 11.82it/s]


Embeddings cached to generated/esm_embeddings_test_127.pkl
Processing batch 129/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_128.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]


Embeddings cached to generated/esm_embeddings_test_128.pkl
Processing batch 130/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_129.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:45<00:00,  9.46it/s]


Embeddings cached to generated/esm_embeddings_test_129.pkl
Processing batch 131/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_130.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:22<00:00, 12.07it/s]


Embeddings cached to generated/esm_embeddings_test_130.pkl
Processing batch 132/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_131.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:32<00:00, 10.84it/s]


Embeddings cached to generated/esm_embeddings_test_131.pkl
Processing batch 133/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_132.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:45<00:00,  9.44it/s]


Embeddings cached to generated/esm_embeddings_test_132.pkl
Processing batch 134/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_133.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:35<00:00, 10.45it/s]


Embeddings cached to generated/esm_embeddings_test_133.pkl
Processing batch 135/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_134.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:15<00:00, 13.33it/s]


Embeddings cached to generated/esm_embeddings_test_134.pkl
Processing batch 136/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_135.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:07<00:00, 14.87it/s]


Embeddings cached to generated/esm_embeddings_test_135.pkl
Processing batch 137/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_136.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:53<00:00, 18.60it/s]


Embeddings cached to generated/esm_embeddings_test_136.pkl
Processing batch 138/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_137.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:39<00:00, 10.00it/s]


Embeddings cached to generated/esm_embeddings_test_137.pkl
Processing batch 139/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_138.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:43<00:00,  9.68it/s]


Embeddings cached to generated/esm_embeddings_test_138.pkl
Processing batch 140/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_139.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:38<00:00, 10.16it/s]


Embeddings cached to generated/esm_embeddings_test_139.pkl
Processing batch 141/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_140.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:25<00:00, 11.70it/s]


Embeddings cached to generated/esm_embeddings_test_140.pkl
Processing batch 142/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_141.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:44<00:00,  9.56it/s]


Embeddings cached to generated/esm_embeddings_test_141.pkl
Processing batch 143/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_142.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:38<00:00, 10.11it/s]


Embeddings cached to generated/esm_embeddings_test_142.pkl
Processing batch 144/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_143.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:34<00:00, 10.61it/s]


Embeddings cached to generated/esm_embeddings_test_143.pkl
Processing batch 145/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_144.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:06<00:00, 15.12it/s]


Embeddings cached to generated/esm_embeddings_test_144.pkl
Processing batch 146/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_145.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:07<00:00, 14.91it/s]


Embeddings cached to generated/esm_embeddings_test_145.pkl
Processing batch 147/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_146.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:18<00:00, 12.68it/s]


Embeddings cached to generated/esm_embeddings_test_146.pkl
Processing batch 148/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_147.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:38<00:00, 10.16it/s]


Embeddings cached to generated/esm_embeddings_test_147.pkl
Processing batch 149/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_148.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:48<00:00,  9.21it/s]


Embeddings cached to generated/esm_embeddings_test_148.pkl
Processing batch 150/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_149.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:26<00:00, 11.55it/s]


Embeddings cached to generated/esm_embeddings_test_149.pkl
Processing batch 151/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_150.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:32<00:00, 10.86it/s]


Embeddings cached to generated/esm_embeddings_test_150.pkl
Processing batch 152/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_151.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:52<00:00,  8.87it/s]


Embeddings cached to generated/esm_embeddings_test_151.pkl
Processing batch 153/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_152.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:40<00:00,  9.92it/s]


Embeddings cached to generated/esm_embeddings_test_152.pkl
Processing batch 154/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_153.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:16<00:00, 13.12it/s]


Embeddings cached to generated/esm_embeddings_test_153.pkl
Processing batch 155/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_154.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:09<00:00, 14.47it/s]


Embeddings cached to generated/esm_embeddings_test_154.pkl
Processing batch 156/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_155.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:57<00:00, 17.34it/s]


Embeddings cached to generated/esm_embeddings_test_155.pkl
Processing batch 157/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_156.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:38<00:00, 10.15it/s]


Embeddings cached to generated/esm_embeddings_test_156.pkl
Processing batch 158/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_157.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:45<00:00,  9.51it/s]


Embeddings cached to generated/esm_embeddings_test_157.pkl
Processing batch 159/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_158.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:40<00:00,  9.90it/s]


Embeddings cached to generated/esm_embeddings_test_158.pkl
Processing batch 160/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_159.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:26<00:00, 11.57it/s]


Embeddings cached to generated/esm_embeddings_test_159.pkl
Processing batch 161/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_160.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:18<00:00, 12.72it/s]


Embeddings cached to generated/esm_embeddings_test_160.pkl
Processing batch 162/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_161.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:17<00:00, 12.88it/s]


Embeddings cached to generated/esm_embeddings_test_161.pkl
Processing batch 163/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_162.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:08<00:00, 14.65it/s]


Embeddings cached to generated/esm_embeddings_test_162.pkl
Processing batch 164/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_163.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:30<00:00, 11.08it/s]


Embeddings cached to generated/esm_embeddings_test_163.pkl
Processing batch 165/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_164.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:19<00:00, 12.55it/s]


Embeddings cached to generated/esm_embeddings_test_164.pkl
Processing batch 166/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_165.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:14<00:00, 13.48it/s]


Embeddings cached to generated/esm_embeddings_test_165.pkl
Processing batch 167/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_166.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:30<00:00, 11.02it/s]


Embeddings cached to generated/esm_embeddings_test_166.pkl
Processing batch 168/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_167.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:17<00:00, 12.98it/s]


Embeddings cached to generated/esm_embeddings_test_167.pkl
Processing batch 169/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_168.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:24<00:00, 11.84it/s]


Embeddings cached to generated/esm_embeddings_test_168.pkl
Processing batch 170/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_169.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:27<00:00, 11.39it/s]


Embeddings cached to generated/esm_embeddings_test_169.pkl
Processing batch 171/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_170.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:12<00:00, 13.72it/s]


Embeddings cached to generated/esm_embeddings_test_170.pkl
Processing batch 172/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_171.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:22<00:00, 12.09it/s]


Embeddings cached to generated/esm_embeddings_test_171.pkl
Processing batch 173/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_172.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:29<00:00, 11.14it/s]


Embeddings cached to generated/esm_embeddings_test_172.pkl
Processing batch 174/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_173.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:03<00:00, 15.63it/s]


Embeddings cached to generated/esm_embeddings_test_173.pkl
Processing batch 175/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_174.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:26<00:00, 11.52it/s]


Embeddings cached to generated/esm_embeddings_test_174.pkl
Processing batch 176/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_175.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:14<00:00, 13.50it/s]


Embeddings cached to generated/esm_embeddings_test_175.pkl
Processing batch 177/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_176.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:06<00:00, 15.04it/s]


Embeddings cached to generated/esm_embeddings_test_176.pkl
Processing batch 178/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_177.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:04<00:00, 15.51it/s]


Embeddings cached to generated/esm_embeddings_test_177.pkl
Processing batch 179/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_178.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:53<00:00, 18.84it/s]


Embeddings cached to generated/esm_embeddings_test_178.pkl
Processing batch 180/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_179.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:37<00:00, 26.96it/s]


Embeddings cached to generated/esm_embeddings_test_179.pkl
Processing batch 181/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_180.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:03<00:00, 15.78it/s]


Embeddings cached to generated/esm_embeddings_test_180.pkl
Processing batch 182/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_181.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:19<00:00, 12.54it/s]


Embeddings cached to generated/esm_embeddings_test_181.pkl
Processing batch 183/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_182.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:09<00:00, 14.45it/s]


Embeddings cached to generated/esm_embeddings_test_182.pkl
Processing batch 184/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_183.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:05<00:00, 15.17it/s]


Embeddings cached to generated/esm_embeddings_test_183.pkl
Processing batch 185/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_184.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:05<00:00, 15.31it/s]


Embeddings cached to generated/esm_embeddings_test_184.pkl
Processing batch 186/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_185.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:04<00:00, 15.52it/s]


Embeddings cached to generated/esm_embeddings_test_185.pkl
Processing batch 187/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_186.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:47<00:00, 21.16it/s]


Embeddings cached to generated/esm_embeddings_test_186.pkl
Processing batch 188/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_187.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:43<00:00, 23.16it/s]


Embeddings cached to generated/esm_embeddings_test_187.pkl
Processing batch 189/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_188.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:06<00:00, 15.08it/s]


Embeddings cached to generated/esm_embeddings_test_188.pkl
Processing batch 190/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_189.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:20<00:00, 12.45it/s]


Embeddings cached to generated/esm_embeddings_test_189.pkl
Processing batch 191/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_190.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:05<00:00, 15.19it/s]


Embeddings cached to generated/esm_embeddings_test_190.pkl
Processing batch 192/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_191.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:06<00:00, 15.08it/s]


Embeddings cached to generated/esm_embeddings_test_191.pkl
Processing batch 193/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_192.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:05<00:00, 15.20it/s]


Embeddings cached to generated/esm_embeddings_test_192.pkl
Processing batch 194/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_193.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:54<00:00, 18.48it/s]


Embeddings cached to generated/esm_embeddings_test_193.pkl
Processing batch 195/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_194.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:40<00:00, 24.70it/s]


Embeddings cached to generated/esm_embeddings_test_194.pkl
Processing batch 196/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_195.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:58<00:00, 17.17it/s]


Embeddings cached to generated/esm_embeddings_test_195.pkl
Processing batch 197/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_196.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:10<00:00, 14.11it/s]


Embeddings cached to generated/esm_embeddings_test_196.pkl
Processing batch 198/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_197.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:12<00:00, 13.75it/s]


Embeddings cached to generated/esm_embeddings_test_197.pkl
Processing batch 199/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_198.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:06<00:00, 15.14it/s]


Embeddings cached to generated/esm_embeddings_test_198.pkl
Processing batch 200/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_199.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:08<00:00, 14.53it/s]


Embeddings cached to generated/esm_embeddings_test_199.pkl
Processing batch 201/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_200.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:06<00:00, 14.97it/s]


Embeddings cached to generated/esm_embeddings_test_200.pkl
Processing batch 202/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_201.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:48<00:00, 20.66it/s]


Embeddings cached to generated/esm_embeddings_test_201.pkl
Processing batch 203/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_202.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:39<00:00, 25.35it/s]


Embeddings cached to generated/esm_embeddings_test_202.pkl
Processing batch 204/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_203.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:03<00:00, 15.73it/s]


Embeddings cached to generated/esm_embeddings_test_203.pkl
Processing batch 205/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_204.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:16<00:00, 13.09it/s]


Embeddings cached to generated/esm_embeddings_test_204.pkl
Processing batch 206/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_205.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:07<00:00, 14.83it/s]


Embeddings cached to generated/esm_embeddings_test_205.pkl
Processing batch 207/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_206.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:03<00:00, 15.73it/s]


Embeddings cached to generated/esm_embeddings_test_206.pkl
Processing batch 208/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_207.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:05<00:00, 15.21it/s]


Embeddings cached to generated/esm_embeddings_test_207.pkl
Processing batch 209/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_208.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:01<00:00, 16.26it/s]


Embeddings cached to generated/esm_embeddings_test_208.pkl
Processing batch 210/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_209.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:46<00:00, 21.45it/s]


Embeddings cached to generated/esm_embeddings_test_209.pkl
Processing batch 211/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_210.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:48<00:00, 20.83it/s]


Embeddings cached to generated/esm_embeddings_test_210.pkl
Processing batch 212/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_211.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:04<00:00, 15.56it/s]


Embeddings cached to generated/esm_embeddings_test_211.pkl
Processing batch 213/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_212.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:19<00:00, 12.52it/s]


Embeddings cached to generated/esm_embeddings_test_212.pkl
Processing batch 214/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_213.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:00<00:00, 16.65it/s]


Embeddings cached to generated/esm_embeddings_test_213.pkl
Processing batch 215/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_214.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:42<00:00, 23.34it/s]


Embeddings cached to generated/esm_embeddings_test_214.pkl
Processing batch 216/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_215.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:51<00:00, 19.53it/s]


Embeddings cached to generated/esm_embeddings_test_215.pkl
Processing batch 217/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_216.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:52<00:00, 19.17it/s]


Embeddings cached to generated/esm_embeddings_test_216.pkl
Processing batch 218/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_217.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:49<00:00, 20.11it/s]


Embeddings cached to generated/esm_embeddings_test_217.pkl
Processing batch 219/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_218.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:49<00:00, 20.20it/s]


Embeddings cached to generated/esm_embeddings_test_218.pkl
Processing batch 220/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_219.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:53<00:00, 18.65it/s]


Embeddings cached to generated/esm_embeddings_test_219.pkl
Processing batch 221/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_220.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:49<00:00, 20.17it/s]


Embeddings cached to generated/esm_embeddings_test_220.pkl
Processing batch 222/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_221.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:47<00:00, 21.10it/s]


Embeddings cached to generated/esm_embeddings_test_221.pkl
Processing batch 223/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_222.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:45<00:00, 21.99it/s]


Embeddings cached to generated/esm_embeddings_test_222.pkl
Processing batch 224/225 (1000 proteins)...
cache file:  generated/esm_embeddings_test_223.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [00:47<00:00, 21.20it/s]


Embeddings cached to generated/esm_embeddings_test_223.pkl
Processing batch 225/225 (309 proteins)...
cache file:  generated/esm_embeddings_test_224.pkl
Extracting embeddings...


Extracting embeddings: 100%|██████████| 309/309 [00:18<00:00, 17.13it/s]


Embeddings cached to generated/esm_embeddings_test_224.pkl
Predictions saved to protein_predictions1.csv


'protein_predictions1.csv'

In [None]:
import pandas as pd
import numpy as np

def prepare_cafa_submission(pred_csv, go_obo_file, output_file="submission.tsv", max_terms=1500):
    """
    - Loads raw prediction CSV (entry_id, GO_term, probability)
    - Propagates predictions up the GO hierarchy
    - Limits to max_terms per protein
    - Formats final CAFA submission file
    """
    print("\n=== Preparing CAFA Submission File ===")

    print("Loading predictions...")
    df = pd.read_csv(pred_csv)

    print("Loading GO ontology...")
    parents = {}

    # Parse go-basic.obo to build term → parents dictionary
    with open(go_obo_file, 'r') as f:
        current_term = None
        for line in f:
            line = line.strip()
            if line.startswith("id: GO:"):
                current_term = line.split("id: ")[1]
                parents[current_term] = []
            if line.startswith("is_a: GO:") and current_term:
                parent_term = line.split("is_a: ")[1].split()[0]
                parents[current_term].append(parent_term)

    print("Propagating predictions to parent terms...")
    propagated_rows = []

    for protein, group in df.groupby("entry_id"):
        term_scores = dict(zip(group.GO_term, group.probability))

        # BFS upward propagation
        queue = list(term_scores.keys())
        visited = set()

        while queue:
            term = queue.pop()
            if term in visited: 
                continue
            visited.add(term)

            if term in parents:
                for p in parents[term]:
                    new_score = term_scores.get(term, 0)
                    term_scores[p] = max(term_scores.get(p, 0), new_score)
                    queue.append(p)

        # Keep highest prob terms and limit to max_terms
        top_terms = sorted(term_scores.items(), key=lambda x: x[1], reverse=True)[:max_terms]

        for term, score in top_terms:
            if score > 0:  # CAFA requirement: no zeroes
                propagated_rows.append([protein, term, round(score, 3)])

    print("Saving formatted submission file...")
    sub = pd.DataFrame(propagated_rows, columns=["ProteinID", "GO_ID", "Score"])
    sub.to_csv(output_file, sep="\t", index=False, header=False)

    print(f"\nSubmission ready: {output_file}")
    print(f"Total predictions: {len(sub)}")
    return sub


In [None]:
prepare_cafa_submission(
    pred_csv="protein_predictions1.csv",
    go_obo_file="data/Train/go-basic.obo",
    output_file="submission.tsv"
)



=== Preparing CAFA Submission File ===
Loading predictions...
Loading GO ontology...
Propagating predictions to parent terms...
Saving formatted submission file...

Submission ready: submission.tsv
Total predictions: 5232732


Unnamed: 0,ProteinID,GO_ID,Score
0,A0A017SE81,GO:0010842,0.003
1,A0A017SE81,GO:0008150,0.003
2,A0A017SE81,GO:0048646,0.003
3,A0A017SE81,GO:0032502,0.003
4,A0A017SE81,GO:0016706,0.001
...,...,...,...
5232727,X6R8R1,GO:0042592,0.002
5232728,X6R8R1,GO:0007417,0.000
5232729,X6R8R1,GO:0048731,0.000
5232730,X6R8R1,GO:0048856,0.000


In [None]:
# kaggle competitions submit -c cafa-6-protein-function-prediction -f submission.tsv -m "esm 150m + nn 100x100"