# Installing the Facebook ESM embeddings package

In [None]:
# Install the ESM package directly from the facebookresearch/esm GitHub repository.
!pip install git+https://github.com/facebookresearch/esm.git

In [None]:
import torch
import esm

# Load the 34-layer pretrained ESM-1 model together with its alphabet.
model, alphabet = esm.pretrained.esm1_t34_670M_UR50S()
# Put the model in evaluation mode before moving it to the GPU: this disables
# dropout so that the same sequence always produces the same embedding.
model = model.eval().cuda()

# Callable that turns (label, sequence) pairs into (labels, strings, tokens).
batch_converter = alphabet.get_batch_converter()


In [None]:
import random
from collections import Counter
from tqdm import tqdm

import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDRegressor

In [None]:
def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items.

    Args:
        iterable: A sized, sliceable sequence (for example a list).
        n: Maximum number of items per slice; must be at least 1.

    Yields:
        Slices of ``iterable`` in their original order; the last slice may
        contain fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # A zero step makes range() fail with an unrelated message and a
        # negative step silently yields nothing; reject both explicitly.
        raise ValueError(f"batch size must be >= 1, got {n}")
    # Slicing clamps at the end of the sequence, so no explicit bound is needed.
    for start in range(0, len(iterable), n):
        yield iterable[start:start + n]

# PART 1: Generating prediction model

# 1.1 Preparing data for training

In [None]:
# Load the labelled training sequences from a FASTA file.
# The last whitespace-separated field of every header is parsed as the
# numeric target value for that sequence.

FASTA_PATH = '/path/to/the/fasta'  # FASTA file to train on (no stray whitespace in the path)

data = []  # (header, sequence) pairs in file order
ys = []    # float target per sequence, aligned with `data`
Xs = []    # placeholder; replaced by the embedding matrix in the split cell
for header, sequence in esm.data.read_fasta(FASTA_PATH):
    data.append((header, sequence))
    # rsplit() without an explicit separator ignores trailing whitespace, so a
    # header ending in a space or tab still yields its numeric label.
    body = header.rsplit(maxsplit=1)[-1]
    ys.append(float(body))
print(ys)
print(data)

In [None]:
# Check which GPUs are visible and how much memory PyTorch holds on each
# (re-run this cell to refresh the numbers).
# PyTorch is queried directly: the rest of the notebook runs on torch, and
# listing devices through TensorFlow would start a second GPU runtime that
# competes with the model for device memory.
if torch.cuda.is_available():
    for device_index in range(torch.cuda.device_count()):
        device_props = torch.cuda.get_device_properties(device_index)
        allocated_gib = torch.cuda.memory_allocated(device_index) / 2**30
        total_gib = device_props.total_memory / 2**30
        print(f'cuda:{device_index} {device_props.name}: '
              f'{allocated_gib:.2f} GiB allocated by PyTorch of {total_gib:.2f} GiB total')
else:
    print('No CUDA device is visible to PyTorch')

# 1.2 Building embeddings

In [None]:
# Compute one fixed-size embedding per training sequence, ten sequences at a time.
sequence_embeddings = []
for batch_seqs in batch(data, 10):
    batch_labels, batch_strs, batch_tokens = batch_converter(batch_seqs)

    # Run the model on the GPU without gradient tracking and keep layer 34.
    batch_tokens_cuda = batch_tokens.to(device="cuda", non_blocking=True)
    with torch.no_grad():
        results = model(batch_tokens_cuda, repr_layers=[34])
    token_embeddings = results["representations"][34]

    # Average over residue positions only: token 0 is the beginning-of-sequence
    # token, so the residues of a sequence sit at positions 1..len(sequence);
    # anything after the last residue is left out of the mean.
    sequence_embeddings.extend(
        token_embeddings[row, 1:len(record[1]) + 1].mean(0)
        for row, record in enumerate(batch_seqs)
    )

In [None]:
# Sanity check: report the length of the first per-sequence embedding.
first_embedding = sequence_embeddings[0]
print(len(first_embedding))

# 1.3 Creating Training set & Test set

In [None]:
# Convert the embeddings to NumPy arrays and split them into training and test sets.
# detach() is used instead of the discouraged `.data` attribute to obtain a
# tensor without autograd history before the copy to host memory.
Xs = [t.detach().cpu().numpy() for t in sequence_embeddings]
train_size = 0.8  # fraction of the samples used for training
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    Xs,
    ys,
    train_size=train_size,
    random_state=42,  # fixed seed so the split is the same on every run
)

# 1.4 Beginning of the training block

In [None]:
# Hyper-parameter search spaces, one dictionary per regressor family.
# Every key is a constructor argument name; every value lists the candidates.

# k-nearest-neighbours search space.
knn_grid = dict(
    n_neighbors=[5, 10],
    weights=['uniform', 'distance'],
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    leaf_size=[15, 30],
    p=[1, 2],
)

# Support-vector search space.
svm_grid = dict(
    C=[0.1, 1.0, 10.0],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    degree=[3],
    gamma=['scale'],
)

# Random-forest search space.
rfr_grid = dict(
    n_estimators=[100],
    criterion=['squared_error', 'absolute_error'],
    max_features=['sqrt', 'log2'],
    min_samples_split=[2, 10],
    min_samples_leaf=[1, 4],
)

# Empty search space (no parameters listed).
lgr_grid = dict()

In [None]:
# Training block: grid-search every regressor family on the training split and
# keep both the fitted searches and their cross-validation result tables.
cls_list = [KNeighborsRegressor, SVR, RandomForestRegressor]
param_grid_list = [knn_grid, svm_grid, rfr_grid]
result_list = []  # one DataFrame built from cv_results_ per regressor
grid_list = []    # fitted GridSearchCV objects, in the order of cls_list
for estimator_cls, search_space in zip(cls_list, param_grid_list):
    print(estimator_cls)
    search_options = {
        'estimator': estimator_cls(),
        'param_grid': search_space,
        'scoring': 'r2',
        'verbose': 1,
        'n_jobs': -1,  # use all available cores
    }
    grid = GridSearchCV(**search_options)
    grid.fit(Xs_train, ys_train)
    cv_table = pd.DataFrame.from_dict(grid.cv_results_)
    result_list.append(cv_table)
    grid_list.append(grid)

# 1.5 Testing the trained model

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

# Evaluate every tuned model on the held-out test set. For each model the
# predictions are exported to CSV and an actual-vs-predicted scatter plot,
# annotated with Spearman's rho, is saved as a PNG.
# Requires grid_list, Xs_test and ys_test to be defined already.

separator = '-' * 80

for i, grid in enumerate(grid_list):
    print(grid.best_estimator_)
    print()

    # Predict on the test set and measure rank correlation with the truth.
    preds = grid.predict(Xs_test)
    rho, p_value = scipy.stats.spearmanr(ys_test, preds)

    # Write the actual/predicted pairs to a per-model CSV file.
    csv_filename = f'grid_element_{i}_kcat_km_data.csv'
    df = pd.DataFrame({'Actual Kcat/Km': ys_test, 'Predicted Kcat/Km': preds})
    df.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Scatter plot of actual against predicted values.
    plt.figure(figsize=(8, 6))
    plt.scatter(ys_test, preds, alpha=0.7)
    plt.xlabel('Actual Kcat/Km')
    plt.ylabel('Predicted Kcat/Km')
    plt.grid(True)

    # Put the correlation statistics in a white box inside the axes.
    annotation_text = f'Spearman\'s rho = {rho:.2f}\nP-value = {p_value:.2e}'
    annotation_box = {'boxstyle': "round,pad=0.3", 'edgecolor': 'black', 'facecolor': 'white'}
    plt.annotate(
        annotation_text,
        xy=(0.05, 0.85),
        xycoords='axes fraction',
        fontsize=12,
        bbox=annotation_box,
    )

    # Save the figure to disk and close it so figures do not pile up.
    img_filename = f'grid_element_{i}_kcat_km_plot.png'
    plt.savefig(img_filename)
    print(f'Plot saved to {img_filename}')
    plt.close()

    print('\n', separator, '\n')


In [None]:
# Evaluate every tuned model on the test set: print its Spearman correlation,
# export actual vs. predicted values to CSV and display an annotated scatter plot.
for i, grid in enumerate(grid_list):
    print(grid.best_estimator_)
    print()
    preds = grid.predict(Xs_test)

    # Compute the correlation once and reuse the same result for both the
    # printed summary and the plot annotation.
    spearman_result = scipy.stats.spearmanr(ys_test, preds)
    rho, p_value = spearman_result
    print(f'{spearman_result}')
    print('\n', '-' * 80, '\n')

    # Create a DataFrame and save it to a per-model CSV file.
    df = pd.DataFrame({'Actual Values': ys_test, 'Predicted Values': preds})
    csv_filename = f'grid_element_{i}_data.csv'
    df.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Scatter plot of actual against predicted values.
    plt.figure(figsize=(8, 6))
    plt.scatter(ys_test, preds, alpha=0.7)
    plt.title('Spearman Correlation between Actual and Predicted Values')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.grid(True)

    # Annotate the plot with Spearman's rho and its p-value.
    plt.annotate(
        f'Spearman\'s rho = {rho:.2f}\nP-value = {p_value:.2e}',
        xy=(0.05, 0.75),
        xycoords='axes fraction',
        fontsize=12,
        bbox=dict(boxstyle="round,pad=0.3", edgecolor='black', facecolor='white'),
    )

    plt.show()

In [None]:
import matplotlib.pyplot as plt
import scipy.stats


# Plot actual vs. predicted test values for the last model in grid_list.
# The predictions are computed here explicitly instead of relying on a `preds`
# variable left behind by whichever cell happened to run before this one.
preds = grid_list[-1].predict(Xs_test)

# Calculate Spearman's correlation between the true and predicted values.
rho, p_value = scipy.stats.spearmanr(ys_test, preds)

# Create scatter plot
plt.figure(figsize=(8, 6))
plt.scatter(ys_test, preds, alpha=0.7)
plt.title('Spearman Correlation between Actual and Predicted Values')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.grid(True)

# Annotate with Spearman's rho and its p-value in a box near the top-left corner.
plt.annotate(
    f'Spearman\'s rho = {rho:.2f}\nP-value = {p_value:.2e}',
    xy=(0.05, 0.95),
    xycoords='axes fraction',
    fontsize=12,
    bbox=dict(boxstyle="round,pad=0.3", edgecolor='black', facecolor='white'),
)

plt.show()


In [None]:
# Hand-picked (name, amino-acid sequence) pairs to score with the trained models.
topredict = [
    ('wt', 'MEPSSLELPADTVQRIAAELKCHPTDERVALHLDEEDKLRHFRECFYIPKIQDLPPVDLSLVNKDENAIYFLGNSLGLQPKMVKTYLEEELDKWAKIAAYGHEVGKRPWITGDESIVGLMKDIVGANEKEIALMNALTVNLHLLMLSFFKPTPKRYKILLEAKAFPSDHYAIESQLQLHGLNIEESMRMIKPREGEETLRIEDILEVIEKEGDSIAVILFSGVHFYTGQHFNIPAITKAGQAKGCYVGFDLAHAVGNVELYLHDWGVDFACWCSYKYLNAGAGGIAGAFIHEKHAHTIKPALVGWFGHELSTRFKMDNKLQLIPGVCGFRISNPPILLVCSLHASLEIFKQATMKALRKKSVLLTGYLEYLIKHNYGKDKAATKKPVVNIITPSHVEERGCQLTITFSVPNKDVFQELEKRGVVCDKRNPNGIRVAPVPLYNSFHDVYKFTNLLTSILDSAETKN'),
    ('best_patent', 'MEPSSLELPADTVQRIAAELKCHPTDERVALHLDEEDKLRHFRECFYIPKIQDLPPVDLSLVNKDEDAIYFNGNSLGLQPKMVKTYLEEELDKWAKIAINGWFEGDSPWIHYDESIVGLMKDIVGANEKEIVLMNTLTVNLHLLMLSFFKPTPKRYKILLEAKAFPSDHYAIESQLQLHGLNIEESMRIIKPREGEETLRIEDILEVIEKEGDSIAVILFSGIHYYTGQHFNIPAITKAGQAKGCYVGFDLAHAVGNVELYLHDWGVDFACWCGYKYLNSSPGGIAGAFIHEKHAHTIKPALVGWFGHELSTRFKMDNKLQLIPGVCGFRCSTPPILLVCILHASLEIFKQATMKALRKKSVLLTGYLEYLIKHNYGKDKAATKKPVVNIITPSHVEERGCQLTLTFNVPNKDVFQELEKRGVVCDKRNPNGIRVAPVPLYNSFHDVYKFTNLLTSILDSAETKN'),
    ('best_mut', 'MEPSSLELPADTVQRIAAELKCHPTDERVALHLDEEDKLRHFRECFYIPKIQDLPPVDLSLVNKDEDAIYFNGNSLGLQPKMVKTYREEELDKWAKIAINGWFEGDSPWIHYDESIVGLMKDIVGANEKEIVLWYTLTHMLHLLMLSFFKPTPKRYKILLYAKAFPSDHYAIESQLQLHGLNIEESMRIIKPREGEETLRIEDILEVIEKEGDSIAVITFSGIHYMTGQHFNIPAITKALQAKGCYVGFDQAHAVGNVELYLHDWGVDFACNCGYKYLNSSPGWIQGWFCHEKHAHTIKPALVGWFGHELSTRFKMDNKLQLIPGVCGFRCSTPNHWLVCILHAPLENFKQATMKALRKKSVLLTGYLEYLIKHNYGKDKAATKKPVVNIITPSHVEERGCQLTLTFNVPNKDVFQELEKRGVVCDKRNPNGIRVAPVPLYNSFHDVYKFTNLLTSILDSAETKN'),
    ('worst_mut', 'MEPSSLELPADTVQRIAAELKCHPTDERVALHLDEEDKLRHFRECFYIPKIQDLPPVDLSLVNKDEDAIYFNGNSLGLQPKMVKTYYEEELDKWAKIAINGWFEGDSPWIHYDESIVGLMKDIVGANEKEIVLYFTLTDQLHLLMLSFFKPTPKRYKILLNAKAFPSDHYAIESQLQLHGLNIEESMRIIKPREGEETLRIEDILEVIEKEGDSIAVIMFSGIHYETGQHFNIPAITKAMQAKGCYVGFDPAHAVGNVELYLHDWGVDFACVCGYKYLNSSPGIINGRFDHEKHAHTIKPALVGWFGHELSTRFKMDNKLQLIPGVCGFRCSTPKRKLVCILHAHLELFKQATMKALRKKSVLLTGYLEYLIKHNYGKDKAATKKPVVNIITPSHVEERGCQLTLTFNVPNKDVFQELEKRGVVCDKRNPNGIRVAPVPLYNSFHDVYKFTNLLTSILDSAETKN'),
    # Written as two adjacent string literals, which Python joins into a single
    # sequence; a raw line break inside one quoted string is a syntax error.
    ('var_93',
     'MEPSPLELPADTVQRIASELRCHPTDERVALRLDEEDELRHFREYFYIPKMQDLPP'
     'IDLSLVNKDENAIYFLGNSLGLQPKMVKTYLEEELDKWAKMGAYGHEVGKRPWITGDETIVGLMTDIVGANEKEIALMNGLTVNLHLLLLSFFKPTPKRYKILLEAKAFPSDHYAIESQLQLHGLNVEKSMRIIKPREGEETLRTEDILEVIEKEGDSIAVILFSGVHFYTGQLFNIPAITKAGQAKGCFVGFDLAHAVGNVELHLHDWGVDFACWCSYKYLNSGAGGLAGAFVHEKHAYTIKPALVGWFGHELSTRFKMDNKLQLIPGVNGFRISNPPILLVCSLHASLEIFKQATMKALRRKSILLTGYLEYLIKHYYSKDKAETKKPIVNIITPSRIEERGCQLTLTFSVPMKYVFQELEKRGVVCDKREPNGIRVAPVPLYNSFHDVYKFIELLTSVLDSAETK'),
]

# Embed each entry of `topredict` and print its name followed by one
# prediction per trained model, in the order of grid_list.
for batch_seqs in batch(topredict, 1):
    batch_labels, batch_strs, batch_tokens = batch_converter(batch_seqs)
    # Build embeddings on the GPU without tracking gradients.
    batch_tokens_cuda = batch_tokens.to(device="cuda", non_blocking=True)

    with torch.no_grad():
        results = model(batch_tokens_cuda, repr_layers=[34])
    token_embeddings = results["representations"][34]

    # Generate per-sequence embeddings via averaging.
    # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
    sequence_embeddings = [
        token_embeddings[i, 1:len(seq) + 1].mean(0)
        for i, (_, seq) in enumerate(batch_seqs)
    ]

    # detach() (rather than the discouraged `.data`) drops autograd history
    # before the tensors are copied to host memory as NumPy arrays.
    predict_seqs_embeddings = [t.detach().cpu().numpy() for t in sequence_embeddings]
    preds = [grid.predict(predict_seqs_embeddings) for grid in grid_list]

    # Print every model's prediction instead of indexing exactly three models,
    # so the cell keeps working when grid_list grows or shrinks.
    for i, (name, _) in enumerate(batch_seqs):
        print(name, *(model_preds[i] for model_preds in preds))

# PART 2: Building embeddings (completely new data) & feed it into the prediction model

In [None]:
METAGENOME_FASTA_PATH = "/path/to/fasta/to/predict"

# Read all input sequences before opening the output file, so a missing or
# malformed FASTA does not leave a freshly truncated, empty output behind.
topredict = [
    (header, sequence)
    for header, sequence in esm.data.read_fasta(METAGENOME_FASTA_PATH)
]

# One output line per sequence: the header followed by one prediction per
# trained model (in the order of grid_list), separated by single spaces.
with open('/output/file', 'w') as f:
    for batch_seqs in batch(topredict, 1):
        batch_labels, batch_strs, batch_tokens = batch_converter(batch_seqs)
        # Build embeddings on the GPU without tracking gradients.
        batch_tokens_cuda = batch_tokens.to(device="cuda", non_blocking=True)

        with torch.no_grad():
            results = model(batch_tokens_cuda, repr_layers=[34])
        token_embeddings = results["representations"][34]

        # Generate per-sequence embeddings via averaging.
        # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
        sequence_embeddings = [
            token_embeddings[i, 1:len(seq) + 1].mean(0)
            for i, (_, seq) in enumerate(batch_seqs)
        ]

        # detach() (rather than the discouraged `.data`) drops autograd history
        # before the tensors are copied to host memory as NumPy arrays.
        predict_seqs_embeddings = [t.detach().cpu().numpy() for t in sequence_embeddings]
        preds = [grid.predict(predict_seqs_embeddings) for grid in grid_list]

        # Build the line template from the number of models instead of
        # hard-coding three columns; with three models it equals "{} {} {} {}\n".
        line_format = " ".join(["{}"] * (1 + len(preds))) + "\n"
        for i, (header, _) in enumerate(batch_seqs):
            f.write(line_format.format(header, *(model_preds[i] for model_preds in preds)))

# PART 3: Emptying the CUDA cache

In [None]:

# Ask PyTorch to empty its CUDA cache once the work above is finished.
# NOTE(review): memory still referenced by live objects (e.g. `model`) is
# presumably not released by this call — confirm against the torch.cuda docs.
torch.cuda.empty_cache()