# Example script for Hackathon

Within each cycle of active learning, you can:

1. Collect training data (original training data + your query data).

2. Train a prediction model to predict the DMS_score for each mutant (e.g., M0A).

3. Use the trained model to predict the score for all mutant in the test set.

4. Select query mutants for next round based on certain criteria. You may want to make sure you don't query the same mutant twice as you only have a limited chances of making queries in total.

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, Dataset
import random
from copy import deepcopy
import pandas as pd
from scipy.stats import spearmanr
import argparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## 1. collect training data

Upload `sequence.fasta`, `train.csv`, and `test.csv` to the current runtime:

1. click the folder icon on the left

2. click the upload icon and upload the files to the current directory

In [4]:
with open('sequence.fasta', 'r') as f:
  data = f.readlines()

sequence_wt = data[1].strip()
sequence_wt

'MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLREKMRRRLESGDKWFSLEFFPPRTAEGAVNLISRFDRMAAGGPLYIDVTWHPAGDPGSDKETSSMMIASTAVNYCGLETILHMTCCRQRLEEITGHLHKAKQLGLKNIMALRGDPIGDQWEEEEGGFNYAVDLVKHIRSEFGDYFDICVAGYPKGHPEAGSFEADLKHLKEKVSAGADFIITQLFFEADTFFRFVKACTDMGITCPIVPGIFPIQGYHSLRQLVKLSKLEVPQEIKDVIEPIKDNDAAIRNYGIELAVSLCQELLASGLVPGLHFYTLNREMATTEVLKRLGMWTEDPRRPLPWALSAHPKRREEDVRPIFWASRPKSYIYRTQEWDEFPNGRWGNSSSPAFGELKDYYLFYLKSKSPKEELLKMWGEELTSEESVFEVFVLYLSGEPNRNGHKVTCLPWNDEPLAAETSLLKEELLRVNRQGILTINSQPNINGKPSSDPIVGWGPSGGYVFQKAYLEFFTSRETAEALLQVLKKYELRVNYHLVNVKGENITNAPELQPNAVTWGIFPGREIIQPTVVDPVSFMFWKDEAFALWIERWGKLYEEESPSRTIIQYIHDNYFLVNLVDNDFPLDNCLWQVVEDTLELLNRPTQNARETEAP'

In [5]:
len(sequence_wt)

656

In [6]:
def get_mutated_sequence(mut, sequence_wt):
  wt, pos, mt = mut[0], int(mut[1:-1]), mut[-1]

  sequence = deepcopy(sequence_wt)

  return sequence[:pos]+mt+sequence[pos+1:]

In [7]:
df_train = pd.read_csv('train.csv')
df_train['sequence'] = df_train.mutant.apply(lambda x: get_mutated_sequence(x, sequence_wt))
df_train

Unnamed: 0,mutant,DMS_score,sequence
0,M0Y,0.2730,YVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1,M0W,0.2857,WVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
2,M0V,0.2153,VVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
3,M0T,0.3122,TVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
4,M0S,0.2180,SVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
...,...,...,...
1135,P347D,0.3876,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1136,P347C,0.1837,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1137,P347A,0.4611,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1138,P347M,0.2412,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...


In [8]:
df_test = pd.read_csv('test.csv')
df_test['sequence'] = df_test.mutant.apply(lambda x: get_mutated_sequence(x, sequence_wt))
df_test

Unnamed: 0,mutant,sequence
0,V1D,MDNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1,V1Y,MYNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
2,V1C,MCNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
3,V1A,MANEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
4,V1E,MENEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
...,...,...
11319,P655S,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
11320,P655T,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
11321,P655V,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
11322,P655A,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...


In [9]:
# TODO: integrate the query data that you acquired each round into df_train
df_query_1 = pd.read_csv('query_1.csv')
df_query_1['sequence'] = df_query_1.mutant.apply(lambda x: get_mutated_sequence(x, sequence_wt))
df_query_1

Unnamed: 0,mutant,DMS_score,sequence
0,K355Q,0.913747,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1,R356P,0.574405,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
2,R357G,0.378047,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
3,F365E,0.468621,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
4,S372M,0.640720,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
...,...,...,...
95,L639F,0.803069,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
96,L642P,0.783097,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
97,Q647V,0.882989,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
98,R650Y,0.792312,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...


In [10]:
df_query_2 = pd.read_csv('query_2.csv')
df_query_2['sequence'] = df_query_2.mutant.apply(lambda x: get_mutated_sequence(x, sequence_wt))
df_query_2
#

Unnamed: 0,mutant,DMS_score,sequence
0,G6F,0.796447,MVNEARFNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1,S9E,0.697405,MVNEARGNSELNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
2,N11M,0.805347,MVNEARGNSSLMPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
3,G16R,0.738921,MVNEARGNSSLNPCLERSASSGSESSKDSSRCSTPGLDPERHERLR...
4,S19P,0.915120,MVNEARGNSSLNPCLEGSAPSGSESSKDSSRCSTPGLDPERHERLR...
...,...,...,...
95,T234Y,0.508769,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
96,F235K,0.127397,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
97,F235I,0.035689,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
98,R237H,0.937612,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...


In [11]:
df_query_3 = pd.read_csv('query_3.csv')
df_query_3['sequence'] = df_query_3.mutant.apply(lambda x: get_mutated_sequence(x, sequence_wt))
df_query_3
#

Unnamed: 0,mutant,DMS_score,sequence
0,L350E,0.737047,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1,S351D,0.738005,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
2,H353Q,0.836147,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
3,H353K,0.699721,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
4,R357P,0.681220,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
...,...,...,...
95,T638R,0.412769,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
96,E640A,0.966497,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
97,L641C,0.823889,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
98,T646F,0.934512,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...


In [12]:
df_train = pd.concat([df_train, df_query_1, df_query_2, df_query_3])
df_train

Unnamed: 0,mutant,DMS_score,sequence
0,M0Y,0.273000,YVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
1,M0W,0.285700,WVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
2,M0V,0.215300,VVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
3,M0T,0.312200,TVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
4,M0S,0.218000,SVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
...,...,...,...
95,T638R,0.412769,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
96,E640A,0.966497,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
97,L641C,0.823889,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...
98,T646F,0.934512,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...


## 2. Train a prediction model

Here, we provided a linear regression model and used one-hot encoding to encode each variant. You would need to build your own model to achieve better performances.

Hint: you can perform cross-validation on the training set to evaluate your predictor before making predictions on the test set.

In [13]:
'''hyperparameters'''

seq_length = 656
seed = 100 # seed for splitting the validation set
val_ratio = 0.3 # proportion of validation set

In [14]:
!pip install -q transformers
!pip install -q accelerate
!pip install -q torch torchvision torchaudio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [15]:
class ProteinDataset(Dataset):
    def __init__(self, df, istrain=True):

        alphabet = 'ACDEFGHIKLMNPQRSTVWY'
        map_a2i = {j:i for i,j in enumerate(alphabet)}
        map_i2a = {i:j for i,j in enumerate(alphabet)}

        self.df = df

        self.num_samples = len(self.df)
        self.seq_length = len(self.df.sequence.values[0])
        self.num_channels = 20

        # TODO: replace one-hot encodings with your own encodings
        self.encodings = np.zeros((self.num_samples, self.num_channels, self.seq_length)).astype(np.float32)
        self.targets = np.zeros(self.num_samples).astype(np.float32)

        if istrain:
          for it, (seq,target) in enumerate(self.df[['sequence', 'DMS_score']].values):
              for i,aa in enumerate(seq):
                  self.encodings[it,map_a2i[aa],i] = 1
              self.targets[it] = target

          self.encodings = self.encodings.astype(np.float32)
          self.targets = self.targets.astype(np.float32)
        else:
          for it, seq in enumerate(self.df['sequence'].values):
              for i,aa in enumerate(seq):
                  self.encodings[it,map_a2i[aa],i] = 1

          self.encodings = self.encodings.astype(np.float32)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return torch.tensor(self.encodings[idx]), torch.tensor(self.targets[idx])

In [18]:
train_dataset = ProteinDataset(df_train)
test_dataset = ProteinDataset(df_test, istrain=False)

# split validation set
train_dataset, val_dataset = train_test_split(train_dataset, test_size=val_ratio, random_state=seed, shuffle=True)

# TODO: revise according to your own model
train_loader = DataLoader(train_dataset, batch_size=len(train_dataset), shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)

In [19]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import spearmanr
import torch
import numpy as np

# Define custom Spearman correlation scorer for RandomizedSearchCV
def spearman_scorer(y_true, y_pred):
    return spearmanr(y_true, y_pred)[0]  # Return only the correlation value

# Prepare data for RandomizedSearchCV
X_train, y_train = next(iter(train_loader))
X_train = X_train.view(X_train.size(0), -1).detach().cpu().numpy()
y_train = y_train.detach().cpu().numpy()

X_val, y_val = next(iter(val_loader))
X_val = X_val.view(X_val.size(0), -1).detach().cpu().numpy()
y_val = y_val.detach().cpu().numpy()

# Define parameter distribution for RandomizedSearchCV
param_dist = {
    'learning_rate': [0.005, 0.01, 0.02, 0.03, 0.05],
    'max_depth': [5, 10, 15, 20],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 0.6],
    'n_estimators': [500, 1000, 1500],
    'gamma': [0.07, 0.1, 0.2, 0.3, 0.4],
    'reg_lambda': [0.1, 0.5, 1.0, 2.0]
}

# Set up the XGBRegressor
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    random_state=100,
    eval_metric="rmse"  # Default evaluation metric for RandomizedSearchCV
)

# Set up RandomizedSearchCV with Spearman correlation scorer
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    scoring=make_scorer(spearman_scorer),  # Use Spearman as evaluation metric
    n_iter=30,  # Number of random combinations to try
    cv=3,  # 3-fold cross-validation
    verbose=1,
    n_jobs=-1,  # Use all available cores
    random_state=100
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best hyperparameters found by RandomizedSearchCV:")
print(random_search.best_params_)

# Get the best model from RandomizedSearchCV
best_xgb_model = random_search.best_estimator_

# Train the best model with the full training data
best_xgb_model.fit(X_train, y_train)

# Predict on the validation set to check Spearman correlation
y_val_pred = best_xgb_model.predict(X_val)
spearman_corr_val = spearmanr(y_val, y_val_pred)[0]
print("Spearman correlation on validation set: ", spearman_corr_val)

# Extract test data
X_test, _ = next(iter(test_loader))
X_test = X_test.view(X_test.size(0), -1).detach().cpu().numpy()

# Predict on the test set
y_test_pred = best_xgb_model.predict(X_test)


Fitting 3 folds for each of 30 candidates, totalling 90 fits




Best hyperparameters found by RandomizedSearchCV:
{'subsample': 0.7, 'reg_lambda': 1.0, 'n_estimators': 1000, 'max_depth': 10, 'learning_rate': 0.05, 'gamma': 0.07, 'colsample_bytree': 0.8}
Spearman correlation on validation set:  0.6293070957451189


In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import spearmanr
import torch
import numpy as np

# Define custom Spearman correlation scorer for RandomizedSearchCV
def spearman_scorer(y_true, y_pred):
    return spearmanr(y_true, y_pred)[0]  # Return only the correlation value
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
# Prepare data for RandomizedSearchCV
X_train, y_train = next(iter(train_loader))
X_train = X_train.view(X_train.size(0), -1).detach().cpu().numpy()
y_train = y_train.detach().cpu().numpy()

X_val, y_val = next(iter(val_loader))
X_val = X_val.view(X_val.size(0), -1).detach().cpu().numpy()
y_val = y_val.detach().cpu().numpy()

# Define parameter distribution for RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 6, 10, 15, 20, 25, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Set up the RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)

# Set up RandomizedSearchCV with Spearman correlation scorer
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    scoring=make_scorer(spearman_scorer),  # Use Spearman as evaluation metric
    n_iter=20,  # Number of random combinations to try
    cv=5,  # 3-fold cross-validation
    verbose=1,
    n_jobs=-1,  # Use all available cores
    random_state=100
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best hyperparameters found by RandomizedSearchCV:")
print(random_search.best_params_)

# Get the best model from RandomizedSearchCV
best_rf_model = random_search.best_estimator_

# Train the best model with the full training data
best_rf_model.fit(X_train, y_train)

# Predict on the validation set to check Spearman correlation
y_val_pred = best_rf_model.predict(X_val)
spearman_corr_val = spearmanr(y_val, y_val_pred)[0]
print("Spearman correlation on validation set: ", spearman_corr_val)

# Extract test data
X_test, _ = next(iter(test_loader))
X_test = X_test.view(X_test.size(0), -1).detach().cpu().numpy()

# Predict on the test set
y_test_pred = best_rf_model.predict(X_test)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


35 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
19 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skle

Best hyperparameters found by RandomizedSearchCV:
{'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 25, 'bootstrap': True}
Spearman correlation on validation set:  0.5649603991231251


In [20]:
df_test['DMS_score_predicted'] = y_test_pred
df_test

Unnamed: 0,mutant,sequence,DMS_score_predicted
0,V1D,MDNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.57885
1,V1Y,MYNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.57885
2,V1C,MCNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.57885
3,V1A,MANEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.57885
4,V1E,MENEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.57885
...,...,...,...
11319,P655S,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.57885
11320,P655T,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.57885
11321,P655V,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.57885
11322,P655A,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.57885


In [21]:
df_top10 = df_test.nlargest(10, 'DMS_score_predicted')
df_top10


Unnamed: 0,mutant,sequence,DMS_score_predicted
7790,L470E,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
7791,L470A,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
7792,L470C,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
7793,L470Y,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
7794,L470W,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
7795,L470D,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
7796,L470T,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
7797,L470S,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
7798,L470R,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
7799,L470Q,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931


In [22]:
df_test[['mutant', 'DMS_score_predicted']].to_csv('predictions.csv')

In [23]:
df_top10[['mutant', 'DMS_score_predicted']].to_csv('top10_predictions.csv')

## 3. Select query for next round

In [24]:
df_test.sort_values('DMS_score_predicted', ascending=False).head(100)

Unnamed: 0,mutant,sequence,DMS_score_predicted
7808,L470I,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
7800,L470V,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
7799,L470Q,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
7807,L470M,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
7793,L470Y,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.797931
...,...,...,...
6467,K400W,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.766237
6466,K400P,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.766237
6477,K400A,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.766237
6461,K400V,MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,0.766237


In [None]:
# Example: randomly select 100 test variants to be queried.
# Note: random selection may not be a good strategy
# TODO: select query mutants for the next round based on your own criteria

querys = np.random.choice(df_test.mutant.values, size=100, replace=False)
querys


array(['E636A', 'E184P', 'E545W', 'I364V', 'F625I', 'R81S', 'S25N',
       'N614K', 'L10P', 'D91G', 'N486P', 'K596A', 'Y531I', 'C630A',
       'F206M', 'S24N', 'P564S', 'S409M', 'R30K', 'Q476P', 'P394G',
       'W380K', 'L553I', 'Y505R', 'D401T', 'M337D', 'K413W', 'E284F',
       'S303F', 'N643R', 'R67K', 'I496V', 'E522T', 'Y188H', 'K408E',
       'A586I', 'D360P', 'E601A', 'D288K', 'R67C', 'D244Y', 'E340I',
       'G498A', 'L407Y', 'L424F', 'D624V', 'Y437L', 'E600T', 'K280P',
       'F396V', 'T480V', 'E325T', 'G544W', 'T559N', 'D575K', 'L313C',
       'A18F', 'K47M', 'A654R', 'S113L', 'L300D', 'E545M', 'I496T',
       'T68F', 'E122V', 'M127Y', 'T321A', 'G35N', 'T128Y', 'F230K',
       'A588W', 'V239Y', 'I111L', 'I478F', 'H41V', 'G565S', 'S517Q',
       'E567F', 'P370F', 'T450Y', 'P370I', 'I611C', 'S517T', 'S102Y',
       'F229E', 'L52S', 'E428S', 'E552T', 'S32G', 'D80C', 'E53N', 'E231C',
       'I591T', 'D233A', 'L438S', 'Y373S', 'H140S', 'L213P', 'E636I',
       'G86C'], dtype=object

In [None]:
df_test['residue'] = df_test['mutant'].str.extract(r'(\d+)').astype(int)
filtered_df = df_test[(df_test['residue'] >= 350) & (df_test['residue'] <= 656)]
querys = np.random.choice(filtered_df.mutant.values, size=100, replace=False)
querys

array(['L416M', 'G544W', 'A586R', 'S602F', 'Q647V', 'E428Q', 'V540Y',
       'N629W', 'R650Y', 'R605T', 'V574Q', 'P571Q', 'K530E', 'L407P',
       'S502P', 'I478N', 'L416Y', 'F383P', 'T450P', 'N623W', 'K596I',
       'G500W', 'P412M', 'E600F', 'F405N', 'I487L', 'F431P', 'K543Y',
       'K448A', 'N385R', 'R356P', 'K448I', 'K400I', 'D584R', 'P655G',
       'K418C', 'P458T', 'T480F', 'V618H', 'T463N', 'P458Q', 'E414L',
       'G489N', 'G440I', 'S393I', 'G503Y', 'E415T', 'K410C', 'A523I',
       'P501C', 'K400A', 'I591P', 'D575A', 'W590D', 'D584M', 'N390T',
       'E513H', 'L512R', 'L639F', 'A588N', 'L553E', 'L470V', 'Q633L',
       'R357G', 'T450C', 'W388P', 'E585Q', 'L512Y', 'E552F', 'E422H',
       'V618N', 'S372M', 'I611M', 'K355Q', 'Y373G', 'G500E', 'E441L',
       'F434S', 'Y437T', 'W560K', 'Y505K', 'D613V', 'K530C', 'L642P',
       'E423T', 'K543G', 'G386Y', 'I608L', 'D401K', 'S464V', 'Y610I',
       'C451S', 'W380N', 'Y406F', 'A510P', 'A460P', 'G386S', 'F365E',
       'A588R', 'P48

In [None]:
with open('query.txt', 'w') as f:
  for mutant in querys:
    f.write(mutant+'\n')