In [4]:
from pathlib import Path
data_path = Path("../data") / "Protera"
per_protein_csv = [pth for pth in (data_path / "prism").iterdir()]

print(per_protein_csv[:10])

[PosixPath('../data/Protera/prism/prism_merged_037_UBI4_E1_binding_limiting_E1.csv'), PosixPath('../data/Protera/prism/prism_merged_999_IF-1_DMS.csv'), PosixPath('../data/Protera/prism/prism_merged_021_PAB1_doxycyclin_sensitivity.csv'), PosixPath('../data/Protera/prism/prism_merged_006_CBS_high_B6_activity.csv'), PosixPath('../data/Protera/prism/prism_merged_003_PTEN_abundance.csv'), PosixPath('../data/Protera/prism/prism_merged_999_GmR_DMS.csv'), PosixPath('../data/Protera/prism/prism_merged_027_Src_kinase_activity_catalytic_domain_reversed.csv'), PosixPath('../data/Protera/prism/prism_merged_999_ccdB_DMS.csv'), PosixPath('../data/Protera/prism/prism_merged_026_BRCA1_E3_ubiquitination_activity.csv'), PosixPath('../data/Protera/prism/prism_merged_999_HAh1n1_DMS.csv')]


In [5]:
import pandas as pd

max_ = 0
for path in per_protein_csv:
    df = pd.read_csv(path)
    df = df[df["variant"].str.len() < 1000]
    size = len(df)
    if size > max_:
        max_ = size
        path_ = path

path_, max_

(PosixPath('../data/Protera/prism/prism_merged_030_HMGCR_yeast_complementation_control_medium.csv'),
 16872)

In [6]:
from protera_stability.data import ProteinStabilityDataset, EmbeddingGetter
from protera_stability.proteins import EmbeddingExtractor1D

# from torch.multiprocessing import set_start_method
# try:
#      set_start_method('spawn')
# except RuntimeError:
#     pass


df = pd.read_csv(path_)
df = df.drop_duplicates().dropna()
df = df[["variant", "Rosetta_ddg_score_02"]]
df.columns = ["sequences", "labels"]
df = df[df["sequences"].str.len() < 1000]
df = df[df.columns[::-1]]

args_dict = {
    "model_name": "esm1b_t33_650M_UR50S",
    "base_path": data_path,
    "gpu": True,
}
emb_extractor = EmbeddingExtractor1D(**args_dict)
dset = emb_extractor.generate_datasets(
    [""],
    data=df,
    h5_stem=f"stability_{path_.stem}",  # data_path / "stability_train".h5
    bs=1,
    target_name="stability_scores"
)

dataset = ProteinStabilityDataset(data_path / f"stability_{path_.stem}.h5",  ret_dict=False)
len(dataset)

Using cache found in /home/roberto/.cache/torch/hub/facebookresearch_esm_master
Returning existing dataset...


7771

## Sklearn approach

In [9]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from protera_stability.utils.decomposition import dim_reduction

In [10]:
X = dim_reduction(dataset.X, dataset.y, n_components=16, plot_viz=False)
y = dataset.y

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=43)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.33, random_state=43)

In [11]:
scoring = "r2"
score = r2_score
models = {
    "RandomForest": {
        "model": RandomForestRegressor(),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 100],
            "criterion": ["mse", "mae"]
        },
    },
    "SVR": {
        "model": SVR(),
        "params": {
            "C": [0.1, 1.0, 10.0],
            "kernel": ["linear", "poly", "rbf", "sigmoid"],
            "degree": [3],
            "gamma": ["scale"],
        },
    },
    "MLP": {
        "model": MLPRegressor(),
        "params": {
            "hidden_layer_sizes": [(100,), (100, 100), (1024, 1024)],
            "activation": ["tanh", "relu"],
            "solver": ["sgd", "adam"],
        },
    },
}

In [12]:
from protera_stability.utils import perform_search

for name, model in models.items():
    perform_search(
        X_train,
        y_train,
        model["model"],
        model["params"],
        name,
        X_test=X_valid,
        y_test=y_valid,
        save_dir="../models",
        n_jobs=8,
    )

Fitting model RandomForest...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
RandomForest best R2: 0.509007163999039
Best params: {'criterion': 'mse', 'max_depth': 100, 'n_estimators': 200}
Test R2: 0.4943190868741618
Fitting model SVR...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
SVR best R2: 0.3499447777543461
Best params: {'C': 10.0, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
Test R2: 0.3754357331424272
Fitting model MLP...
Fitting 5 folds for each of 12 candidates, totalling 60 fits




MLP best R2: 0.4611450406497564
Best params: {'activation': 'tanh', 'hidden_layer_sizes': (100,), 'solver': 'adam'}
Test R2: 0.4193734869332113


## NN-based approach

In [7]:
import torch
from protera_stability.config.lazy import LazyCall as L
from protera_stability.config.common.mlp import mlp_esm
from protera_stability.train import get_cfg, setup_diversity, setup_data

exp_params = {
    "diversity_cutoff": 0.,
    "random_percent": 1.,
    "sampling_method": "",
    "experiment_name": "per_prot",
}

def create_cfg(exp_params):
    cfg = get_cfg(args={})
    cfg = setup_diversity(cfg, **exp_params)
    mlp_esm.n_units = 1024
    mlp_esm.n_layers = 1
    mlp_esm.act = L(torch.nn.GELU)()
    cfg.model = mlp_esm

    cfg = setup_data(cfg, dataset=dataset)
    return cfg

In [8]:
cfg = create_cfg(exp_params)
cfg.trainer_params.gpus = 1
cfg.keys()

dict_keys(['trainer_params', 'output_dir', 'random_split', 'experiment', 'model', 'dataloader'])

In [9]:
from protera_stability.engine.default import DefaultTrainer

cfg.dataloader.train.dataset = dataset
# cfg.dataloader.test.dataset = dataset

trainer = DefaultTrainer(cfg)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [10]:
trainer.fit()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name     | Type       | Params
----------------------------------------
0 | model    | ProteinMLP | 1.3 K 
1 | train_r2 | R2Score    | 0     
2 | valid_r2 | R2Score    | 0     
3 | test_r2  | R2Score    | 0     
----------------------------------------
1.3 K     Trainable params
0         Non-trainable params
1.3 K     Total params
0.005     Total estimated model params size (MB)


Epoch 81: 100%|██████████| 25/25 [00:00<00:00, 41.41it/s, loss=0.578, v_num=27, train/r2=0.438, train/loss=0.573, valid/r2=0.413, valid/loss=0.493]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


<protera_stability.engine.default.DefaultTrainer at 0x7fd9259b69d0>

In [11]:
#TODO: prediction on a test set
trainer.model()

ProteinMLP(
  (layers): ModuleList(
    (0): Linear(in_features=1280, out_features=1, bias=True)
  )
  (drop): Dropout(p=0.7, inplace=False)
  (act): GELU()
)