In [10]:
from pathlib import Path
data_path = Path("../data") / "Protera"
per_protein_csv = [pth for pth in (data_path / "prism").iterdir()]

print(per_protein_csv[:10])

[PosixPath('../data/Protera/prism/prism_merged_037_UBI4_E1_binding_limiting_E1.csv'), PosixPath('../data/Protera/prism/prism_merged_999_IF-1_DMS.csv'), PosixPath('../data/Protera/prism/prism_merged_021_PAB1_doxycyclin_sensitivity.csv'), PosixPath('../data/Protera/prism/prism_merged_006_CBS_high_B6_activity.csv'), PosixPath('../data/Protera/prism/prism_merged_003_PTEN_abundance.csv'), PosixPath('../data/Protera/prism/prism_merged_999_GmR_DMS.csv'), PosixPath('../data/Protera/prism/prism_merged_027_Src_kinase_activity_catalytic_domain_reversed.csv'), PosixPath('../data/Protera/prism/prism_merged_999_ccdB_DMS.csv'), PosixPath('../data/Protera/prism/prism_merged_026_BRCA1_E3_ubiquitination_activity.csv'), PosixPath('../data/Protera/prism/prism_merged_999_HAh1n1_DMS.csv')]


## Choose a Protein

In [11]:
import pandas as pd

max_ = 0
for path in per_protein_csv:
    df = pd.read_csv(path)
    df = df[df["variant"].str.len() < 1000]
    size = len(df)
    if size > max_:
        max_ = size
        path_ = path

path_, max_

(PosixPath('../data/Protera/prism/prism_merged_030_HMGCR_yeast_complementation_control_medium.csv'),
 16872)

## Preprocessing Input

In [12]:
from protera_stability.data import ProteinStabilityDataset, EmbeddingGetter
from protera_stability.proteins import EmbeddingExtractor1D

# from torch.multiprocessing import set_start_method
# try:
#      set_start_method('spawn')
# except RuntimeError:
#     pass


df = pd.read_csv(path_)
df = df.drop_duplicates().dropna()
df.head()

Unnamed: 0,variant,score_00,se_00,gemme_score_01,Rosetta_ddg_score_02,ss_03,rsa_03
8322,PREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETL...,0.7673,0.0258,0.861834,1.154483,E,0.117647
8323,PREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETL...,0.7867,0.0074,0.866576,2.237931,E,0.117647
8324,PREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETL...,0.3132,0.136,0.860471,1.295517,E,0.117647
8325,PREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETL...,0.6847,0.0076,0.878845,1.792069,E,0.117647
8326,PREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETL...,0.8478,0.0029,0.607551,2.431034,E,0.117647


In [13]:
data = df[["variant", "Rosetta_ddg_score_02"]]
data.columns = ["sequences", "labels"]
data = data[data["sequences"].str.len() < 1000]
data = data[data.columns[::-1]]
data.head()

Unnamed: 0,labels,sequences
8322,1.154483,PREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETL...
8323,2.237931,PREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETL...
8324,1.295517,PREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETL...
8325,1.792069,PREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETL...
8326,2.431034,PREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETL...


## Extract Embeddings

In [None]:
args_dict = {
    "model_name": "esm1b_t33_650M_UR50S",
    "base_path": data_path,
    "gpu": True,
}
emb_extractor = EmbeddingExtractor1D(**args_dict)
dset = emb_extractor.generate_datasets(
    [""],
    data=data,
    h5_stem=f"stability_{path_.stem}",  # data_path / "stability_train".h5
    bs=1,
    target_name="stability_scores"
)

dataset = ProteinStabilityDataset(data_path / f"stability_{path_.stem}.h5",  ret_dict=False)
len(dataset)

## Sklearn approach

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from protera_stability.utils.decomposition import dim_reduction

In [None]:
X = dim_reduction(dataset.X, dataset.y, n_components=16, plot_viz=False)
y = dataset.y

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=43)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.33, random_state=43)

In [None]:
scoring = "r2"
score = r2_score
models = {
    "RandomForest": {
        "model": RandomForestRegressor(),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 100],
            "criterion": ["mse", "mae"]
        },
    },
    "SVR": {
        "model": SVR(),
        "params": {
            "C": [0.1, 1.0, 10.0],
            "kernel": ["linear", "poly", "rbf", "sigmoid"],
            "degree": [3],
            "gamma": ["scale"],
        },
    },
    "MLP": {
        "model": MLPRegressor(),
        "params": {
            "hidden_layer_sizes": [(100,), (100, 100), (1024, 1024)],
            "activation": ["tanh", "relu"],
            "solver": ["sgd", "adam"],
        },
    },
}

In [None]:
from protera_stability.utils import perform_search

for name, model in models.items():
    perform_search(
        X_train,
        y_train,
        model["model"],
        model["params"],
        name,
        X_test=X_valid,
        y_test=y_valid,
        save_dir="../models",
        n_jobs=8,
    )

## NN-based approach

In [None]:
import torch
from protera_stability.config.lazy import LazyCall as L
from protera_stability.config.common.mlp import mlp_esm
from protera_stability.train import get_cfg, setup_diversity, setup_data

exp_params = {
    "diversity_cutoff": 0.,
    "random_percent": 1.,
    "sampling_method": "",
    "experiment_name": "per_prot",
}

def create_cfg(exp_params):
    cfg = get_cfg(args={})
    cfg = setup_diversity(cfg, **exp_params)
    mlp_esm.n_units = 1024
    mlp_esm.n_layers = 1
    mlp_esm.act = L(torch.nn.GELU)()
    cfg.model = mlp_esm

    cfg = setup_data(cfg, dataset=dataset)
    return cfg

In [None]:
cfg = create_cfg(exp_params)
cfg.trainer_params.gpus = 1
cfg.keys()

In [None]:
from protera_stability.engine.default import DefaultTrainer

cfg.dataloader.train.dataset = dataset
# cfg.dataloader.test.dataset = dataset

trainer = DefaultTrainer(cfg)

In [None]:
trainer.fit()

In [None]:
#TODO: prediction on a test set
trainer.model()