## Protein Embeddings Manipulation

In [None]:
from pathlib import Path

# Relative path to the Protera data directory; stability_train.h5 is read
# from it when the training set is loaded.
data_path = Path("../data/Protera") 

# IPython shell escape (not plain Python): list the contents of data_path.
!ls $data_path

In [None]:
from pathlib import Path
import matplotlib.pyplot as plt

from protera_stability.data import ProteinStabilityDataset

### Get Dataset

In [None]:
# Build the training dataset from the stability_train.h5 file under data_path.
train_file = data_path / "stability_train.h5"
train_set = ProteinStabilityDataset(proteins_path=train_file)

# Size of the training set (shown as the cell output in a notebook).
len(train_set)

In [None]:
# Mean, variance and shape of train_set.X (the model inputs used further down).
train_set.X.mean(), train_set.X.var(), train_set.X.shape

In [None]:
# Mean, variance and shape of train_set.y (the regression targets used further down).
train_set.y.mean(), train_set.y.var(), train_set.y.shape

In [None]:
# Histogram of the target values, split into 50 bins.
plt.hist(train_set.y, bins=50)
plt.show()

## Dimensionality Reduction

In [None]:
from protera_stability.utils.decomposition import dim_reduction

# Run the project's dim_reduction helper on train_set.X, requesting 32 components;
# the result X is what gets split into train/validation sets later.
# NOTE(review): train_set.y, plot_viz=True and prefix="stability" presumably drive
# the helper's visualizations and their naming -- confirm in dim_reduction itself.
X = dim_reduction(train_set.X, train_set.y, n_components=32, plot_viz=True, prefix="stability")

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from protera_stability.utils.decomposition import dim_reduction
import sklearn

# Hold out a validation split; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, train_set.y, random_state=123)

# Metric name / metric function. Neither is referenced in this cell; they are
# kept as notebook globals in case other cells rely on them.
scoring = "r2"
score = r2_score

# scikit-learn 1.0 renamed the RandomForestRegressor criteria "mse"/"mae" to
# "squared_error"/"absolute_error", and 1.2 removed the old spellings (every
# random-forest fit in the search then fails with an invalid-parameter error).
# Pick the spelling that the installed release accepts.
if int(sklearn.__version__.split(".")[0]) >= 1:
    rf_criteria = ["squared_error", "absolute_error"]
else:
    rf_criteria = ["mse", "mae"]

# Candidate regressors, each paired with the hyper-parameter grid to search.
models = {
    "RandomForest": {
        "model": RandomForestRegressor(),
        "params": {
            "n_estimators": [30, 50, 100],
            "max_depth": [None, 100],
            "criterion": rf_criteria,
        },
    },
    "SVR": {
        "model": SVR(),
        "params": {
            "C": [0.1, 1.0, 10.0],
            "kernel": ["linear", "poly", "rbf", "sigmoid"],
            # "degree" only affects the "poly" kernel.
            "degree": [3],
            "gamma": ["scale"],
        },
    },
    "MLP": {
        "model": MLPRegressor(),
        "params": {
            "hidden_layer_sizes": [(100,), (100, 100), (1024, 1024)],
            "activation": ["tanh", "relu"],
            "solver": ["sgd", "adam"],
        },
    },
}

In [None]:
from protera_stability.utils import perform_search

# Run the project's search helper once per entry in `models`, passing the
# validation split as the test data and ../models as the save directory.
for name, model in models.items():
    estimator = model["model"]
    param_grid = model["params"]
    perform_search(
        X_train, y_train, estimator, param_grid, name,
        X_test=X_valid, y_test=y_valid, save_dir="../models", n_jobs=8,
    )

In [None]:
# SVR with explicitly chosen hyper-parameters.
# NOTE(review): presumably the best setting found by the search -- confirm.
model = SVR(C=10.0, degree=3, gamma="scale", kernel="rbf")

# Fit/score left disabled: X_test and test_set are not defined in the cells shown here.
# model.fit(X_train, y_train)
# model.score(X_test, test_set.y)

In [None]:
# MLP regressor with explicitly chosen hyper-parameters.
# NOTE(review): presumably the best setting found by the search -- confirm.
model = MLPRegressor(activation="relu", hidden_layer_sizes=(1024, 1024), solver="adam")

# Fit/score left disabled: X_test and test_set are not defined in the cells shown here.
# model.fit(X_train, y_train)
# model.score(X_test, test_set.y)